Merge pull request #1088 from aroulin/x64-emitter-abi-call

x64: Proper stack alignment in shader JIT function calls
This commit is contained in:
bunnei 2015-09-02 08:46:58 -04:00
commit 918ca40c68
7 changed files with 301 additions and 455 deletions

View file

@ -24,6 +24,7 @@ set(SRCS
set(HEADERS set(HEADERS
assert.h assert.h
bit_field.h bit_field.h
bit_set.h
break_points.h break_points.h
chunk_file.h chunk_file.h
code_block.h code_block.h

189
src/common/bit_set.h Normal file
View file

@ -0,0 +1,189 @@
// This file is under the public domain.
#pragma once
#include <cstddef>
#ifdef _WIN32
#include <intrin.h>
#endif
#include <initializer_list>
#include <type_traits>
#include "common/common_types.h"
// namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
namespace Common {
// Helper functions:
#ifdef _WIN32
// Portable population count for any unsigned integer type T.
// Parallel bit-summing technique from
// https://graphics.stanford.edu/~seander/bithacks.html
// GCC has a builtin for this, but MSVC's intrinsic only ever emits the real
// POPCNT instruction, which we are not depending on.
template <typename T>
static inline int CountSetBits(T v)
{
    const T all_ones   = (T)~(T)0;
    const T pair_mask  = all_ones / 3;        // 0x55...: low bit of every 2-bit group
    const T quad_mask  = all_ones / 15 * 3;   // 0x33...: low two bits of every nibble
    const T nibble_low = all_ones / 255 * 15; // 0x0f...: low nibble of every byte
    const T byte_ones  = all_ones / 255;      // 0x01...: lowest bit of every byte

    // Each 2-bit group now holds the number of bits that were set in it.
    v = v - ((v >> 1) & pair_mask);
    // Each nibble now holds the sum of its two 2-bit groups.
    v = (v & quad_mask) + ((v >> 2) & quad_mask);
    // Each byte now holds the sum of its two nibbles.
    v = (v + (v >> 4)) & nibble_low;
    // The multiplication accumulates all byte sums into the top byte, which is
    // then shifted down into the result.
    return (T)(v * byte_ones) >> (sizeof(T) - 1) * 8;
}
// Returns the index of the lowest set bit of an 8-bit value, as reported by
// the _BitScanForward intrinsic.
// NOTE(review): the result is presumably only meaningful for val != 0 - confirm
// against the intrinsic's documentation.
static inline int LeastSignificantSetBit(u8 val)
{
    unsigned long lowest;
    _BitScanForward(&lowest, val);
    return static_cast<int>(lowest);
}
// Returns the index of the lowest set bit of a 16-bit value, as reported by
// the _BitScanForward intrinsic.
// NOTE(review): the result is presumably only meaningful for val != 0 - confirm
// against the intrinsic's documentation.
static inline int LeastSignificantSetBit(u16 val)
{
    unsigned long lowest;
    _BitScanForward(&lowest, val);
    return static_cast<int>(lowest);
}
// Returns the index of the lowest set bit of a 32-bit value, as reported by
// the _BitScanForward intrinsic.
// NOTE(review): the result is presumably only meaningful for val != 0 - confirm
// against the intrinsic's documentation.
static inline int LeastSignificantSetBit(u32 val)
{
    unsigned long lowest;
    _BitScanForward(&lowest, val);
    return static_cast<int>(lowest);
}
// Returns the index of the lowest set bit of a 64-bit value, as reported by
// the 64-bit variant of the scan intrinsic (_BitScanForward64).
// NOTE(review): the result is presumably only meaningful for val != 0 - confirm
// against the intrinsic's documentation.
static inline int LeastSignificantSetBit(u64 val)
{
    unsigned long lowest;
    _BitScanForward64(&lowest, val);
    return static_cast<int>(lowest);
}
#else
// Number of set bits in an 8-bit value, computed by the compiler's popcount builtin.
static inline int CountSetBits(u8 val)
{
    return __builtin_popcount(static_cast<unsigned int>(val));
}
// Number of set bits in a 16-bit value, computed by the compiler's popcount builtin.
static inline int CountSetBits(u16 val)
{
    return __builtin_popcount(static_cast<unsigned int>(val));
}
// Number of set bits in a 32-bit value, computed by the compiler's popcount builtin.
static inline int CountSetBits(u32 val)
{
    return __builtin_popcount(static_cast<unsigned int>(val));
}
// Number of set bits in a 64-bit value, computed by the "long long" popcount builtin.
static inline int CountSetBits(u64 val)
{
    return __builtin_popcountll(static_cast<unsigned long long>(val));
}
// Index of the lowest set bit of an 8-bit value, via the count-trailing-zeros builtin.
// NOTE(review): the builtin's result is presumably undefined for val == 0 - confirm.
static inline int LeastSignificantSetBit(u8 val)
{
    return __builtin_ctz(static_cast<unsigned int>(val));
}
// Index of the lowest set bit of a 16-bit value, via the count-trailing-zeros builtin.
// NOTE(review): the builtin's result is presumably undefined for val == 0 - confirm.
static inline int LeastSignificantSetBit(u16 val)
{
    return __builtin_ctz(static_cast<unsigned int>(val));
}
// Index of the lowest set bit of a 32-bit value, via the count-trailing-zeros builtin.
// NOTE(review): the builtin's result is presumably undefined for val == 0 - confirm.
static inline int LeastSignificantSetBit(u32 val)
{
    return __builtin_ctz(static_cast<unsigned int>(val));
}
// Index of the lowest set bit of a 64-bit value, via the "long long"
// count-trailing-zeros builtin.
// NOTE(review): the builtin's result is presumably undefined for val == 0 - confirm.
static inline int LeastSignificantSetBit(u64 val)
{
    return __builtin_ctzll(static_cast<unsigned long long>(val));
}
#endif
// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
// using the set bits of an integer to represent a set of integers. Like that
// class, it acts like an array of bools:
// BitSet32 bs;
// bs[1] = true;
// but also like the underlying integer ([0] = least significant bit):
// BitSet32 bs2 = ...;
// bs = (bs ^ bs2) & BitSet32(0xffff);
// The following additional functionality is provided:
// - Construction using an initializer list.
// BitSet bs { 1, 2, 4, 8 };
// - Efficiently iterating through the set bits:
// for (int i : bs)
// [i is the *index* of a set bit]
// (This uses the appropriate CPU instruction to find the next set bit in one
// operation.)
// - Counting set bits using .Count() - see comment on that method.
// TODO: use constexpr when MSVC gets out of the Dark Ages
// Wraps a single unsigned integer of type IntTy and treats its bits as a set
// of bit indices, where index 0 is the least significant bit.
template <typename IntTy>
class BitSet
{
    static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
public:
    // A reference to a particular bit, returned from operator[].
    class Ref
    {
    public:
        Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
        Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
        // Reads the referenced bit.
        operator bool() const { return (m_bs->m_val & m_mask) != 0; }
        // Sets or clears the referenced bit in the owning BitSet and returns
        // the value that was written.
        bool operator=(bool set)
        {
            m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
            return set;
        }
    private:
        BitSet* m_bs; // BitSet whose bit is referenced
        IntTy m_mask; // mask selecting the referenced bit
    };

    // A STL-like iterator is required to be able to use range-based for loops.
    // m_val holds the bits that have not been visited yet; m_bit is the index
    // of the bit the iterator currently points at, or -1 for the end iterator.
    class Iterator
    {
    public:
        Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
        Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
        // Member-wise assignment; needs nothing beyond the two fields.
        Iterator& operator=(Iterator other)
        {
            m_val = other.m_val;
            m_bit = other.m_bit;
            return *this;
        }
        int operator*() { return m_bit; }
        // Advances to the next set bit, or to the end state (-1) when no
        // unvisited bits remain.
        Iterator& operator++()
        {
            if (m_val == 0)
            {
                m_bit = -1;
            }
            else
            {
                int bit = LeastSignificantSetBit(m_val);
                // Build the mask in IntTy: a plain int literal `1` cannot be
                // shifted to bit positions beyond the width of int, which
                // would break iteration over the upper bits of a 64-bit set.
                m_val &= ~((IntTy)1 << bit);
                m_bit = bit;
            }
            return *this;
        }
        Iterator operator++(int _)
        {
            Iterator other(*this);
            ++*this;
            return other;
        }
        // Iterators compare by current bit index only.
        bool operator==(Iterator other) const { return m_bit == other.m_bit; }
        bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
    private:
        IntTy m_val;
        int m_bit;
    };

    // Empty set.
    BitSet() : m_val(0) {}
    // Set whose members are the set bits of val.
    explicit BitSet(IntTy val) : m_val(val) {}
    // Set containing every bit index listed in init.
    BitSet(std::initializer_list<int> init)
    {
        m_val = 0;
        for (int bit : init)
            m_val |= (IntTy)1 << bit;
    }

    // Set with the lowest `count` bits set. The full-width case is handled
    // separately because shifting by the full width of IntTy is not valid.
    static BitSet AllTrue(size_t count)
    {
        return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1));
    }

    Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
    const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }

    // Comparisons and bitwise operators act on the underlying integer.
    bool operator==(BitSet other) const { return m_val == other.m_val; }
    bool operator!=(BitSet other) const { return m_val != other.m_val; }
    bool operator<(BitSet other) const { return m_val < other.m_val; }
    bool operator>(BitSet other) const { return m_val > other.m_val; }
    BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
    BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
    BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
    BitSet operator~() const { return BitSet(~m_val); }
    BitSet& operator|=(BitSet other) { return *this = *this | other; }
    BitSet& operator&=(BitSet other) { return *this = *this & other; }
    BitSet& operator^=(BitSet other) { return *this = *this ^ other; }

    operator u32() = delete;
    // True when at least one bit is set.
    operator bool() { return m_val != 0; }

    // Warning: Even though on modern CPUs this is a single fast instruction,
    // Dolphin's official builds do not currently assume POPCNT support on x86,
    // so slower explicit bit twiddling is generated. Still should generally
    // be faster than a loop.
    unsigned int Count() const { return CountSetBits(m_val); }

    // begin() starts from a placeholder position and advances once so that it
    // points at the first set bit (or equals end() for an empty set).
    Iterator begin() const { Iterator it(m_val, 0); return ++it; }
    Iterator end() const { return Iterator(m_val, -1); }

    IntTy m_val;
};
} // Common
// Convenience aliases for the supported underlying integer widths.
using BitSet8  = Common::BitSet<u8>;
using BitSet16 = Common::BitSet<u16>;
using BitSet32 = Common::BitSet<u32>;
using BitSet64 = Common::BitSet<u64>;

View file

@ -22,247 +22,69 @@ using namespace Gen;
// Shared code between Win64 and Unix64 // Shared code between Win64 and Unix64
// Sets up a __cdecl function. void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp) {
void XEmitter::ABI_EmitPrologue(int maxCallParams) size_t shadow = 0;
{ #if defined(_WIN32)
#ifdef _M_IX86 shadow = 0x20;
// Don't really need to do anything
#elif defined(ARCHITECTURE_x86_64)
#if _WIN32
int stacksize = ((maxCallParams + 1) & ~1) * 8 + 8;
// Set up a stack frame so that we can call functions
// TODO: use maxCallParams
SUB(64, R(RSP), Imm8(stacksize));
#endif
#else
#error Arch not supported
#endif #endif
int count = (mask & ABI_ALL_GPRS).Count();
rsp_alignment -= count * 8;
size_t subtraction = 0;
int fpr_count = (mask & ABI_ALL_FPRS).Count();
if (fpr_count) {
// If we have any XMMs to save, we must align the stack here.
subtraction = rsp_alignment & 0xf;
}
subtraction += 16 * fpr_count;
size_t xmm_base_subtraction = subtraction;
subtraction += needed_frame_size;
subtraction += shadow;
// Final alignment.
rsp_alignment -= subtraction;
subtraction += rsp_alignment & 0xf;
*shadowp = shadow;
*subtractionp = subtraction;
*xmm_offsetp = subtraction - xmm_base_subtraction;
} }
void XEmitter::ABI_EmitEpilogue(int maxCallParams) size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) {
{ size_t shadow, subtraction, xmm_offset;
#ifdef _M_IX86 ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
RET();
#elif defined(ARCHITECTURE_x86_64)
#ifdef _WIN32
int stacksize = ((maxCallParams+1)&~1)*8 + 8;
ADD(64, R(RSP), Imm8(stacksize));
#endif
RET();
#else
#error Arch not supported
for (int r : mask & ABI_ALL_GPRS)
PUSH((X64Reg)r);
#endif if (subtraction)
SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
for (int x : mask & ABI_ALL_FPRS) {
MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg)(x - 16));
xmm_offset += 16;
}
return shadow;
} }
#ifdef _M_IX86 // All32 void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size) {
size_t shadow, subtraction, xmm_offset;
ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
// Shared code between Win32 and Unix32 for (int x : mask & ABI_ALL_FPRS) {
void XEmitter::ABI_CallFunction(const void *func) { MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset));
ABI_AlignStack(0); xmm_offset += 16;
CALL(func); }
ABI_RestoreStack(0);
}
void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1) { if (subtraction)
ABI_AlignStack(1 * 2); ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
PUSH(16, Imm16(param1));
CALL(func);
ABI_RestoreStack(1 * 2);
}
void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2) { for (int r = 15; r >= 0; r--) {
ABI_AlignStack(1 * 2 + 1 * 4); if (mask[r])
PUSH(16, Imm16(param2)); POP((X64Reg)r);
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(1 * 2 + 1 * 4);
}
void XEmitter::ABI_CallFunctionC(const void *func, u32 param1) {
ABI_AlignStack(1 * 4);
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2) {
ABI_AlignStack(2 * 4);
PUSH(32, Imm32(param2));
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(2 * 4);
}
void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3) {
ABI_AlignStack(3 * 4);
PUSH(32, Imm32(param3));
PUSH(32, Imm32(param2));
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(3 * 4);
}
void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3) {
ABI_AlignStack(3 * 4);
PUSH(32, ImmPtr(param3));
PUSH(32, Imm32(param2));
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(3 * 4);
}
void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4) {
ABI_AlignStack(4 * 4);
PUSH(32, ImmPtr(param4));
PUSH(32, Imm32(param3));
PUSH(32, Imm32(param2));
PUSH(32, Imm32(param1));
CALL(func);
ABI_RestoreStack(4 * 4);
}
void XEmitter::ABI_CallFunctionP(const void *func, void *param1) {
ABI_AlignStack(1 * 4);
PUSH(32, ImmPtr(param1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
void XEmitter::ABI_CallFunctionPA(const void *func, void *param1, const Gen::OpArg &arg2) {
ABI_AlignStack(2 * 4);
PUSH(32, arg2);
PUSH(32, ImmPtr(param1));
CALL(func);
ABI_RestoreStack(2 * 4);
}
void XEmitter::ABI_CallFunctionPAA(const void *func, void *param1, const Gen::OpArg &arg2, const Gen::OpArg &arg3) {
ABI_AlignStack(3 * 4);
PUSH(32, arg3);
PUSH(32, arg2);
PUSH(32, ImmPtr(param1));
CALL(func);
ABI_RestoreStack(3 * 4);
}
void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3) {
ABI_AlignStack(3 * 4);
PUSH(32, Imm32(param3));
PUSH(32, ImmPtr(param2));
PUSH(32, ImmPtr(param1));
CALL(func);
ABI_RestoreStack(3 * 4);
}
// Pass a register as a parameter.
void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1) {
ABI_AlignStack(1 * 4);
PUSH(32, R(reg1));
CALL(func);
ABI_RestoreStack(1 * 4);
}
// Pass two registers as parameters.
void XEmitter::ABI_CallFunctionRR(const void *func, Gen::X64Reg reg1, Gen::X64Reg reg2)
{
ABI_AlignStack(2 * 4);
PUSH(32, R(reg2));
PUSH(32, R(reg1));
CALL(func);
ABI_RestoreStack(2 * 4);
}
void XEmitter::ABI_CallFunctionAC(const void *func, const Gen::OpArg &arg1, u32 param2)
{
ABI_AlignStack(2 * 4);
PUSH(32, Imm32(param2));
PUSH(32, arg1);
CALL(func);
ABI_RestoreStack(2 * 4);
}
void XEmitter::ABI_CallFunctionACC(const void *func, const Gen::OpArg &arg1, u32 param2, u32 param3)
{
ABI_AlignStack(3 * 4);
PUSH(32, Imm32(param3));
PUSH(32, Imm32(param2));
PUSH(32, arg1);
CALL(func);
ABI_RestoreStack(3 * 4);
}
void XEmitter::ABI_CallFunctionA(const void *func, const Gen::OpArg &arg1)
{
ABI_AlignStack(1 * 4);
PUSH(32, arg1);
CALL(func);
ABI_RestoreStack(1 * 4);
}
void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, const Gen::OpArg &arg2)
{
ABI_AlignStack(2 * 4);
PUSH(32, arg2);
PUSH(32, arg1);
CALL(func);
ABI_RestoreStack(2 * 4);
}
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
// Note: 4 * 4 = 16 bytes, so alignment is preserved.
PUSH(EBP);
PUSH(EBX);
PUSH(ESI);
PUSH(EDI);
}
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(EDI);
POP(ESI);
POP(EBX);
POP(EBP);
}
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
frameSize += 4; // reserve space for return address
unsigned int alignedSize =
#ifdef __GNUC__
(frameSize + 15) & -16;
#else
(frameSize + 3) & -4;
#endif
return alignedSize;
}
void XEmitter::ABI_AlignStack(unsigned int frameSize) {
// Mac OS X requires the stack to be 16-byte aligned before every call.
// Linux requires the stack to be 16-byte aligned before calls that put SSE
// vectors on the stack, but since we do not keep track of which calls do that,
// it is effectively every call as well.
// Windows binaries compiled with MSVC do not have such a restriction*, but I
// expect that GCC on Windows acts the same as GCC on Linux in this respect.
// It would be nice if someone could verify this.
// *However, the MSVC optimizing compiler assumes a 4-byte-aligned stack at times.
unsigned int fillSize =
ABI_GetAlignedFrameSize(frameSize) - (frameSize + 4);
if (fillSize != 0) {
SUB(32, R(ESP), Imm8(fillSize));
} }
} }
void XEmitter::ABI_RestoreStack(unsigned int frameSize) {
unsigned int alignedSize = ABI_GetAlignedFrameSize(frameSize);
alignedSize -= 4; // return address is POPped at end of call
if (alignedSize != 0) {
ADD(32, R(ESP), Imm8(alignedSize));
}
}
#else //64bit
// Common functions // Common functions
void XEmitter::ABI_CallFunction(const void *func) { void XEmitter::ABI_CallFunction(const void *func) {
u64 distance = u64(func) - (u64(code) + 5); u64 distance = u64(func) - (u64(code) + 5);
@ -538,143 +360,4 @@ void XEmitter::ABI_CallFunctionAA(const void *func, const Gen::OpArg &arg1, cons
} else { } else {
CALL(func); CALL(func);
} }
} }
unsigned int XEmitter::ABI_GetAlignedFrameSize(unsigned int frameSize) {
return frameSize;
}
#ifdef _WIN32
// The Windows x64 ABI requires XMM6 - XMM15 to be callee saved. 10 regs.
// But, not saving XMM4 and XMM5 breaks things in VS 2010, even though they are volatile regs.
// Let's just save all 16.
const int XMM_STACK_SPACE = 16 * 16;
// Win64 Specific Code
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
//we only want to do this once
PUSH(RBX);
PUSH(RSI);
PUSH(RDI);
PUSH(RBP);
PUSH(R12);
PUSH(R13);
PUSH(R14);
PUSH(R15);
ABI_AlignStack(0);
// Do this after aligning, because before it's offset by 8.
SUB(64, R(RSP), Imm32(XMM_STACK_SPACE));
for (int i = 0; i < 16; ++i)
MOVAPS(MDisp(RSP, i * 16), (X64Reg)(XMM0 + i));
}
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
for (int i = 0; i < 16; ++i)
MOVAPS((X64Reg)(XMM0 + i), MDisp(RSP, i * 16));
ADD(64, R(RSP), Imm32(XMM_STACK_SPACE));
ABI_RestoreStack(0);
POP(R15);
POP(R14);
POP(R13);
POP(R12);
POP(RBP);
POP(RDI);
POP(RSI);
POP(RBX);
}
// Win64 Specific Code
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(RCX);
PUSH(RDX);
PUSH(RSI);
PUSH(RDI);
PUSH(R8);
PUSH(R9);
PUSH(R10);
PUSH(R11);
// TODO: Callers preserve XMM4-5 (XMM0-3 are args.)
ABI_AlignStack(0);
}
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
ABI_RestoreStack(0);
POP(R11);
POP(R10);
POP(R9);
POP(R8);
POP(RDI);
POP(RSI);
POP(RDX);
POP(RCX);
}
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
SUB(64, R(RSP), Imm8(0x28));
}
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
ADD(64, R(RSP), Imm8(0x28));
}
#else
// Unix64 Specific Code
void XEmitter::ABI_PushAllCalleeSavedRegsAndAdjustStack() {
PUSH(RBX);
PUSH(RBP);
PUSH(R12);
PUSH(R13);
PUSH(R14);
PUSH(R15);
PUSH(R15); //just to align stack. duped push/pop doesn't hurt.
// TODO: XMM?
}
void XEmitter::ABI_PopAllCalleeSavedRegsAndAdjustStack() {
POP(R15);
POP(R15);
POP(R14);
POP(R13);
POP(R12);
POP(RBP);
POP(RBX);
}
void XEmitter::ABI_PushAllCallerSavedRegsAndAdjustStack() {
PUSH(RCX);
PUSH(RDX);
PUSH(RSI);
PUSH(RDI);
PUSH(R8);
PUSH(R9);
PUSH(R10);
PUSH(R11);
PUSH(R11);
}
void XEmitter::ABI_PopAllCallerSavedRegsAndAdjustStack() {
POP(R11);
POP(R11);
POP(R10);
POP(R9);
POP(R8);
POP(RDI);
POP(RSI);
POP(RDX);
POP(RCX);
}
void XEmitter::ABI_AlignStack(unsigned int /*frameSize*/) {
SUB(64, R(RSP), Imm8(0x08));
}
void XEmitter::ABI_RestoreStack(unsigned int /*frameSize*/) {
ADD(64, R(RSP), Imm8(0x08));
}
#endif // WIN32
#endif // 32bit

View file

@ -1,35 +1,15 @@
// Copyright (C) 2003 Dolphin Project. // Copyright 2008 Dolphin Emulator Project
// Licensed under GPLv2+
// This program is free software: you can redistribute it and/or modify // Refer to the license.txt file included.
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.
// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#pragma once #pragma once
#include "common/common_types.h" #include "common/bit_set.h"
#include "emitter.h"
// x86/x64 ABI:s, and helpers to help follow them when JIT-ing code. // x64 ABI:s, and helpers to help follow them when JIT-ing code.
// All convensions return values in EAX (+ possibly EDX). // All convensions return values in EAX (+ possibly EDX).
// Linux 32-bit, Windows 32-bit (cdecl, System V):
// * Caller pushes left to right
// * Caller fixes stack after call
// * function subtract from stack for local storage only.
// Scratch: EAX ECX EDX
// Callee-save: EBX ESI EDI EBP
// Parameters: -
// Windows 64-bit // Windows 64-bit
// * 4-reg "fastcall" variant, very new-skool stack handling // * 4-reg "fastcall" variant, very new-skool stack handling
// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_ // * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_
@ -44,18 +24,8 @@
// Callee-save: RBX RBP R12 R13 R14 R15 // Callee-save: RBX RBP R12 R13 R14 R15
// Parameters: RDI RSI RDX RCX R8 R9 // Parameters: RDI RSI RDX RCX R8 R9
#ifdef _M_IX86 // 32 bit calling convention, shared by all #define ABI_ALL_FPRS BitSet32(0xffff0000)
#define ABI_ALL_GPRS BitSet32(0x0000ffff)
// 32-bit don't pass parameters in regs, but these are convenient to have anyway when we have to
// choose regs to put stuff in.
#define ABI_PARAM1 RCX
#define ABI_PARAM2 RDX
// There are no ABI_PARAM* here, since args are pushed.
// 32-bit bog standard cdecl, shared between linux and windows
// MacOSX 32-bit is same as System V with a few exceptions that we probably don't care much about.
#elif ARCHITECTURE_x86_64 // 64 bit calling convention
#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention #ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
@ -64,7 +34,11 @@
#define ABI_PARAM3 R8 #define ABI_PARAM3 R8
#define ABI_PARAM4 R9 #define ABI_PARAM4 R9
#else //64-bit Unix (hopefully MacOSX too) // xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
#define ABI_ALL_CALLER_SAVED \
(BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \
XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 })
#else //64-bit Unix / OS X
#define ABI_PARAM1 RDI #define ABI_PARAM1 RDI
#define ABI_PARAM2 RSI #define ABI_PARAM2 RSI
@ -73,6 +47,13 @@
#define ABI_PARAM5 R8 #define ABI_PARAM5 R8
#define ABI_PARAM6 R9 #define ABI_PARAM6 R9
// TODO: Avoid pushing all 16 XMM registers when possible. Most functions we call probably
// don't actually clobber them.
#define ABI_ALL_CALLER_SAVED \
(BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \
ABI_ALL_FPRS)
#endif // WIN32 #endif // WIN32
#endif // X86 #define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
#define ABI_RETURN RAX

View file

@ -18,6 +18,7 @@
#pragma once #pragma once
#include "common/assert.h" #include "common/assert.h"
#include "common/bit_set.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "common/code_block.h" #include "common/code_block.h"
@ -356,7 +357,7 @@ private:
void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2); void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
void ABI_CalculateFrameSize(u32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
protected: protected:
void Write8(u8 value); void Write8(u8 value);
@ -1007,25 +1008,26 @@ public:
ABI_CallFunctionC((const void*)func, param1); ABI_CallFunctionC((const void*)func, param1);
} }
// A function that doesn't have any control over what it will do to regs, /**
// such as the dispatcher, should be surrounded by these. * Saves specified registers and adjusts the stack to be 16-byte aligned as required by the ABI
void ABI_PushAllCalleeSavedRegsAndAdjustStack(); *
void ABI_PopAllCalleeSavedRegsAndAdjustStack(); * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs)
* @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8
* @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack
* @return Size of the shadow space, i.e., offset of the frame
*/
size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
// A function that doesn't know anything about it's surroundings, should /**
// be surrounded by these to establish a safe environment, where it can roam free. * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before
// An example is a backpatch injected function. * the matching PushRegistersAndAdjustStack.
void ABI_PushAllCallerSavedRegsAndAdjustStack(); *
void ABI_PopAllCallerSavedRegsAndAdjustStack(); * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are GPRs)
* @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8
unsigned int ABI_GetAlignedFrameSize(unsigned int frameSize); * @param needed_frame_size Additional space that was needed
void ABI_AlignStack(unsigned int frameSize); * @warning Stack must be currently 16-byte aligned
void ABI_RestoreStack(unsigned int frameSize); */
void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
// Sets up a __cdecl function.
// Only x64 really needs the parameter count.
void ABI_EmitPrologue(int maxCallParams);
void ABI_EmitEpilogue(int maxCallParams);
#ifdef _M_IX86 #ifdef _M_IX86
static int ABI_GetNumXMMRegs() { return 8; } static int ABI_GetNumXMMRegs() { return 8; }

View file

@ -122,6 +122,14 @@ static const X64Reg ONE = XMM14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
static const X64Reg NEGBIT = XMM15; static const X64Reg NEGBIT = XMM15;
// State registers that must not be modified by external functions calls
// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed
static const BitSet32 persistent_regs = {
UNIFORMS, REGISTERS, // Pointers to register blocks
ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers
ONE+16, NEGBIT+16, // Constants
};
/// Raw constant for the source register selector that indicates no swizzling is performed /// Raw constant for the source register selector that indicates no swizzling is performed
static const u8 NO_SRC_REG_SWIZZLE = 0x1b; static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
/// Raw constant for the destination register enable mask that indicates all components are enabled /// Raw constant for the destination register enable mask that indicates all components are enabled
@ -295,20 +303,8 @@ void JitCompiler::Compile_UniformCondition(Instruction instr) {
CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0));
} }
void JitCompiler::Compile_PushCallerSavedXMM() { BitSet32 JitCompiler::PersistentCallerSavedRegs() {
#ifndef _WIN32 return persistent_regs & ABI_ALL_CALLER_SAVED;
SUB(64, R(RSP), Imm8(2 * 16));
MOVUPS(MDisp(RSP, 16), ONE);
MOVUPS(MDisp(RSP, 0), NEGBIT);
#endif
}
void JitCompiler::Compile_PopCallerSavedXMM() {
#ifndef _WIN32
MOVUPS(NEGBIT, MDisp(RSP, 0));
MOVUPS(ONE, MDisp(RSP, 16));
ADD(64, R(RSP), Imm8(2 * 16));
#endif
} }
void JitCompiler::Compile_ADD(Instruction instr) { void JitCompiler::Compile_ADD(Instruction instr) {
@ -390,12 +386,9 @@ void JitCompiler::Compile_EX2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
MOVSS(XMM0, R(SRC1)); MOVSS(XMM0, R(SRC1));
// The following will actually break the stack alignment ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0);
ABI_PushAllCallerSavedRegsAndAdjustStack();
Compile_PushCallerSavedXMM();
ABI_CallFunction(reinterpret_cast<const void*>(exp2f)); ABI_CallFunction(reinterpret_cast<const void*>(exp2f));
Compile_PopCallerSavedXMM(); ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0);
ABI_PopAllCallerSavedRegsAndAdjustStack();
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
MOVAPS(SRC1, R(XMM0)); MOVAPS(SRC1, R(XMM0));
@ -406,12 +399,9 @@ void JitCompiler::Compile_LG2(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
MOVSS(XMM0, R(SRC1)); MOVSS(XMM0, R(SRC1));
// The following will actually break the stack alignment ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0);
ABI_PushAllCallerSavedRegsAndAdjustStack();
Compile_PushCallerSavedXMM();
ABI_CallFunction(reinterpret_cast<const void*>(log2f)); ABI_CallFunction(reinterpret_cast<const void*>(log2f));
Compile_PopCallerSavedXMM(); ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0);
ABI_PopAllCallerSavedRegsAndAdjustStack();
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
MOVAPS(SRC1, R(XMM0)); MOVAPS(SRC1, R(XMM0));
@ -560,7 +550,7 @@ void JitCompiler::Compile_NOP(Instruction instr) {
} }
void JitCompiler::Compile_END(Instruction instr) { void JitCompiler::Compile_END(Instruction instr) {
ABI_PopAllCalleeSavedRegsAndAdjustStack(); ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
RET(); RET();
} }
@ -756,7 +746,8 @@ CompiledShader* JitCompiler::Compile() {
const auto& code = g_state.vs.program_code; const auto& code = g_state.vs.program_code;
unsigned offset = g_state.regs.vs.main_offset; unsigned offset = g_state.regs.vs.main_offset;
ABI_PushAllCalleeSavedRegsAndAdjustStack(); // The stack pointer is 8 modulo 16 at the entry of a procedure
ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8);
MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1)); MOV(PTRBITS, R(REGISTERS), R(ABI_PARAM1));
MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms)); MOV(PTRBITS, R(UNIFORMS), ImmPtr(&g_state.vs.uniforms));

View file

@ -77,8 +77,7 @@ private:
void Compile_EvaluateCondition(Instruction instr); void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr); void Compile_UniformCondition(Instruction instr);
void Compile_PushCallerSavedXMM(); BitSet32 PersistentCallerSavedRegs();
void Compile_PopCallerSavedXMM();
/// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks.
unsigned* offset_ptr = nullptr; unsigned* offset_ptr = nullptr;