Add all the files

Exzap 2022-08-22 22:21:23 +02:00
parent e3db07a16a
commit d60742f52b
1445 changed files with 430238 additions and 0 deletions


@@ -0,0 +1,293 @@
#pragma once
#include "Cafe/HW/Espresso/EspressoISA.h"
#include "Cafe/HW/MMU/MMU.h"
bool GamePatch_IsNonReturnFunction(uint32 hleIndex);
// utility class to determine the shape of a function
class PPCFunctionBoundaryTracker
{
public:
struct PPCRange_t
{
PPCRange_t() {};
PPCRange_t(uint32 _startAddress) : startAddress(_startAddress) {};
uint32 startAddress{};
uint32 length{};
//bool isProcessed{false};
uint32 getEndAddress() const { return startAddress + length; };
};
public:
void trackStartPoint(MPTR startAddress)
{
processRange(startAddress, nullptr, nullptr);
processBranchTargets();
}
bool getRangeForAddress(uint32 address, PPCRange_t& range)
{
for (auto itr : map_ranges)
{
if (address >= itr->startAddress && address < (itr->startAddress + itr->length))
{
range = *itr;
return true;
}
}
return false;
}
private:
void addBranchDestination(PPCRange_t* sourceRange, MPTR address)
{
map_branchTargets.emplace(address);
}
// process flow of instruction
// returns false if the IP cannot increment past the current instruction
bool processInstruction(PPCRange_t* range, MPTR address)
{
// parse instructions
uint32 opcode = memory_readU32(address);
switch (Espresso::GetPrimaryOpcode(opcode))
{
case Espresso::PrimaryOpcode::ZERO:
{
if (opcode == 0)
return false; // invalid instruction
break;
}
case Espresso::PrimaryOpcode::VIRTUAL_HLE:
{
// end of function
// is there a jump to an instruction after this one?
uint32 hleFuncId = opcode & 0xFFFF;
if (hleFuncId >= 0x1000 && hleFuncId < 0x4000)
{
if (GamePatch_IsNonReturnFunction(hleFuncId - 0x1000) == false)
{
return true;
}
}
return false;
}
case Espresso::PrimaryOpcode::BC:
{
uint32 BD, BI;
Espresso::BOField BO;
bool AA, LK;
Espresso::decodeOp_BC(opcode, BD, BO, BI, AA, LK);
uint32 branchTarget = AA ? BD : BD + address;
if (!LK)
addBranchDestination(range, branchTarget);
break;
}
case Espresso::PrimaryOpcode::B:
{
uint32 LI;
bool AA, LK;
Espresso::decodeOp_B(opcode, LI, AA, LK);
uint32 branchTarget = AA ? LI : LI + address;
if (!LK)
{
addBranchDestination(range, branchTarget);
// if the next two or previous two instructions are unconditional branch instructions, we assume that they are entries of a jump table
// todo - can we make this more reliable by checking for BCTR or similar instructions first?
// example: The Swapper 0x01B1FC04
if ((PPCRecompilerCalcFuncSize_isUnconditionalBranchInstruction(memory_readU32(address + 4)) && PPCRecompilerCalcFuncSize_isUnconditionalBranchInstruction(memory_readU32(address + 8))) ||
(PPCRecompilerCalcFuncSize_isUnconditionalBranchInstruction(memory_readU32(address - 8)) && PPCRecompilerCalcFuncSize_isUnconditionalBranchInstruction(memory_readU32(address - 4))))
{
return true;
}
return false; // current flow ends at unconditional branch instruction
}
break;
}
case Espresso::PrimaryOpcode::GROUP_19:
switch (Espresso::GetGroup19Opcode(opcode))
{
case Espresso::Opcode19::BCLR:
{
Espresso::BOField BO;
uint32 BI;
bool LK;
Espresso::decodeOp_BCLR(opcode, BO, BI, LK);
if (BO.branchAlways() && !LK)
{
// unconditional BLR
return false;
}
break;
}
case Espresso::Opcode19::BCCTR:
if (opcode == 0x4E800420)
{
// unconditional BCTR
// this instruction is often used for switch statements, therefore we should be wary of ending the function here
// It's better to overestimate function size than to predict sizes that are too short
// Currently we only end the function if the BCTR is followed by a NOP (alignment) or invalid instruction
// todo: improve robustness, find better ways to handle false positives
uint32 nextOpcode = memory_readU32(address + 4);
if (nextOpcode == 0x60000000 || PPCRecompilerCalcFuncSize_isValidInstruction(nextOpcode) == false)
{
return false;
}
return true;
}
// conditional BCTR
return true;
default:
break;
}
break;
default:
break;
}
return true;
}
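// Illustrative example of the jump table pattern the heuristic above targets
// (addresses are modeled on The Swapper example and should be read as assumptions):
// a computed BCTR dispatch followed by a dense block of unconditional branches.
// Flow must not end at any single `b`, since the neighboring entries remain reachable.
//   0x01B1FBFC  bctr         ; computed jump into the table
//   0x01B1FC00  b case0
//   0x01B1FC04  b case1      ; neighbors at +/-4 and +/-8 are also `b`
//   0x01B1FC08  b case2      ; -> treated as jump table entries, flow continues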
void checkForCollisions()
{
#ifndef PUBLIC_RELEASE
uint32 endOfPrevious = 0;
for (auto itr : map_ranges)
{
if (endOfPrevious > itr->startAddress)
{
cemu_assert_debug(false);
}
endOfPrevious = itr->startAddress + itr->length;
}
#endif
}
// nextRange must point to the closest range after startAddress, or NULL if there is none
void processRange(MPTR startAddress, PPCRange_t* previousRange, PPCRange_t* nextRange)
{
checkForCollisions();
cemu_assert_debug(previousRange == nullptr || (startAddress == (previousRange->startAddress + previousRange->length)));
PPCRange_t* newRange;
if (previousRange && (previousRange->startAddress + previousRange->length) == startAddress)
{
newRange = previousRange;
}
else
{
cemu_assert_debug(previousRange == nullptr);
newRange = new PPCRange_t(startAddress);
map_ranges.emplace(newRange);
}
// process instruction flow until it is interrupted by an unconditional branch
MPTR currentAddress = startAddress;
MPTR endAddress = 0xFFFFFFFF;
if (nextRange)
endAddress = nextRange->startAddress;
while (currentAddress < endAddress)
{
if (!processInstruction(newRange, currentAddress))
{
currentAddress += 4;
break;
}
currentAddress += 4;
}
newRange->length = currentAddress - newRange->startAddress;
if (nextRange && currentAddress >= nextRange->startAddress)
{
// merge with next range
newRange->length = (nextRange->startAddress + nextRange->length) - newRange->startAddress;
map_ranges.erase(nextRange);
delete nextRange;
checkForCollisions();
return;
}
checkForCollisions();
}
// find first unvisited branch target and start a new range there
// return true if method should be called again
bool processBranchTargetsSinglePass()
{
cemu_assert_debug(!map_ranges.empty());
auto rangeItr = map_ranges.begin();
PPCRange_t* previousRange = nullptr;
for (std::set<uint32>::const_iterator targetItr = map_branchTargets.begin(); targetItr != map_branchTargets.end(); )
{
while (rangeItr != map_ranges.end() && ((*rangeItr)->startAddress + (*rangeItr)->length) <= (*targetItr))
{
previousRange = *rangeItr;
rangeItr++;
if (rangeItr == map_ranges.end())
{
// last range reached
if ((previousRange->startAddress + previousRange->length) == *targetItr)
processRange(*targetItr, previousRange, nullptr);
else
processRange(*targetItr, nullptr, nullptr);
return true;
}
}
if ((*targetItr) >= (*rangeItr)->startAddress &&
(*targetItr) < ((*rangeItr)->startAddress + (*rangeItr)->length))
{
// delete visited targets
targetItr = map_branchTargets.erase(targetItr);
continue;
}
cemu_assert_debug((*rangeItr)->startAddress > (*targetItr));
if (previousRange && (previousRange->startAddress + previousRange->length) == *targetItr)
processRange(*targetItr, previousRange, *rangeItr); // extend previousRange
else
processRange(*targetItr, nullptr, *rangeItr);
return true;
}
return false;
}
void processBranchTargets()
{
while (processBranchTargetsSinglePass());
}
private:
bool PPCRecompilerCalcFuncSize_isUnconditionalBranchInstruction(uint32 opcode)
{
if (Espresso::GetPrimaryOpcode(opcode) == Espresso::PrimaryOpcode::B)
{
uint32 LI;
bool AA, LK;
Espresso::decodeOp_B(opcode, LI, AA, LK);
if (!LK)
return true;
}
return false;
}
bool PPCRecompilerCalcFuncSize_isValidInstruction(uint32 opcode)
{
if ((opcode >> 26) == 0)
return false;
return true;
}
private:
struct RangePtrCmp
{
bool operator()(const PPCRange_t* lhs, const PPCRange_t* rhs) const
{
return lhs->startAddress < rhs->startAddress;
}
};
std::set<PPCRange_t*, RangePtrCmp> map_ranges;
std::set<uint32> map_branchTargets;
};
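// Usage sketch (illustrative, not part of this header): discover the range that
// contains a known entry point. The address below is a made-up example value.
//
//   PPCFunctionBoundaryTracker tracker;
//   tracker.trackStartPoint(0x02000000);
//   PPCFunctionBoundaryTracker::PPCRange_t range;
//   if (tracker.getRangeForAddress(0x02000000, range))
//       printf("function spans 0x%08x-0x%08x\n", range.startAddress, range.getEndAddress());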


@@ -0,0 +1,593 @@
#include "Cafe/HW/Espresso/Interpreter/PPCInterpreterInternal.h"
#include "PPCFunctionBoundaryTracker.h"
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h"
#include "Cafe/OS/RPL/rpl.h"
#include "util/containers/RangeStore.h"
#include "Cafe/OS/libs/coreinit/coreinit_CodeGen.h"
#include "config/ActiveSettings.h"
#include "config/LaunchSettings.h"
#include "util/helpers/fspinlock.h"
#include "Common/ExceptionHandler/ExceptionHandler.h"
#include "util/helpers/helpers.h"
#include "util/MemMapper/MemMapper.h"
struct PPCInvalidationRange
{
MPTR startAddress;
uint32 size;
PPCInvalidationRange(MPTR _startAddress, uint32 _size) : startAddress(_startAddress), size(_size) {};
};
struct
{
FSpinlock recompilerSpinlock;
std::queue<MPTR> targetQueue;
std::vector<PPCInvalidationRange> invalidationRanges;
}PPCRecompilerState;
RangeStore<PPCRecFunction_t*, uint32, 7703, 0x2000> rangeStore_ppcRanges;
void ATTR_MS_ABI (*PPCRecompiler_enterRecompilerCode)(uint64 codeMem, uint64 ppcInterpreterInstance);
void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_visited)();
void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
PPCRecompilerInstanceData_t* ppcRecompilerInstanceData;
bool ppcRecompilerEnabled = false;
// this function never blocks and can fail if the recompiler lock cannot be acquired immediately
void PPCRecompiler_visitAddressNoBlock(uint32 enterAddress)
{
// quick read-only check without lock
if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] != PPCRecompiler_leaveRecompilerCode_unvisited)
return;
// try to acquire lock
if (!PPCRecompilerState.recompilerSpinlock.tryAcquire())
return;
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr != PPCRecompiler_leaveRecompilerCode_unvisited)
{
// was visited since previous check
PPCRecompilerState.recompilerSpinlock.release();
return;
}
// add to recompilation queue and flag as visited
PPCRecompilerState.targetQueue.emplace(enterAddress);
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4] = PPCRecompiler_leaveRecompilerCode_visited;
PPCRecompilerState.recompilerSpinlock.release();
}
void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress)
{
if (ppcRecompilerEnabled == false)
return;
PPCRecompiler_visitAddressNoBlock(enterAddress);
}
void PPCRecompiler_enter(PPCInterpreter_t* hCPU, PPCREC_JUMP_ENTRY funcPtr)
{
#if BOOST_OS_WINDOWS > 0
uint32 prevState = _controlfp(0, 0);
_controlfp(_RC_NEAR, _MCW_RC);
PPCRecompiler_enterRecompilerCode((uint64)funcPtr, (uint64)hCPU);
_controlfp(prevState, _MCW_RC);
// debug recompiler exit - useful to find frequently executed functions which couldn't be recompiled
#ifndef PUBLIC_RELEASE
if (hCPU->remainingCycles > 0 && GetAsyncKeyState(VK_F4))
{
auto t = std::chrono::high_resolution_clock::now();
auto dur = std::chrono::duration_cast<std::chrono::microseconds>(t.time_since_epoch()).count();
forceLog_printf("Recompiler exit: 0x%08x LR: 0x%08x Timestamp %lld.%04lld", hCPU->instructionPointer, hCPU->spr.LR, dur / 1000LL, (dur % 1000LL));
}
#endif
#else
PPCRecompiler_enterRecompilerCode((uint64)funcPtr, (uint64)hCPU);
#endif
// after leaving the recompiler prematurely, attempt to recompile the code at the new location
if (hCPU->remainingCycles > 0)
{
PPCRecompiler_visitAddressNoBlock(hCPU->instructionPointer);
}
}
void PPCRecompiler_attemptEnterWithoutRecompile(PPCInterpreter_t* hCPU, uint32 enterAddress)
{
cemu_assert_debug(hCPU->instructionPointer == enterAddress);
if (ppcRecompilerEnabled == false)
return;
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr != PPCRecompiler_leaveRecompilerCode_unvisited && funcPtr != PPCRecompiler_leaveRecompilerCode_visited)
{
cemu_assert_debug(ppcRecompilerInstanceData != nullptr);
PPCRecompiler_enter(hCPU, funcPtr);
}
}
void PPCRecompiler_attemptEnter(PPCInterpreter_t* hCPU, uint32 enterAddress)
{
cemu_assert_debug(hCPU->instructionPointer == enterAddress);
if (ppcRecompilerEnabled == false)
return;
if (hCPU->remainingCycles <= 0)
return;
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr == PPCRecompiler_leaveRecompilerCode_unvisited)
{
PPCRecompiler_visitAddressNoBlock(enterAddress);
}
else if (funcPtr != PPCRecompiler_leaveRecompilerCode_visited)
{
// enter
cemu_assert_debug(ppcRecompilerInstanceData != nullptr);
PPCRecompiler_enter(hCPU, funcPtr);
}
}
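// Minimal sketch of a hypothetical caller (not part of this file): an interpreter
// dispatch loop would try to enter recompiled code at each block boundary and fall
// back to interpretation otherwise, with attemptEnter queueing unvisited addresses
// for the background recompiler thread.
//
//   void DispatchBlock(PPCInterpreter_t* hCPU)
//   {
//       PPCRecompiler_attemptEnter(hCPU, hCPU->instructionPointer);
//       // ...interpret at hCPU->instructionPointer if no recompiled code was entered...
//   }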
PPCRecFunction_t* PPCRecompiler_recompileFunction(PPCFunctionBoundaryTracker::PPCRange_t range, std::set<uint32>& entryAddresses, std::vector<std::pair<MPTR, uint32>>& entryPointsOut)
{
if (range.startAddress >= PPC_REC_CODE_AREA_END)
{
cemuLog_force("Attempting to recompile function outside of allowed code area");
return nullptr;
}
uint32 codeGenRangeStart;
uint32 codeGenRangeSize = 0;
coreinit::OSGetCodegenVirtAddrRangeInternal(codeGenRangeStart, codeGenRangeSize);
if (codeGenRangeSize != 0)
{
if (range.startAddress >= codeGenRangeStart && range.startAddress < (codeGenRangeStart + codeGenRangeSize))
{
if (coreinit::codeGenShouldAvoid())
{
return nullptr;
}
}
}
PPCRecFunction_t* ppcRecFunc = new PPCRecFunction_t();
ppcRecFunc->ppcAddress = range.startAddress;
ppcRecFunc->ppcSize = range.length;
// generate intermediate code
ppcImlGenContext_t ppcImlGenContext = { 0 };
bool compiledSuccessfully = PPCRecompiler_generateIntermediateCode(ppcImlGenContext, ppcRecFunc, entryAddresses);
if (compiledSuccessfully == false)
{
// todo: Free everything
PPCRecompiler_freeContext(&ppcImlGenContext);
delete ppcRecFunc;
return nullptr;
}
// emit x64 code
bool x64GenerationSuccess = PPCRecompiler_generateX64Code(ppcRecFunc, &ppcImlGenContext);
if (x64GenerationSuccess == false)
{
PPCRecompiler_freeContext(&ppcImlGenContext);
delete ppcRecFunc; // don't leak the function object when x64 emission fails
return nullptr;
}
// collect list of PPC-->x64 entry points
entryPointsOut.clear();
for (sint32 s = 0; s < ppcImlGenContext.segmentListCount; s++)
{
PPCRecImlSegment_t* imlSegment = ppcImlGenContext.segmentList[s];
if (imlSegment->isEnterable == false)
continue;
uint32 ppcEnterOffset = imlSegment->enterPPCAddress;
uint32 x64Offset = imlSegment->x64Offset;
entryPointsOut.emplace_back(ppcEnterOffset, x64Offset);
}
PPCRecompiler_freeContext(&ppcImlGenContext);
return ppcRecFunc;
}
bool PPCRecompiler_makeRecompiledFunctionActive(uint32 initialEntryPoint, PPCFunctionBoundaryTracker::PPCRange_t& range, PPCRecFunction_t* ppcRecFunc, std::vector<std::pair<MPTR, uint32>>& entryPoints)
{
// update jump table
PPCRecompilerState.recompilerSpinlock.acquire();
// check if the initial entrypoint is still flagged for recompilation
// it's possible that the range has been invalidated during the time it took to translate the function
if (ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[initialEntryPoint / 4] != PPCRecompiler_leaveRecompilerCode_visited)
{
PPCRecompilerState.recompilerSpinlock.release();
return false;
}
// check if the current range got invalidated in the time it took to recompile it
bool isInvalidated = false;
for (auto& invRange : PPCRecompilerState.invalidationRanges)
{
MPTR rStartAddr = invRange.startAddress;
MPTR rEndAddr = rStartAddr + invRange.size;
for (auto& recFuncRange : ppcRecFunc->list_ranges)
{
if (recFuncRange.ppcAddress < (rEndAddr) && (recFuncRange.ppcAddress + recFuncRange.ppcSize) >= rStartAddr)
{
isInvalidated = true;
break;
}
}
}
PPCRecompilerState.invalidationRanges.clear();
if (isInvalidated)
{
PPCRecompilerState.recompilerSpinlock.release();
return false;
}
// update jump table
for (auto& itr : entryPoints)
{
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[itr.first / 4] = (PPCREC_JUMP_ENTRY)((uint8*)ppcRecFunc->x86Code + itr.second);
}
// due to inlining, some entrypoints can get optimized away
// therefore we reset all addresses that are still marked as visited (but not recompiled)
// we don't remove the addresses from the queue, but any address that's not marked as visited won't get recompiled
// if they are reachable, the interpreter will queue them again
for (uint32 v = range.startAddress; v <= (range.startAddress + range.length); v += 4)
{
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[v / 4];
if (funcPtr == PPCRecompiler_leaveRecompilerCode_visited)
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[v / 4] = PPCRecompiler_leaveRecompilerCode_unvisited;
}
// register ranges
for (auto& r : ppcRecFunc->list_ranges)
{
r.storedRange = rangeStore_ppcRanges.storeRange(ppcRecFunc, r.ppcAddress, r.ppcAddress + r.ppcSize);
}
PPCRecompilerState.recompilerSpinlock.release();
return true;
}
void PPCRecompiler_recompileAtAddress(uint32 address)
{
cemu_assert_debug(ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[address / 4] == PPCRecompiler_leaveRecompilerCode_visited);
// get size
PPCFunctionBoundaryTracker funcBoundaries;
funcBoundaries.trackStartPoint(address);
// get range that encompasses address
PPCFunctionBoundaryTracker::PPCRange_t range;
if (funcBoundaries.getRangeForAddress(address, range) == false)
{
cemu_assert_debug(false);
}
// todo - use info from previously compiled ranges to determine full size of this function (and merge all the entryAddresses)
// collect all currently known entry points for this range
PPCRecompilerState.recompilerSpinlock.acquire();
std::set<uint32> entryAddresses;
entryAddresses.emplace(address);
PPCRecompilerState.recompilerSpinlock.release();
std::vector<std::pair<MPTR, uint32>> functionEntryPoints;
auto func = PPCRecompiler_recompileFunction(range, entryAddresses, functionEntryPoints);
if (!func)
{
return; // recompilation failed
}
bool r = PPCRecompiler_makeRecompiledFunctionActive(address, range, func, functionEntryPoints);
}
void PPCRecompiler_thread()
{
SetThreadName("PPCRecompiler_thread");
while (true)
{
std::this_thread::sleep_for(std::chrono::milliseconds(10));
// asynchronous recompilation:
// 1) take address from queue
// 2) check if address is still marked as visited
// 3) if yes -> calculate size, gather all entry points, recompile and update jump table
while (true)
{
PPCRecompilerState.recompilerSpinlock.acquire();
if (PPCRecompilerState.targetQueue.empty())
{
PPCRecompilerState.recompilerSpinlock.release();
break;
}
auto enterAddress = PPCRecompilerState.targetQueue.front();
PPCRecompilerState.targetQueue.pop();
auto funcPtr = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[enterAddress / 4];
if (funcPtr != PPCRecompiler_leaveRecompilerCode_visited)
{
// only recompile functions if marked as visited
PPCRecompilerState.recompilerSpinlock.release();
continue;
}
PPCRecompilerState.recompilerSpinlock.release();
PPCRecompiler_recompileAtAddress(enterAddress);
}
}
}
#define PPC_REC_ALLOC_BLOCK_SIZE (4*1024*1024) // 4MB
std::bitset<(MEMORY_CODEAREA_ADDR + MEMORY_CODEAREA_SIZE) / PPC_REC_ALLOC_BLOCK_SIZE> ppcRecompiler_reservedBlockMask;
void PPCRecompiler_reserveLookupTableBlock(uint32 offset)
{
uint32 blockIndex = offset / PPC_REC_ALLOC_BLOCK_SIZE;
offset = blockIndex * PPC_REC_ALLOC_BLOCK_SIZE;
if (ppcRecompiler_reservedBlockMask[blockIndex])
return;
ppcRecompiler_reservedBlockMask[blockIndex] = true;
void* p1 = MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->ppcRecompilerFuncTable[offset/4]), (PPC_REC_ALLOC_BLOCK_SIZE/4)*sizeof(void*), MemMapper::PAGE_PERMISSION::P_RW, true);
void* p3 = MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset/4]), (PPC_REC_ALLOC_BLOCK_SIZE/4)*sizeof(void*), MemMapper::PAGE_PERMISSION::P_RW, true);
if( !p1 || !p3 )
{
forceLog_printf("Failed to allocate memory for recompiler (0x%08x)", offset);
cemu_assert(false);
return;
}
for(uint32 i=0; i<PPC_REC_ALLOC_BLOCK_SIZE/4; i++)
{
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset/4+i] = PPCRecompiler_leaveRecompilerCode_unvisited;
}
}
void PPCRecompiler_allocateRange(uint32 startAddress, uint32 size)
{
if (ppcRecompilerInstanceData == nullptr)
return;
uint32 endAddress = (startAddress + size + PPC_REC_ALLOC_BLOCK_SIZE - 1) & ~(PPC_REC_ALLOC_BLOCK_SIZE-1);
startAddress = (startAddress) & ~(PPC_REC_ALLOC_BLOCK_SIZE-1);
startAddress = std::min(startAddress, (uint32)MEMORY_CODEAREA_ADDR + MEMORY_CODEAREA_SIZE);
endAddress = std::min(endAddress, (uint32)MEMORY_CODEAREA_ADDR + MEMORY_CODEAREA_SIZE);
for (uint32 i = startAddress; i < endAddress; i += PPC_REC_ALLOC_BLOCK_SIZE)
{
PPCRecompiler_reserveLookupTableBlock(i);
}
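// Worked example (illustrative values): PPCRecompiler_allocateRange(0x02012340, 0x100)
// rounds the range outward to the enclosing 4MB blocks, giving startAddress 0x02000000
// and endAddress 0x02400000, so exactly one lookup table block (index 8) is reserved.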
}
struct ppcRecompilerFuncRange_t
{
MPTR ppcStart;
uint32 ppcSize;
void* x86Start;
size_t x86Size;
};
DLLEXPORT bool PPCRecompiler_findFuncRanges(uint32 addr, ppcRecompilerFuncRange_t* rangesOut, size_t* countInOut)
{
PPCRecompilerState.recompilerSpinlock.acquire();
size_t countIn = *countInOut;
size_t countOut = 0;
rangeStore_ppcRanges.findRanges(addr, addr + 4, [rangesOut, countIn, &countOut](uint32 start, uint32 end, PPCRecFunction_t* func)
{
if (countOut < countIn)
{
rangesOut[countOut].ppcStart = start;
rangesOut[countOut].ppcSize = (end-start);
rangesOut[countOut].x86Start = func->x86Code;
rangesOut[countOut].x86Size = func->x86Size;
}
countOut++;
}
);
PPCRecompilerState.recompilerSpinlock.release();
*countInOut = countOut;
if (countOut > countIn)
return false;
return true;
}
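// Usage sketch for the count-in/count-out convention above (illustrative): the caller
// passes the capacity of rangesOut via countInOut; on return it holds the number of
// matching ranges, and a false return signals that the buffer was too small.
//
//   ppcRecompilerFuncRange_t ranges[4];
//   size_t count = 4;
//   if (!PPCRecompiler_findFuncRanges(0x02000000, ranges, &count))
//       ; // count now holds the required number of entries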
DLLEXPORT uintptr_t* PPCRecompiler_getJumpTableBase()
{
if (ppcRecompilerInstanceData == nullptr)
return nullptr;
return (uintptr_t*)ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable;
}
void PPCRecompiler_invalidateTableRange(uint32 offset, uint32 size)
{
if (ppcRecompilerInstanceData == nullptr)
return;
for (uint32 i = 0; i < size / 4; i++)
{
ppcRecompilerInstanceData->ppcRecompilerFuncTable[offset / 4 + i] = nullptr;
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[offset / 4 + i] = PPCRecompiler_leaveRecompilerCode_unvisited;
}
}
void PPCRecompiler_deleteFunction(PPCRecFunction_t* func)
{
// assumes PPCRecompilerState.recompilerSpinlock is already held
cemu_assert_debug(PPCRecompilerState.recompilerSpinlock.isHolding());
for (auto& r : func->list_ranges)
{
PPCRecompiler_invalidateTableRange(r.ppcAddress, r.ppcSize);
if(r.storedRange)
rangeStore_ppcRanges.deleteRange(r.storedRange);
r.storedRange = nullptr;
}
// todo - free x86 code
}
DLLEXPORT void PPCRecompiler_invalidateRange(uint32 startAddr, uint32 endAddr)
{
if (ppcRecompilerEnabled == false)
return;
if (startAddr >= PPC_REC_CODE_AREA_SIZE)
return;
cemu_assert_debug(endAddr >= startAddr);
PPCRecompilerState.recompilerSpinlock.acquire();
uint32 rStart;
uint32 rEnd;
PPCRecFunction_t* rFunc;
// mark range as unvisited
for (uint64 currentAddr = (uint64)startAddr&~3; currentAddr < (uint64)(endAddr&~3); currentAddr += 4)
ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[currentAddr / 4] = PPCRecompiler_leaveRecompilerCode_unvisited;
// add entry to invalidation queue
PPCRecompilerState.invalidationRanges.emplace_back(startAddr, endAddr-startAddr);
while (rangeStore_ppcRanges.findFirstRange(startAddr, endAddr, rStart, rEnd, rFunc) )
{
PPCRecompiler_deleteFunction(rFunc);
}
PPCRecompilerState.recompilerSpinlock.release();
}
void PPCRecompiler_init()
{
if (ActiveSettings::GetCPUMode() == CPUMode::SinglecoreInterpreter)
{
ppcRecompilerEnabled = false;
return;
}
if (LaunchSettings::ForceInterpreter())
{
cemuLog_log(LogType::Force, "Recompiler disabled. Command line --force-interpreter was passed");
return;
}
if (ppcRecompilerInstanceData)
{
MemMapper::FreeReservation(ppcRecompilerInstanceData, sizeof(PPCRecompilerInstanceData_t));
ppcRecompilerInstanceData = nullptr;
}
debug_printf("Allocating %dMB for recompiler instance data...\n", (sint32)(sizeof(PPCRecompilerInstanceData_t) / 1024 / 1024));
ppcRecompilerInstanceData = (PPCRecompilerInstanceData_t*)MemMapper::ReserveMemory(nullptr, sizeof(PPCRecompilerInstanceData_t), MemMapper::PAGE_PERMISSION::P_RW);
MemMapper::AllocateMemory(&(ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom), sizeof(PPCRecompilerInstanceData_t) - offsetof(PPCRecompilerInstanceData_t, _x64XMM_xorNegateMaskBottom), MemMapper::PAGE_PERMISSION::P_RW, true);
PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions();
uint32 codeRegionEnd = RPLLoader_GetMaxCodeOffset();
codeRegionEnd = (codeRegionEnd + PPC_REC_ALLOC_BLOCK_SIZE - 1) & ~(PPC_REC_ALLOC_BLOCK_SIZE - 1);
uint32 codeRegionSize = codeRegionEnd - PPC_REC_CODE_AREA_START;
forceLogDebug_printf("Allocating recompiler tables for range 0x%08x-0x%08x", PPC_REC_CODE_AREA_START, codeRegionEnd);
for (uint32 i = 0; i < codeRegionSize; i += PPC_REC_ALLOC_BLOCK_SIZE)
{
PPCRecompiler_reserveLookupTableBlock(i);
}
// init x64 recompiler instance data
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskBottom[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[0] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNegateMaskPair[1] = 1ULL << 63ULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[0] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_xorNOTMask[1] = 0xFFFFFFFFFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskBottom[1] = ~0ULL;
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[0] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andAbsMaskPair[1] = ~(1ULL << 63ULL);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[0] = ~(1 << 31);
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[1] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[2] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_andFloatAbsMaskBottom[3] = 0xFFFFFFFF;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[0] = 0xFFFFFFFFULL;
ppcRecompilerInstanceData->_x64XMM_singleWordMask[1] = 0ULL;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[0] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble1_1[1] = 1.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[0] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constDouble0_0[1] = 0.0;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[0] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat0_0[1] = 0.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[0] = 1.0f;
ppcRecompilerInstanceData->_x64XMM_constFloat1_1[1] = 1.0f;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[0] = 0x00800000;
*(uint32*)&ppcRecompilerInstanceData->_x64XMM_constFloatMin[1] = 0x00800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[0] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[1] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[2] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMask1[3] = 0x7F800000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[0] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[1] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[2] = ~0x80000000;
ppcRecompilerInstanceData->_x64XMM_flushDenormalMaskResetSignBits[3] = ~0x80000000;
// setup GQR scale tables
for (uint32 i = 0; i < 32; i++)
{
float a = 1.0f / (float)(1u << i);
float b = 0;
if (i == 0)
b = 4294967296.0f;
else
b = (float)(1u << (32u - i));
float ar = (float)(1u << i);
float br = 0;
if (i == 0)
br = 1.0f / 4294967296.0f;
else
br = 1.0f / (float)(1u << (32u - i));
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 0] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[i * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 0] = b;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 0] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[i * 2 + 1] = a;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 0] = b;
ppcRecompilerInstanceData->_psq_ld_scale_ps0_ps1[(i + 32) * 2 + 1] = b;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 0] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[i * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 0] = br;
ppcRecompilerInstanceData->_psq_st_scale_ps0_1[(i + 32) * 2 + 1] = 1.0f;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 0] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[i * 2 + 1] = ar;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 0] = br;
ppcRecompilerInstanceData->_psq_st_scale_ps0_ps1[(i + 32) * 2 + 1] = br;
}
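// Table layout note: entries are indexed by the unsigned 6-bit GQR scale field, so
// indices 32-63 correspond to negative (two's complement) scale values. For example,
// scale 1 dequantizes loads by 2^-1 = 0.5, while scale 63 (= -1) dequantizes by
// 2^(64-63) = 2.0; the store tables hold the reciprocal factors.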
// mxcsr
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOn = 0x1F80 | 0x8000;
ppcRecompilerInstanceData->_x64XMM_mxCsr_ftzOff = 0x1F80;
// query processor extensions
int cpuInfo[4];
__cpuid(cpuInfo, 0x80000001);
hasLZCNTSupport = ((cpuInfo[2] >> 5) & 1) != 0;
__cpuid(cpuInfo, 0x1);
hasMOVBESupport = ((cpuInfo[2] >> 22) & 1) != 0;
hasAVXSupport = ((cpuInfo[2] >> 28) & 1) != 0;
__cpuidex(cpuInfo, 0x7, 0);
hasBMI2Support = ((cpuInfo[1] >> 8) & 1) != 0;
forceLog_printf("Recompiler initialized. CPU extensions: %s%s%s", hasLZCNTSupport ? "LZCNT " : "", hasMOVBESupport ? "MOVBE " : "", hasAVXSupport ? "AVX " : "");
ppcRecompilerEnabled = true;
// launch recompilation thread
std::thread t_recompiler(PPCRecompiler_thread);
t_recompiler.detach();
}


@@ -0,0 +1,399 @@
#include <vector>
#define PPC_REC_CODE_AREA_START (0x00000000) // lower bound of executable memory area. Recompiler expects this address to be 0
#define PPC_REC_CODE_AREA_END (0x10000000) // upper bound of executable memory area
#define PPC_REC_CODE_AREA_SIZE (PPC_REC_CODE_AREA_END - PPC_REC_CODE_AREA_START)
#define PPC_REC_ALIGN_TO_4MB(__v) (((__v)+4*1024*1024-1)&~(4*1024*1024-1))
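// e.g. PPC_REC_ALIGN_TO_4MB(0x00123456) == 0x00400000 (rounds up to the next 4MB boundary)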
#define PPC_REC_MAX_VIRTUAL_GPR (40) // enough to store 32 GPRs + a few SPRs + temp registers (usually only 1-2)
typedef struct
{
uint32 ppcAddress;
uint32 ppcSize;
//void* x86Start;
//size_t x86Size;
void* storedRange;
}ppcRecRange_t;
typedef struct
{
uint32 ppcAddress;
uint32 ppcSize; // ppc code size of function
void* x86Code; // pointer to x86 code
size_t x86Size;
std::vector<ppcRecRange_t> list_ranges;
}PPCRecFunction_t;
#define PPCREC_IML_OP_FLAG_SIGNEXTEND (1<<0)
#define PPCREC_IML_OP_FLAG_SWITCHENDIAN (1<<1)
#define PPCREC_IML_OP_FLAG_NOT_EXPANDED (1<<2) // set on single-precision load instructions to indicate that the value should not be expanded to double-precision
#define PPCREC_IML_OP_FLAG_UNUSED (1<<7) // used to mark unused (dead) instructions
typedef struct
{
uint8 type;
uint8 operation;
uint8 crRegister; // set to 0xFF if unused; not all IML instruction types support cr.
uint8 crMode; // only used when crRegister is valid, used to differentiate between various forms of condition flag set/clear behavior
uint32 crIgnoreMask; // bit set for every respective CR bit that doesn't need to be updated
uint32 associatedPPCAddress; // ppc address that is associated with this instruction
union
{
struct
{
uint8 _padding[7];
}padding;
struct
{
// R (op) A [update cr* in mode *]
uint8 registerResult;
uint8 registerA;
}op_r_r;
struct
{
// R = A (op) B [update cr* in mode *]
uint8 registerResult;
uint8 registerA;
uint8 registerB;
}op_r_r_r;
struct
{
// R = A (op) immS32 [update cr* in mode *]
uint8 registerResult;
uint8 registerA;
sint32 immS32;
}op_r_r_s32;
struct
{
// R/F = NAME or NAME = R/F
uint8 registerIndex;
uint8 copyWidth;
uint32 name;
uint8 flags;
}op_r_name;
struct
{
// R (op) s32 [update cr* in mode *]
uint8 registerIndex;
sint32 immS32;
}op_r_immS32;
struct
{
uint32 address;
uint8 flags;
}op_jumpmark;
struct
{
uint32 param;
uint32 param2;
uint16 paramU16;
}op_macro;
struct
{
uint32 jumpmarkAddress;
bool jumpAccordingToSegment; //PPCRecImlSegment_t* destinationSegment; // if set, this replaces jumpmarkAddress
uint8 condition; // only used when crRegisterIndex is 8 or above (update: Apparently only used to mark jumps without a condition? -> Cleanup)
uint8 crRegisterIndex;
uint8 crBitIndex;
bool bitMustBeSet;
}op_conditionalJump;
struct
{
uint8 registerData;
uint8 registerMem;
uint8 registerMem2;
uint8 registerGQR;
uint8 copyWidth;
//uint8 flags;
struct
{
bool swapEndian : 1;
bool signExtend : 1;
bool notExpanded : 1; // for floats
}flags2;
uint8 mode; // transfer mode (copy width, ps0/ps1 behavior)
sint32 immS32;
}op_storeLoad;
struct
{
struct
{
uint8 registerMem;
sint32 immS32;
}src;
struct
{
uint8 registerMem;
sint32 immS32;
}dst;
uint8 copyWidth;
}op_mem2mem;
struct
{
uint8 registerResult;
uint8 registerOperand;
uint8 flags;
}op_fpr_r_r;
struct
{
uint8 registerResult;
uint8 registerOperandA;
uint8 registerOperandB;
uint8 flags;
}op_fpr_r_r_r;
struct
{
uint8 registerResult;
uint8 registerOperandA;
uint8 registerOperandB;
uint8 registerOperandC;
uint8 flags;
}op_fpr_r_r_r_r;
struct
{
uint8 registerResult;
//uint8 flags;
}op_fpr_r;
struct
{
uint32 ppcAddress;
uint32 x64Offset;
}op_ppcEnter;
struct
{
uint8 crD; // crBitIndex (result)
uint8 crA; // crBitIndex
uint8 crB; // crBitIndex
}op_cr;
// conditional operations (emitted if supported by target platform)
struct
{
// r_s32
uint8 registerIndex;
sint32 immS32;
// condition
uint8 crRegisterIndex;
uint8 crBitIndex;
bool bitMustBeSet;
}op_conditional_r_s32;
};
}PPCRecImlInstruction_t;
typedef struct _PPCRecImlSegment_t PPCRecImlSegment_t;
typedef struct _ppcRecompilerSegmentPoint_t
{
sint32 index;
PPCRecImlSegment_t* imlSegment;
_ppcRecompilerSegmentPoint_t* next;
_ppcRecompilerSegmentPoint_t* prev;
}ppcRecompilerSegmentPoint_t;
struct raLivenessLocation_t
{
sint32 index;
bool isRead;
bool isWrite;
raLivenessLocation_t() {};
raLivenessLocation_t(sint32 index, bool isRead, bool isWrite)
: index(index), isRead(isRead), isWrite(isWrite) {};
};
struct raLivenessSubrangeLink_t
{
struct raLivenessSubrange_t* prev;
struct raLivenessSubrange_t* next;
};
struct raLivenessSubrange_t
{
struct raLivenessRange_t* range;
PPCRecImlSegment_t* imlSegment;
ppcRecompilerSegmentPoint_t start;
ppcRecompilerSegmentPoint_t end;
// dirty state tracking
bool _noLoad;
bool hasStore;
bool hasStoreDelayed;
// next
raLivenessSubrange_t* subrangeBranchTaken;
raLivenessSubrange_t* subrangeBranchNotTaken;
// processing
uint32 lastIterationIndex;
// instruction locations
std::vector<raLivenessLocation_t> list_locations;
// linked list (subranges with same GPR virtual register)
raLivenessSubrangeLink_t link_sameVirtualRegisterGPR;
// linked list (all subranges for this segment)
raLivenessSubrangeLink_t link_segmentSubrangesGPR;
};
struct raLivenessRange_t
{
sint32 virtualRegister;
sint32 physicalRegister;
sint32 name;
std::vector<raLivenessSubrange_t*> list_subranges;
};
struct PPCSegmentRegisterAllocatorInfo_t
{
// analyzer stage
bool isPartOfProcessedLoop{}; // used during loop detection
sint32 lastIterationIndex{};
// linked lists
raLivenessSubrange_t* linkedList_allSubranges{};
raLivenessSubrange_t* linkedList_perVirtualGPR[PPC_REC_MAX_VIRTUAL_GPR]{};
};
struct PPCRecVGPRDistances_t
{
struct _RegArrayEntry
{
sint32 usageStart{};
sint32 usageEnd{};
}reg[PPC_REC_MAX_VIRTUAL_GPR];
bool isProcessed[PPC_REC_MAX_VIRTUAL_GPR]{};
};
typedef struct _PPCRecImlSegment_t
{
sint32 momentaryIndex{}; // index in segment list, generally not kept up to date except if needed (necessary for loop detection)
sint32 startOffset{}; // offset to first instruction in iml instruction list
sint32 count{}; // number of instructions in segment
uint32 ppcAddress{}; // ppc address (0xFFFFFFFF if not associated with an address)
uint32 x64Offset{}; // x64 code offset of segment start
uint32 cycleCount{}; // number of PPC cycles required to execute this segment (roughly)
// list of intermediate instructions in this segment
PPCRecImlInstruction_t* imlList{};
sint32 imlListSize{};
sint32 imlListCount{};
// segment link
_PPCRecImlSegment_t* nextSegmentBranchNotTaken{}; // this is also the default for segments where there is no branch
_PPCRecImlSegment_t* nextSegmentBranchTaken{};
bool nextSegmentIsUncertain{};
sint32 loopDepth{};
//sList_t* list_prevSegments;
std::vector<_PPCRecImlSegment_t*> list_prevSegments{};
// PPC range of segment
uint32 ppcAddrMin{};
uint32 ppcAddrMax{};
// enterable segments
bool isEnterable{}; // this segment can be entered from outside the recompiler (no preloaded registers necessary)
uint32 enterPPCAddress{}; // used if isEnterable is true
// jump destination segments
bool isJumpDestination{}; // segment is a destination for one or more (conditional) jumps
uint32 jumpDestinationPPCAddress{};
// PPC FPR use mask
bool ppcFPRUsed[32]{}; // tracks which PPC FPRs are read or written in this segment
// CR use mask
uint32 crBitsInput{}; // bits that are expected to be set from the previous segment (read in this segment but not overwritten)
uint32 crBitsRead{}; // all bits that are read in this segment
uint32 crBitsWritten{}; // bits that are written in this segment
// register allocator info
PPCSegmentRegisterAllocatorInfo_t raInfo{};
PPCRecVGPRDistances_t raDistances{};
bool raRangeExtendProcessed{};
// segment points
ppcRecompilerSegmentPoint_t* segmentPointList{};
}PPCRecImlSegment_t;
struct ppcImlGenContext_t
{
PPCRecFunction_t* functionRef;
uint32* currentInstruction;
uint32 ppcAddressOfCurrentInstruction;
// fpr mode
bool LSQE{ true };
bool PSE{ true };
// cycle counter
uint32 cyclesSinceLastBranch; // used to track ppc cycles
// temporary general purpose registers
uint32 mappedRegister[PPC_REC_MAX_VIRTUAL_GPR];
// temporary floating point registers (single and double precision)
uint32 mappedFPRRegister[256];
// list of intermediate instructions
PPCRecImlInstruction_t* imlList;
sint32 imlListSize;
sint32 imlListCount;
// list of segments
PPCRecImlSegment_t** segmentList;
sint32 segmentListSize;
sint32 segmentListCount;
// code generation control
bool hasFPUInstruction; // if true, PPCEnter macro will create FP_UNAVAIL checks -> Not needed in user mode
// register allocator info
struct
{
std::vector<raLivenessRange_t*> list_ranges;
}raInfo;
// analysis info
struct
{
bool modifiesGQR[8];
}tracking;
};
typedef void ATTR_MS_ABI (*PPCREC_JUMP_ENTRY)();
typedef struct
{
PPCRecFunction_t* ppcRecompilerFuncTable[PPC_REC_ALIGN_TO_4MB(PPC_REC_CODE_AREA_SIZE/4)]; // one virtual-function pointer for each potential ppc instruction
PPCREC_JUMP_ENTRY ppcRecompilerDirectJumpTable[PPC_REC_ALIGN_TO_4MB(PPC_REC_CODE_AREA_SIZE/4)]; // lookup table for ppc offset to native code function
// x64 data
uint64 __declspec(align(16)) _x64XMM_xorNegateMaskBottom[2];
uint64 __declspec(align(16)) _x64XMM_xorNegateMaskPair[2];
uint64 __declspec(align(16)) _x64XMM_xorNOTMask[2];
uint64 __declspec(align(16)) _x64XMM_andAbsMaskBottom[2];
uint64 __declspec(align(16)) _x64XMM_andAbsMaskPair[2];
uint32 __declspec(align(16)) _x64XMM_andFloatAbsMaskBottom[4];
uint64 __declspec(align(16)) _x64XMM_singleWordMask[2];
double __declspec(align(16)) _x64XMM_constDouble1_1[2];
double __declspec(align(16)) _x64XMM_constDouble0_0[2];
float __declspec(align(16)) _x64XMM_constFloat0_0[2];
float __declspec(align(16)) _x64XMM_constFloat1_1[2];
float __declspec(align(16)) _x64XMM_constFloatMin[2];
uint32 __declspec(align(16)) _x64XMM_flushDenormalMask1[4];
uint32 __declspec(align(16)) _x64XMM_flushDenormalMaskResetSignBits[4];
// PSQ load/store scale tables
double _psq_ld_scale_ps0_ps1[64 * 2];
double _psq_ld_scale_ps0_1[64 * 2];
double _psq_st_scale_ps0_ps1[64 * 2];
double _psq_st_scale_ps0_1[64 * 2];
// MXCSR
uint32 _x64XMM_mxCsr_ftzOn;
uint32 _x64XMM_mxCsr_ftzOff;
}PPCRecompilerInstanceData_t;
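// Index note: both lookup tables are indexed by (ppcAddress / 4), so resolving the
// native entry point for a PPC address is a single load, e.g.:
//   PPCREC_JUMP_ENTRY f = ppcRecompilerInstanceData->ppcRecompilerDirectJumpTable[addr / 4];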
extern __declspec(dllexport) PPCRecompilerInstanceData_t* ppcRecompilerInstanceData;
extern bool ppcRecompilerEnabled;
__declspec(dllexport) void PPCRecompiler_init();
void PPCRecompiler_allocateRange(uint32 startAddress, uint32 size);
DLLEXPORT void PPCRecompiler_invalidateRange(uint32 startAddr, uint32 endAddr);
extern void ATTR_MS_ABI (*PPCRecompiler_enterRecompilerCode)(uint64 codeMem, uint64 ppcInterpreterInstance);
extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_visited)();
extern void ATTR_MS_ABI (*PPCRecompiler_leaveRecompilerCode_unvisited)();
#define PPC_REC_INVALID_FUNCTION ((PPCRecFunction_t*)-1)
// CPUID
extern __declspec(dllexport) bool hasLZCNTSupport;
extern __declspec(dllexport) bool hasMOVBESupport;
extern __declspec(dllexport) bool hasBMI2Support;
extern __declspec(dllexport) bool hasAVXSupport;
// todo - move some of the stuff above into PPCRecompilerInternal.h
// recompiler interface
void PPCRecompiler_recompileIfUnvisited(uint32 enterAddress);
void PPCRecompiler_attemptEnter(struct PPCInterpreter_t* hCPU, uint32 enterAddress);
void PPCRecompiler_attemptEnterWithoutRecompile(struct PPCInterpreter_t* hCPU, uint32 enterAddress);


@@ -0,0 +1,422 @@
#define PPCREC_CR_REG_TEMP 8 // there are only 8 cr registers (0-7); we use index 8 as a temporary cr register that is never stored (e.g. for the BDNZ instruction)
enum
{
PPCREC_IML_OP_ASSIGN, // '=' operator
PPCREC_IML_OP_ENDIAN_SWAP, // '=' operator with 32bit endian swap
PPCREC_IML_OP_ADD, // '+' operator
PPCREC_IML_OP_SUB, // '-' operator
PPCREC_IML_OP_SUB_CARRY_UPDATE_CARRY, // complex operation, result = operand + ~operand2 + carry bit, updates carry bit
PPCREC_IML_OP_COMPARE_SIGNED, // arithmetic/signed comparison operator (updates cr)
PPCREC_IML_OP_COMPARE_UNSIGNED, // logical/unsigned comparison operator (updates cr)
PPCREC_IML_OP_MULTIPLY_SIGNED, // '*' operator (signed multiply)
PPCREC_IML_OP_MULTIPLY_HIGH_UNSIGNED, // unsigned 64bit multiply, store only high 32bit-word of result
PPCREC_IML_OP_MULTIPLY_HIGH_SIGNED, // signed 64bit multiply, store only high 32bit-word of result
PPCREC_IML_OP_DIVIDE_SIGNED, // '/' operator (signed divide)
PPCREC_IML_OP_DIVIDE_UNSIGNED, // '/' operator (unsigned divide)
PPCREC_IML_OP_ADD_CARRY, // complex operation, result = operand + carry bit, updates carry bit
PPCREC_IML_OP_ADD_CARRY_ME, // complex operation, result = operand + carry bit + (-1), updates carry bit
PPCREC_IML_OP_ADD_UPDATE_CARRY, // '+' operator but also updates carry flag
PPCREC_IML_OP_ADD_CARRY_UPDATE_CARRY, // '+' operator and also adds carry, updates carry flag
// assign operators with cast
PPCREC_IML_OP_ASSIGN_S16_TO_S32, // copy 16bit and sign extend
PPCREC_IML_OP_ASSIGN_S8_TO_S32, // copy 8bit and sign extend
// binary operation
PPCREC_IML_OP_OR, // '|' operator
PPCREC_IML_OP_ORC, // '|' operator, second operand is complemented first
PPCREC_IML_OP_AND, // '&' operator
PPCREC_IML_OP_XOR, // '^' operator
PPCREC_IML_OP_LEFT_ROTATE, // left rotate operator
PPCREC_IML_OP_LEFT_SHIFT, // shift left operator
PPCREC_IML_OP_RIGHT_SHIFT, // right shift operator (unsigned)
PPCREC_IML_OP_NOT, // complement each bit
PPCREC_IML_OP_NEG, // negate
// ppc
PPCREC_IML_OP_RLWIMI, // RLWIMI instruction (rotate, merge based on mask)
PPCREC_IML_OP_SRAW, // SRAWI/SRAW instruction (algebraic shift right, sets ca flag)
PPCREC_IML_OP_SLW, // SLW (shift based on register by up to 63 bits)
PPCREC_IML_OP_SRW, // SRW (shift based on register by up to 63 bits)
PPCREC_IML_OP_CNTLZW,
PPCREC_IML_OP_SUBFC, // SUBFC and SUBFIC (subtract from and set carry)
PPCREC_IML_OP_DCBZ, // clear 32 bytes aligned to 0x20
PPCREC_IML_OP_MFCR, // copy cr to gpr
PPCREC_IML_OP_MTCRF, // copy gpr to cr (with mask)
// condition register
PPCREC_IML_OP_CR_CLEAR, // clear cr bit
PPCREC_IML_OP_CR_SET, // set cr bit
PPCREC_IML_OP_CR_OR, // OR cr bits
PPCREC_IML_OP_CR_ORC, // OR cr bits, complement second input operand bit first
PPCREC_IML_OP_CR_AND, // AND cr bits
PPCREC_IML_OP_CR_ANDC, // AND cr bits, complement second input operand bit first
// FPU
PPCREC_IML_OP_FPR_ADD_BOTTOM,
PPCREC_IML_OP_FPR_ADD_PAIR,
PPCREC_IML_OP_FPR_SUB_PAIR,
PPCREC_IML_OP_FPR_SUB_BOTTOM,
PPCREC_IML_OP_FPR_MULTIPLY_BOTTOM,
PPCREC_IML_OP_FPR_MULTIPLY_PAIR,
PPCREC_IML_OP_FPR_DIVIDE_BOTTOM,
PPCREC_IML_OP_FPR_DIVIDE_PAIR,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM_AND_TOP,
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM_AND_TOP,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_BOTTOM,
PPCREC_IML_OP_FPR_COPY_BOTTOM_TO_TOP, // leave bottom of destination untouched
PPCREC_IML_OP_FPR_COPY_TOP_TO_TOP, // leave bottom of destination untouched
PPCREC_IML_OP_FPR_COPY_TOP_TO_BOTTOM, // leave top of destination untouched
PPCREC_IML_OP_FPR_COPY_BOTTOM_AND_TOP_SWAPPED,
PPCREC_IML_OP_FPR_EXPAND_BOTTOM32_TO_BOTTOM64_AND_TOP64, // expand bottom f32 to f64 in bottom and top half
PPCREC_IML_OP_FPR_BOTTOM_FRES_TO_BOTTOM_AND_TOP, // calculate reciprocal with Espresso accuracy of source bottom half and write result to destination bottom and top half
PPCREC_IML_OP_FPR_FCMPO_BOTTOM,
PPCREC_IML_OP_FPR_FCMPU_BOTTOM,
PPCREC_IML_OP_FPR_FCMPU_TOP,
PPCREC_IML_OP_FPR_NEGATE_BOTTOM,
PPCREC_IML_OP_FPR_NEGATE_PAIR,
PPCREC_IML_OP_FPR_ABS_BOTTOM, // abs(fp0)
PPCREC_IML_OP_FPR_ABS_PAIR,
PPCREC_IML_OP_FPR_FRES_PAIR, // 1.0/fp approx (Espresso accuracy)
PPCREC_IML_OP_FPR_FRSQRTE_PAIR, // 1.0/sqrt(fp) approx (Espresso accuracy)
PPCREC_IML_OP_FPR_NEGATIVE_ABS_BOTTOM, // -abs(fp0)
PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_BOTTOM, // round 64bit double to 64bit double with 32bit float precision (in bottom half of xmm register)
PPCREC_IML_OP_FPR_ROUND_TO_SINGLE_PRECISION_PAIR, // round two 64bit doubles to 64bit double with 32bit float precision
PPCREC_IML_OP_FPR_BOTTOM_RECIPROCAL_SQRT,
PPCREC_IML_OP_FPR_BOTTOM_FCTIWZ,
PPCREC_IML_OP_FPR_SELECT_BOTTOM, // selectively copy bottom value from operand B or C based on value in operand A
PPCREC_IML_OP_FPR_SELECT_PAIR, // selectively copy top/bottom from operand B or C based on value in top/bottom of operand A
// PS
PPCREC_IML_OP_FPR_SUM0,
PPCREC_IML_OP_FPR_SUM1,
};
#define PPCREC_IML_OP_FPR_COPY_PAIR (PPCREC_IML_OP_ASSIGN)
enum
{
PPCREC_IML_MACRO_BLR, // macro for BLR instruction code
PPCREC_IML_MACRO_BLRL, // macro for BLRL instruction code
PPCREC_IML_MACRO_BCTR, // macro for BCTR instruction code
PPCREC_IML_MACRO_BCTRL, // macro for BCTRL instruction code
PPCREC_IML_MACRO_BL, // call to different function (can be within same function)
PPCREC_IML_MACRO_B_FAR, // branch to different function
PPCREC_IML_MACRO_COUNT_CYCLES, // decrease current remaining thread cycles by a certain amount
PPCREC_IML_MACRO_HLE, // HLE function call
PPCREC_IML_MACRO_MFTB, // get TB register value (low or high)
PPCREC_IML_MACRO_LEAVE, // leaves recompiler and switches to interpreter
// debugging
PPCREC_IML_MACRO_DEBUGBREAK, // throws a debugbreak
};
enum
{
PPCREC_JUMP_CONDITION_NONE,
PPCREC_JUMP_CONDITION_E, // equal / zero
PPCREC_JUMP_CONDITION_NE, // not equal / not zero
PPCREC_JUMP_CONDITION_LE, // less or equal
PPCREC_JUMP_CONDITION_L, // less
PPCREC_JUMP_CONDITION_GE, // greater or equal
PPCREC_JUMP_CONDITION_G, // greater
// special case:
PPCREC_JUMP_CONDITION_SUMMARYOVERFLOW, // needs special handling
PPCREC_JUMP_CONDITION_NSUMMARYOVERFLOW, // not summaryoverflow
};
enum
{
PPCREC_CR_MODE_COMPARE_SIGNED,
PPCREC_CR_MODE_COMPARE_UNSIGNED, // alias logic compare
// others: PPCREC_CR_MODE_ARITHMETIC,
PPCREC_CR_MODE_ARITHMETIC, // arithmetic use (for use with add/sub instructions without generating extra code)
PPCREC_CR_MODE_LOGICAL,
};
enum
{
PPCREC_IML_TYPE_NONE,
PPCREC_IML_TYPE_NO_OP, // no-op instruction
PPCREC_IML_TYPE_JUMPMARK, // possible jump destination (generated before each ppc instruction)
PPCREC_IML_TYPE_R_R, // r* (op) *r
PPCREC_IML_TYPE_R_R_R, // r* = r* (op) r*
PPCREC_IML_TYPE_R_R_S32, // r* = r* (op) s32*
PPCREC_IML_TYPE_LOAD, // r* = [r*+s32*]
PPCREC_IML_TYPE_LOAD_INDEXED, // r* = [r*+r*]
PPCREC_IML_TYPE_STORE, // [r*+s32*] = r*
PPCREC_IML_TYPE_STORE_INDEXED, // [r*+r*] = r*
PPCREC_IML_TYPE_R_NAME, // r* = name
PPCREC_IML_TYPE_NAME_R, // name* = r*
PPCREC_IML_TYPE_R_S32, // r* (op) imm
PPCREC_IML_TYPE_MACRO,
PPCREC_IML_TYPE_CJUMP, // conditional jump
PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK, // jumps only if remaining thread cycles >= 0
PPCREC_IML_TYPE_PPC_ENTER, // used to mark locations that should be written to recompilerCallTable
PPCREC_IML_TYPE_CR, // condition register specific operations (one or more operands)
// conditional
PPCREC_IML_TYPE_CONDITIONAL_R_S32,
// FPR
PPCREC_IML_TYPE_FPR_R_NAME, // name = f*
PPCREC_IML_TYPE_FPR_NAME_R, // f* = name
PPCREC_IML_TYPE_FPR_LOAD, // r* = (bitdepth) [r*+s32*] (single or paired single mode)
PPCREC_IML_TYPE_FPR_LOAD_INDEXED, // r* = (bitdepth) [r*+r*] (single or paired single mode)
PPCREC_IML_TYPE_FPR_STORE, // (bitdepth) [r*+s32*] = r* (single or paired single mode)
PPCREC_IML_TYPE_FPR_STORE_INDEXED, // (bitdepth) [r*+r*] = r* (single or paired single mode)
PPCREC_IML_TYPE_FPR_R_R,
PPCREC_IML_TYPE_FPR_R_R_R,
PPCREC_IML_TYPE_FPR_R_R_R_R,
PPCREC_IML_TYPE_FPR_R,
// special
PPCREC_IML_TYPE_MEM2MEM, // memory to memory copy (deprecated)
};
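// Illustrative lowering example (virtual register assignments are assumed): the PPC
// instruction "addi r3, r4, 0x20" would become a PPCREC_IML_TYPE_R_R_S32 instruction
// with operation PPCREC_IML_OP_ADD, registerResult mapped from r3, registerA mapped
// from r4, immS32 = 0x20 and crRegister = PPC_REC_INVALID_REGISTER (no CR update).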
enum
{
PPCREC_NAME_NONE,
PPCREC_NAME_TEMPORARY,
PPCREC_NAME_R0 = 1000,
PPCREC_NAME_SPR0 = 2000,
PPCREC_NAME_FPR0 = 3000,
PPCREC_NAME_TEMPORARY_FPR0 = 4000, // 0 to 7
//PPCREC_NAME_CR0 = 3000, // value mapped condition register (usually it isn't needed and can be optimized away)
};
// special cases for LOAD/STORE
#define PPC_REC_LOAD_LWARX_MARKER (100) // lwarx instruction (similar to LWZX but sets reserved address/value)
#define PPC_REC_STORE_STWCX_MARKER (100) // stwcx instruction (similar to STWX but writes only if reservation from LWARX is valid)
#define PPC_REC_STORE_STSWI_1 (200) // stswi nb = 1
#define PPC_REC_STORE_STSWI_2 (201) // stswi nb = 2
#define PPC_REC_STORE_STSWI_3 (202) // stswi nb = 3
#define PPC_REC_STORE_LSWI_1 (200) // lswi nb = 1
#define PPC_REC_STORE_LSWI_2 (201) // lswi nb = 2
#define PPC_REC_STORE_LSWI_3 (202) // lswi nb = 3
#define PPC_REC_INVALID_REGISTER 0xFF
#define PPCREC_CR_BIT_LT 0
#define PPCREC_CR_BIT_GT 1
#define PPCREC_CR_BIT_EQ 2
#define PPCREC_CR_BIT_SO 3
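// Assumed flat bit numbering derived from the 4-bits-per-CR-field layout above: a CR
// bit is addressed as crRegisterIndex * 4 + bit, e.g. cr0.eq = 0*4 + PPCREC_CR_BIT_EQ = 2
// and cr7.so = 7*4 + PPCREC_CR_BIT_SO = 31.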
enum
{
// fpr load
PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0,
PPCREC_FPR_LD_MODE_SINGLE_INTO_PS0_PS1,
PPCREC_FPR_LD_MODE_DOUBLE_INTO_PS0,
PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0,
PPCREC_FPR_LD_MODE_PSQ_GENERIC_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0,
PPCREC_FPR_LD_MODE_PSQ_FLOAT_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_S16_PS0,
PPCREC_FPR_LD_MODE_PSQ_S16_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_U16_PS0,
PPCREC_FPR_LD_MODE_PSQ_U16_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_S8_PS0,
PPCREC_FPR_LD_MODE_PSQ_S8_PS0_PS1,
PPCREC_FPR_LD_MODE_PSQ_U8_PS0,
PPCREC_FPR_LD_MODE_PSQ_U8_PS0_PS1,
// fpr store
PPCREC_FPR_ST_MODE_SINGLE_FROM_PS0, // store 1 single precision float from ps0
PPCREC_FPR_ST_MODE_DOUBLE_FROM_PS0, // store 1 double precision float from ps0
PPCREC_FPR_ST_MODE_UI32_FROM_PS0, // store raw low-32bit of PS0
PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_GENERIC_PS0,
PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_FLOAT_PS0,
PPCREC_FPR_ST_MODE_PSQ_S8_PS0,
PPCREC_FPR_ST_MODE_PSQ_S8_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_U8_PS0,
PPCREC_FPR_ST_MODE_PSQ_U8_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_U16_PS0,
PPCREC_FPR_ST_MODE_PSQ_U16_PS0_PS1,
PPCREC_FPR_ST_MODE_PSQ_S16_PS0,
PPCREC_FPR_ST_MODE_PSQ_S16_PS0_PS1,
};
bool PPCRecompiler_generateIntermediateCode(ppcImlGenContext_t& ppcImlGenContext, PPCRecFunction_t* PPCRecFunction, std::set<uint32>& entryAddresses);
void PPCRecompiler_freeContext(ppcImlGenContext_t* ppcImlGenContext); // todo - move to destructor
PPCRecImlInstruction_t* PPCRecompilerImlGen_generateNewEmptyInstruction(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_pushBackIMLInstructions(PPCRecImlSegment_t* imlSegment, sint32 index, sint32 shiftBackCount);
PPCRecImlInstruction_t* PPCRecompiler_insertInstruction(PPCRecImlSegment_t* imlSegment, sint32 index);
void PPCRecompilerIml_insertSegments(ppcImlGenContext_t* ppcImlGenContext, sint32 index, sint32 count);
void PPCRecompilerIml_setSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint, PPCRecImlSegment_t* imlSegment, sint32 index);
void PPCRecompilerIml_removeSegmentPoint(ppcRecompilerSegmentPoint_t* segmentPoint);
// GPR register management
uint32 PPCRecompilerImlGen_loadRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false);
uint32 PPCRecompilerImlGen_loadOverwriteRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName);
// FPR register management
uint32 PPCRecompilerImlGen_loadFPRRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName, bool loadNew = false);
uint32 PPCRecompilerImlGen_loadOverwriteFPRRegister(ppcImlGenContext_t* ppcImlGenContext, uint32 mappedName);
// IML instruction generation
void PPCRecompilerImlGen_generateNewInstruction_jump(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction, uint32 jumpmarkAddress);
void PPCRecompilerImlGen_generateNewInstruction_jumpSegment(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerImlGen_generateNewInstruction_r_s32(ppcImlGenContext_t* ppcImlGenContext, uint32 operation, uint8 registerIndex, sint32 immS32, uint32 copyWidth, bool signExtend, bool bigEndian, uint8 crRegister, uint32 crMode);
void PPCRecompilerImlGen_generateNewInstruction_conditional_r_s32(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction, uint32 operation, uint8 registerIndex, sint32 immS32, uint32 crRegisterIndex, uint32 crBitIndex, bool bitMustBeSet);
void PPCRecompilerImlGen_generateNewInstruction_r_r(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction, uint32 operation, uint8 registerResult, uint8 registerA, uint8 crRegister = PPC_REC_INVALID_REGISTER, uint8 crMode = 0);
// IML instruction generation (new style, can generate new instructions but also overwrite existing ones)
void PPCRecompilerImlGen_generateNewInstruction_noOp(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerImlGen_generateNewInstruction_memory_memory(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction, uint8 srcMemReg, sint32 srcImmS32, uint8 dstMemReg, sint32 dstImmS32, uint8 copyWidth);
void PPCRecompilerImlGen_generateNewInstruction_fpr_r(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction, sint32 operation, uint8 registerResult, sint32 crRegister = PPC_REC_INVALID_REGISTER);
// IML generation - FPU
bool PPCRecompilerImlGen_LFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_LFDUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFSU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFSX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFSUX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFIWX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFDU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_STFDX(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMUL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FDIV(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FNMSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMULS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FDIVS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FADDS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FSUBS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMADDS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMSUBS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FNMSUBS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FCMPO(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FCMPU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FMR(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FABS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FNABS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FRES(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FRSP(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FNEG(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FSEL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FRSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_FCTIWZ(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_L(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_LU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_ST(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PSQ_STU(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MULS0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MULS1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MADDS0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MADDS1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_ADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_SUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MUL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_DIV(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_NMADD(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_NMSUB(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_SUM0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_SUM1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_NEG(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_ABS(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_RES(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_RSQRTE(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MR(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_SEL(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MERGE00(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MERGE01(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MERGE10(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_MERGE11(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_CMPO0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_CMPU0(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
bool PPCRecompilerImlGen_PS_CMPU1(ppcImlGenContext_t* ppcImlGenContext, uint32 opcode);
// IML general
bool PPCRecompiler_isSuffixInstruction(PPCRecImlInstruction_t* iml);
void PPCRecompilerIML_linkSegments(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompilerIml_setLinkBranchNotTaken(PPCRecImlSegment_t* imlSegmentSrc, PPCRecImlSegment_t* imlSegmentDst);
void PPCRecompilerIml_setLinkBranchTaken(PPCRecImlSegment_t* imlSegmentSrc, PPCRecImlSegment_t* imlSegmentDst);
void PPCRecompilerIML_relinkInputSegment(PPCRecImlSegment_t* imlSegmentOrig, PPCRecImlSegment_t* imlSegmentNew);
void PPCRecompilerIML_removeLink(PPCRecImlSegment_t* imlSegmentSrc, PPCRecImlSegment_t* imlSegmentDst);
void PPCRecompilerIML_isolateEnterableSegments(ppcImlGenContext_t* ppcImlGenContext);
PPCRecImlInstruction_t* PPCRecompilerIML_getLastInstruction(PPCRecImlSegment_t* imlSegment);
// IML analyzer
typedef struct
{
uint32 readCRBits;
uint32 writtenCRBits;
}PPCRecCRTracking_t;
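// each mask covers the 32 CR bits as 8 fields x 4 bits (LT, GT, EQ, SO), i.e. bit index = crField * 4 + crBit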
bool PPCRecompilerImlAnalyzer_isTightFiniteLoop(PPCRecImlSegment_t* imlSegment);
bool PPCRecompilerImlAnalyzer_canTypeWriteCR(PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerImlAnalyzer_getCRTracking(PPCRecImlInstruction_t* imlInstruction, PPCRecCRTracking_t* crTracking);
// IML optimizer
bool PPCRecompiler_reduceNumberOfFPRRegisters(ppcImlGenContext_t* ppcImlGenContext);
bool PPCRecompiler_manageFPRRegisters(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_removeRedundantCRUpdates(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectFloatCopies(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizeDirectIntegerCopies(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompiler_optimizePSQLoadAndStore(ppcImlGenContext_t* ppcImlGenContext);
// IML register allocator
void PPCRecompilerImm_allocateRegisters(ppcImlGenContext_t* ppcImlGenContext);
// late optimizations
void PPCRecompiler_reorderConditionModifyInstructions(ppcImlGenContext_t* ppcImlGenContext);
// debug
void PPCRecompiler_dumpIMLSegment(PPCRecImlSegment_t* imlSegment, sint32 segmentIndex, bool printLivenessRangeInfo = false);
typedef struct
{
union
{
struct
{
sint16 readNamedReg1;
sint16 readNamedReg2;
sint16 readNamedReg3;
sint16 writtenNamedReg1;
};
sint16 gpr[4]; // 3 read + 1 write
};
// FPR
union
{
struct
{
// note: If destination operand is not fully written, it will be added as a read FPR as well
sint16 readFPR1;
sint16 readFPR2;
sint16 readFPR3;
sint16 readFPR4; // usually this is set to the result FPR if only partially overwritten
sint16 writtenFPR1;
};
sint16 fpr[4];
};
}PPCImlOptimizerUsedRegisters_t;
void PPCRecompiler_checkRegisterUsage(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlInstruction_t* imlInstruction, PPCImlOptimizerUsedRegisters_t* registersUsed);

View file

@ -0,0 +1,137 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
#include "util/helpers/fixedSizeList.h"
#include "Cafe/HW/Espresso/Interpreter/PPCInterpreterInternal.h"
/*
 * Analyzes a single segment and returns true if it is a tight finite loop
*/
bool PPCRecompilerImlAnalyzer_isTightFiniteLoop(PPCRecImlSegment_t* imlSegment)
{
bool isTightFiniteLoop = false;
// base criterion: must jump back to the beginning of the same segment
if (imlSegment->nextSegmentBranchTaken != imlSegment)
return false;
// loops using BDNZ are assumed to always be finite
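// (BDNZ decrements CTR and branches while it is non-zero, so the trip count is bounded by the initial CTR value)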
for (sint32 t = 0; t < imlSegment->imlListCount; t++)
{
if (imlSegment->imlList[t].type == PPCREC_IML_TYPE_R_S32 && imlSegment->imlList[t].operation == PPCREC_IML_OP_SUB && imlSegment->imlList[t].crRegister == 8)
{
return true;
}
}
// for non-BDNZ loops, check for common patterns
// risky heuristic: look for ADD/SUB operations and assume that eventual overflow/underflow makes the loop finite (does not include r_r_s32 ADD/SUB)
// this catches most loops with load-update and store-update instructions, but also those with decrementing counters
FixedSizeList<sint32, 64, true> list_modifiedRegisters;
for (sint32 t = 0; t < imlSegment->imlListCount; t++)
{
if (imlSegment->imlList[t].type == PPCREC_IML_TYPE_R_S32 && (imlSegment->imlList[t].operation == PPCREC_IML_OP_ADD || imlSegment->imlList[t].operation == PPCREC_IML_OP_SUB) )
{
list_modifiedRegisters.addUnique(imlSegment->imlList[t].op_r_immS32.registerIndex);
}
}
if (list_modifiedRegisters.count > 0)
{
// remove all registers from the list that are modified by non-ADD/SUB instructions
// todo: We should also cover the case where ADD+SUB on the same register cancel the effect out
PPCImlOptimizerUsedRegisters_t registersUsed;
for (sint32 t = 0; t < imlSegment->imlListCount; t++)
{
if (imlSegment->imlList[t].type == PPCREC_IML_TYPE_R_S32 && (imlSegment->imlList[t].operation == PPCREC_IML_OP_ADD || imlSegment->imlList[t].operation == PPCREC_IML_OP_SUB))
continue;
PPCRecompiler_checkRegisterUsage(NULL, imlSegment->imlList + t, &registersUsed);
if(registersUsed.writtenNamedReg1 < 0)
continue;
list_modifiedRegisters.remove(registersUsed.writtenNamedReg1);
}
if (list_modifiedRegisters.count > 0)
{
return true;
}
}
return false;
}
/*
* Returns true if the imlInstruction can overwrite CR (depending on value of ->crRegister)
*/
bool PPCRecompilerImlAnalyzer_canTypeWriteCR(PPCRecImlInstruction_t* imlInstruction)
{
if (imlInstruction->type == PPCREC_IML_TYPE_R_R)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_R_R_R)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_R_R_S32)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_R_S32)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R_R)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R_R_R)
return true;
if (imlInstruction->type == PPCREC_IML_TYPE_FPR_R)
return true;
return false;
}
void PPCRecompilerImlAnalyzer_getCRTracking(PPCRecImlInstruction_t* imlInstruction, PPCRecCRTracking_t* crTracking)
{
crTracking->readCRBits = 0;
crTracking->writtenCRBits = 0;
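// note: CR bits are indexed as crRegisterIndex * 4 + crBitIndex within the 32-bit masks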
if (imlInstruction->type == PPCREC_IML_TYPE_CJUMP)
{
if (imlInstruction->op_conditionalJump.condition != PPCREC_JUMP_CONDITION_NONE)
{
uint32 crBitFlag = 1 << (imlInstruction->op_conditionalJump.crRegisterIndex * 4 + imlInstruction->op_conditionalJump.crBitIndex);
crTracking->readCRBits = (crBitFlag);
}
}
else if (imlInstruction->type == PPCREC_IML_TYPE_CONDITIONAL_R_S32)
{
uint32 crBitFlag = 1 << (imlInstruction->op_conditional_r_s32.crRegisterIndex * 4 + imlInstruction->op_conditional_r_s32.crBitIndex);
crTracking->readCRBits = crBitFlag;
}
else if (imlInstruction->type == PPCREC_IML_TYPE_R_S32 && imlInstruction->operation == PPCREC_IML_OP_MFCR)
{
crTracking->readCRBits = 0xFFFFFFFF;
}
else if (imlInstruction->type == PPCREC_IML_TYPE_R_S32 && imlInstruction->operation == PPCREC_IML_OP_MTCRF)
{
crTracking->writtenCRBits |= ppc_MTCRFMaskToCRBitMask((uint32)imlInstruction->op_r_immS32.immS32);
}
else if (imlInstruction->type == PPCREC_IML_TYPE_CR)
{
if (imlInstruction->operation == PPCREC_IML_OP_CR_CLEAR ||
imlInstruction->operation == PPCREC_IML_OP_CR_SET)
{
uint32 crBitFlag = 1 << (imlInstruction->op_cr.crD);
crTracking->writtenCRBits = crBitFlag;
}
else if (imlInstruction->operation == PPCREC_IML_OP_CR_OR ||
imlInstruction->operation == PPCREC_IML_OP_CR_ORC ||
imlInstruction->operation == PPCREC_IML_OP_CR_AND ||
imlInstruction->operation == PPCREC_IML_OP_CR_ANDC)
{
uint32 crBitFlag = 1 << (imlInstruction->op_cr.crD);
crTracking->writtenCRBits = crBitFlag;
crBitFlag = 1 << (imlInstruction->op_cr.crA);
crTracking->readCRBits = crBitFlag;
crBitFlag = 1 << (imlInstruction->op_cr.crB);
crTracking->readCRBits |= crBitFlag;
}
else
assert_dbg();
}
else if (PPCRecompilerImlAnalyzer_canTypeWriteCR(imlInstruction) && imlInstruction->crRegister >= 0 && imlInstruction->crRegister <= 7)
{
crTracking->writtenCRBits |= (0xF << (imlInstruction->crRegister * 4));
}
else if ((imlInstruction->type == PPCREC_IML_TYPE_STORE || imlInstruction->type == PPCREC_IML_TYPE_STORE_INDEXED) && imlInstruction->op_storeLoad.copyWidth == PPC_REC_STORE_STWCX_MARKER)
{
// overwrites CR0
crTracking->writtenCRBits |= (0xF << 0);
}
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,399 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h"
#include "PPCRecompilerImlRanges.h"
#include "util/helpers/MemoryPool.h"
void PPCRecRARange_addLink_perVirtualGPR(raLivenessSubrange_t** root, raLivenessSubrange_t* subrange)
{
#ifndef PUBLIC_RELEASE
if ((*root) && (*root)->range->virtualRegister != subrange->range->virtualRegister)
assert_dbg();
#endif
subrange->link_sameVirtualRegisterGPR.next = *root;
if (*root)
(*root)->link_sameVirtualRegisterGPR.prev = subrange;
subrange->link_sameVirtualRegisterGPR.prev = nullptr;
*root = subrange;
}
void PPCRecRARange_addLink_allSubrangesGPR(raLivenessSubrange_t** root, raLivenessSubrange_t* subrange)
{
subrange->link_segmentSubrangesGPR.next = *root;
if (*root)
(*root)->link_segmentSubrangesGPR.prev = subrange;
subrange->link_segmentSubrangesGPR.prev = nullptr;
*root = subrange;
}
void PPCRecRARange_removeLink_perVirtualGPR(raLivenessSubrange_t** root, raLivenessSubrange_t* subrange)
{
raLivenessSubrange_t* tempPrev = subrange->link_sameVirtualRegisterGPR.prev;
if (subrange->link_sameVirtualRegisterGPR.prev)
subrange->link_sameVirtualRegisterGPR.prev->link_sameVirtualRegisterGPR.next = subrange->link_sameVirtualRegisterGPR.next;
else
(*root) = subrange->link_sameVirtualRegisterGPR.next;
if (subrange->link_sameVirtualRegisterGPR.next)
subrange->link_sameVirtualRegisterGPR.next->link_sameVirtualRegisterGPR.prev = tempPrev;
#ifndef PUBLIC_RELEASE
subrange->link_sameVirtualRegisterGPR.prev = (raLivenessSubrange_t*)1;
subrange->link_sameVirtualRegisterGPR.next = (raLivenessSubrange_t*)1;
#endif
}
void PPCRecRARange_removeLink_allSubrangesGPR(raLivenessSubrange_t** root, raLivenessSubrange_t* subrange)
{
raLivenessSubrange_t* tempPrev = subrange->link_segmentSubrangesGPR.prev;
if (subrange->link_segmentSubrangesGPR.prev)
subrange->link_segmentSubrangesGPR.prev->link_segmentSubrangesGPR.next = subrange->link_segmentSubrangesGPR.next;
else
(*root) = subrange->link_segmentSubrangesGPR.next;
if (subrange->link_segmentSubrangesGPR.next)
subrange->link_segmentSubrangesGPR.next->link_segmentSubrangesGPR.prev = tempPrev;
#ifndef PUBLIC_RELEASE
subrange->link_segmentSubrangesGPR.prev = (raLivenessSubrange_t*)1;
subrange->link_segmentSubrangesGPR.next = (raLivenessSubrange_t*)1;
#endif
}
MemoryPoolPermanentObjects<raLivenessRange_t> memPool_livenessRange(4096);
MemoryPoolPermanentObjects<raLivenessSubrange_t> memPool_livenessSubrange(4096);
raLivenessRange_t* PPCRecRA_createRangeBase(ppcImlGenContext_t* ppcImlGenContext, uint32 virtualRegister, uint32 name)
{
raLivenessRange_t* livenessRange = memPool_livenessRange.acquireObj();
livenessRange->list_subranges.resize(0);
livenessRange->virtualRegister = virtualRegister;
livenessRange->name = name;
livenessRange->physicalRegister = -1;
ppcImlGenContext->raInfo.list_ranges.push_back(livenessRange);
return livenessRange;
}
raLivenessSubrange_t* PPCRecRA_createSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range, PPCRecImlSegment_t* imlSegment, sint32 startIndex, sint32 endIndex)
{
raLivenessSubrange_t* livenessSubrange = memPool_livenessSubrange.acquireObj();
livenessSubrange->list_locations.resize(0);
livenessSubrange->range = range;
livenessSubrange->imlSegment = imlSegment;
PPCRecompilerIml_setSegmentPoint(&livenessSubrange->start, imlSegment, startIndex);
PPCRecompilerIml_setSegmentPoint(&livenessSubrange->end, imlSegment, endIndex);
// default values
livenessSubrange->hasStore = false;
livenessSubrange->hasStoreDelayed = false;
livenessSubrange->lastIterationIndex = 0;
livenessSubrange->subrangeBranchNotTaken = nullptr;
livenessSubrange->subrangeBranchTaken = nullptr;
livenessSubrange->_noLoad = false;
// add to range
range->list_subranges.push_back(livenessSubrange);
// add to segment
PPCRecRARange_addLink_perVirtualGPR(&(imlSegment->raInfo.linkedList_perVirtualGPR[range->virtualRegister]), livenessSubrange);
PPCRecRARange_addLink_allSubrangesGPR(&imlSegment->raInfo.linkedList_allSubranges, livenessSubrange);
return livenessSubrange;
}
void _unlinkSubrange(raLivenessSubrange_t* subrange)
{
PPCRecImlSegment_t* imlSegment = subrange->imlSegment;
PPCRecRARange_removeLink_perVirtualGPR(&imlSegment->raInfo.linkedList_perVirtualGPR[subrange->range->virtualRegister], subrange);
PPCRecRARange_removeLink_allSubrangesGPR(&imlSegment->raInfo.linkedList_allSubranges, subrange);
}
void PPCRecRA_deleteSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange)
{
_unlinkSubrange(subrange);
subrange->range->list_subranges.erase(std::find(subrange->range->list_subranges.begin(), subrange->range->list_subranges.end(), subrange));
subrange->list_locations.clear();
PPCRecompilerIml_removeSegmentPoint(&subrange->start);
PPCRecompilerIml_removeSegmentPoint(&subrange->end);
memPool_livenessSubrange.releaseObj(subrange);
}
void _PPCRecRA_deleteSubrangeNoUnlinkFromRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange)
{
_unlinkSubrange(subrange);
PPCRecompilerIml_removeSegmentPoint(&subrange->start);
PPCRecompilerIml_removeSegmentPoint(&subrange->end);
memPool_livenessSubrange.releaseObj(subrange);
}
void PPCRecRA_deleteRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range)
{
for (auto& subrange : range->list_subranges)
{
_PPCRecRA_deleteSubrangeNoUnlinkFromRange(ppcImlGenContext, subrange);
}
ppcImlGenContext->raInfo.list_ranges.erase(std::find(ppcImlGenContext->raInfo.list_ranges.begin(), ppcImlGenContext->raInfo.list_ranges.end(), range));
memPool_livenessRange.releaseObj(range);
}
void PPCRecRA_deleteRangeNoUnlink(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range)
{
for (auto& subrange : range->list_subranges)
{
_PPCRecRA_deleteSubrangeNoUnlinkFromRange(ppcImlGenContext, subrange);
}
memPool_livenessRange.releaseObj(range);
}
void PPCRecRA_deleteAllRanges(ppcImlGenContext_t* ppcImlGenContext)
{
for(auto& range : ppcImlGenContext->raInfo.list_ranges)
{
PPCRecRA_deleteRangeNoUnlink(ppcImlGenContext, range);
}
ppcImlGenContext->raInfo.list_ranges.clear();
}
void PPCRecRA_mergeRanges(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range, raLivenessRange_t* absorbedRange)
{
cemu_assert_debug(range != absorbedRange);
cemu_assert_debug(range->virtualRegister == absorbedRange->virtualRegister);
// move all subranges from absorbedRange to range
for (auto& subrange : absorbedRange->list_subranges)
{
range->list_subranges.push_back(subrange);
subrange->range = range;
}
absorbedRange->list_subranges.clear();
PPCRecRA_deleteRange(ppcImlGenContext, absorbedRange);
}
void PPCRecRA_mergeSubranges(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange, raLivenessSubrange_t* absorbedSubrange)
{
#ifndef PUBLIC_RELEASE
PPCRecRA_debugValidateSubrange(subrange);
PPCRecRA_debugValidateSubrange(absorbedSubrange);
if (subrange->imlSegment != absorbedSubrange->imlSegment)
assert_dbg();
if (subrange->end.index > absorbedSubrange->start.index)
assert_dbg();
if (subrange->subrangeBranchTaken || subrange->subrangeBranchNotTaken)
assert_dbg();
if (subrange == absorbedSubrange)
assert_dbg();
#endif
subrange->subrangeBranchTaken = absorbedSubrange->subrangeBranchTaken;
subrange->subrangeBranchNotTaken = absorbedSubrange->subrangeBranchNotTaken;
// merge usage locations
for (auto& location : absorbedSubrange->list_locations)
{
subrange->list_locations.push_back(location);
}
absorbedSubrange->list_locations.clear();
subrange->end.index = absorbedSubrange->end.index;
PPCRecRA_debugValidateSubrange(subrange);
PPCRecRA_deleteSubrange(ppcImlGenContext, absorbedSubrange);
}
// remove all inter-segment connections from the range and split it into local ranges (also removes empty ranges)
void PPCRecRA_explodeRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range)
{
if (range->list_subranges.size() == 1)
assert_dbg();
for (auto& subrange : range->list_subranges)
{
if (subrange->list_locations.empty())
continue;
raLivenessRange_t* newRange = PPCRecRA_createRangeBase(ppcImlGenContext, range->virtualRegister, range->name);
raLivenessSubrange_t* newSubrange = PPCRecRA_createSubrange(ppcImlGenContext, newRange, subrange->imlSegment, subrange->list_locations.data()[0].index, subrange->list_locations.data()[subrange->list_locations.size() - 1].index + 1);
// copy locations
for (auto& location : subrange->list_locations)
{
newSubrange->list_locations.push_back(location);
}
}
// remove original range
PPCRecRA_deleteRange(ppcImlGenContext, range);
}
#ifndef PUBLIC_RELEASE
void PPCRecRA_debugValidateSubrange(raLivenessSubrange_t* subrange)
{
// validate subrange
if (subrange->subrangeBranchTaken && subrange->subrangeBranchTaken->imlSegment != subrange->imlSegment->nextSegmentBranchTaken)
assert_dbg();
if (subrange->subrangeBranchNotTaken && subrange->subrangeBranchNotTaken->imlSegment != subrange->imlSegment->nextSegmentBranchNotTaken)
assert_dbg();
}
#else
void PPCRecRA_debugValidateSubrange(raLivenessSubrange_t* subrange) {}
#endif
// split subrange at the given index
// After the split there will be two ranges/subranges:
// head -> subrange is shortened to end at splitIndex
// tail -> a new subrange that reaches from splitIndex to the end of the original subrange
// if head has a physical register assigned it will not carry over to tail
// The return value is the tail subrange
// If trimToHole is true, the end of the head subrange and the start of the tail subrange will be moved to fit the locations
// Ranges that begin at RA_INTER_RANGE_START are allowed and can be split
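// Illustration of the documented behavior (splitIndex = 5):
//   original: [2 ................... 10)
//   head:     [2 ..... 5)
//   tail:               [5 .......... 10)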
raLivenessSubrange_t* PPCRecRA_splitLocalSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange, sint32 splitIndex, bool trimToHole)
{
// validation
#ifndef PUBLIC_RELEASE
if (subrange->end.index == RA_INTER_RANGE_END || subrange->end.index == RA_INTER_RANGE_START)
assert_dbg();
if (subrange->start.index >= splitIndex)
assert_dbg();
if (subrange->end.index <= splitIndex)
assert_dbg();
#endif
// create tail
raLivenessRange_t* tailRange = PPCRecRA_createRangeBase(ppcImlGenContext, subrange->range->virtualRegister, subrange->range->name);
raLivenessSubrange_t* tailSubrange = PPCRecRA_createSubrange(ppcImlGenContext, tailRange, subrange->imlSegment, splitIndex, subrange->end.index);
// copy locations
for (auto& location : subrange->list_locations)
{
if (location.index >= splitIndex)
tailSubrange->list_locations.push_back(location);
}
// remove tail locations from head
for (sint32 i = 0; i < subrange->list_locations.size(); i++)
{
raLivenessLocation_t* location = subrange->list_locations.data() + i;
if (location->index >= splitIndex)
{
subrange->list_locations.resize(i);
break;
}
}
// adjust start/end
if (trimToHole)
{
if (subrange->list_locations.empty())
{
subrange->end.index = subrange->start.index+1;
}
else
{
subrange->end.index = subrange->list_locations.back().index + 1;
}
if (tailSubrange->list_locations.empty())
{
assert_dbg(); // should not happen? (In this case we can just avoid generating a tail at all)
}
else
{
tailSubrange->start.index = tailSubrange->list_locations.front().index;
}
}
return tailSubrange;
}
void PPCRecRA_updateOrAddSubrangeLocation(raLivenessSubrange_t* subrange, sint32 index, bool isRead, bool isWrite)
{
if (subrange->list_locations.empty())
{
subrange->list_locations.emplace_back(index, isRead, isWrite);
return;
}
raLivenessLocation_t* lastLocation = subrange->list_locations.data() + (subrange->list_locations.size() - 1);
cemu_assert_debug(lastLocation->index <= index);
if (lastLocation->index == index)
{
// update
lastLocation->isRead = lastLocation->isRead || isRead;
lastLocation->isWrite = lastLocation->isWrite || isWrite;
return;
}
// add new
subrange->list_locations.emplace_back(index, isRead, isWrite);
}
sint32 PPCRecRARange_getReadWriteCost(PPCRecImlSegment_t* imlSegment)
{
sint32 v = imlSegment->loopDepth + 1;
v *= 5;
return v*v; // 25, 100, 225, 400
}
// calculate cost of entire range
// ignores data flow and does not detect avoidable reads/stores
sint32 PPCRecRARange_estimateCost(raLivenessRange_t* range)
{
sint32 cost = 0;
// todo - this algorithm isn't accurate. If we have 10 parallel branches with a load each then the actual cost is still only that of one branch (plus minimal extra cost for generating more code).
// currently we calculate the cost based on the most expensive entry/exit point
sint32 mostExpensiveRead = 0;
sint32 mostExpensiveWrite = 0;
sint32 readCount = 0;
sint32 writeCount = 0;
for (auto& subrange : range->list_subranges)
{
if (subrange->start.index != RA_INTER_RANGE_START)
{
//cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment);
mostExpensiveRead = std::max(mostExpensiveRead, PPCRecRARange_getReadWriteCost(subrange->imlSegment));
readCount++;
}
if (subrange->end.index != RA_INTER_RANGE_END)
{
//cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment);
mostExpensiveWrite = std::max(mostExpensiveWrite, PPCRecRARange_getReadWriteCost(subrange->imlSegment));
writeCount++;
}
}
cost = mostExpensiveRead + mostExpensiveWrite;
cost = cost + (readCount + writeCount) / 10;
return cost;
}
// calculate the additional cost (delta) that the range would incur after calling PPCRecRA_explodeRange() on it
sint32 PPCRecRARange_estimateAdditionalCostAfterRangeExplode(raLivenessRange_t* range)
{
sint32 cost = -PPCRecRARange_estimateCost(range);
for (auto& subrange : range->list_subranges)
{
if (subrange->list_locations.empty())
continue;
cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // we assume a read and a store
}
return cost;
}
sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessSubrange_t* subrange, sint32 splitIndex)
{
// validation
#ifndef PUBLIC_RELEASE
if (subrange->end.index == RA_INTER_RANGE_END)
assert_dbg();
#endif
sint32 cost = 0;
// find split position in location list
if (subrange->list_locations.empty())
{
assert_dbg(); // should not happen?
return 0;
}
if (splitIndex <= subrange->list_locations.front().index)
return 0;
if (splitIndex > subrange->list_locations.back().index)
return 0;
// todo - determine exact cost of split subranges
cost += PPCRecRARange_getReadWriteCost(subrange->imlSegment) * 2; // currently we assume that the additional region will require a read and a store
//for (sint32 f = 0; f < subrange->list_locations.size(); f++)
//{
// raLivenessLocation_t* location = subrange->list_locations.data() + f;
// if (location->index >= splitIndex)
// {
// ...
// return cost;
// }
//}
return cost;
}

View file

@ -0,0 +1,27 @@
#pragma once
raLivenessRange_t* PPCRecRA_createRangeBase(ppcImlGenContext_t* ppcImlGenContext, uint32 virtualRegister, uint32 name);
raLivenessSubrange_t* PPCRecRA_createSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range, PPCRecImlSegment_t* imlSegment, sint32 startIndex, sint32 endIndex);
void PPCRecRA_deleteSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange);
void PPCRecRA_deleteRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range);
void PPCRecRA_deleteAllRanges(ppcImlGenContext_t* ppcImlGenContext);
void PPCRecRA_mergeRanges(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range, raLivenessRange_t* absorbedRange);
void PPCRecRA_explodeRange(ppcImlGenContext_t* ppcImlGenContext, raLivenessRange_t* range);
void PPCRecRA_mergeSubranges(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange, raLivenessSubrange_t* absorbedSubrange);
raLivenessSubrange_t* PPCRecRA_splitLocalSubrange(ppcImlGenContext_t* ppcImlGenContext, raLivenessSubrange_t* subrange, sint32 splitIndex, bool trimToHole = false);
void PPCRecRA_updateOrAddSubrangeLocation(raLivenessSubrange_t* subrange, sint32 index, bool isRead, bool isWrite);
void PPCRecRA_debugValidateSubrange(raLivenessSubrange_t* subrange);
// cost estimation
sint32 PPCRecRARange_getReadWriteCost(PPCRecImlSegment_t* imlSegment);
sint32 PPCRecRARange_estimateCost(raLivenessRange_t* range);
sint32 PPCRecRARange_estimateAdditionalCostAfterRangeExplode(raLivenessRange_t* range);
sint32 PPCRecRARange_estimateAdditionalCostAfterSplit(raLivenessSubrange_t* subrange, sint32 splitIndex);
// special values to mark the index of ranges that reach across the segment border
#define RA_INTER_RANGE_START (-1)
#define RA_INTER_RANGE_END (0x70000000)
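// e.g. a subrange starting at RA_INTER_RANGE_START receives a value that is live on segment entry,
// while one ending at RA_INTER_RANGE_END keeps the value live into at least one successor segment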

File diff suppressed because it is too large

View file

@ -0,0 +1,414 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h"
#include "PPCRecompilerImlRanges.h"
#include <queue>
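// Liveness pass overview: first record a min/max usage window per virtual GPR in every segment,
// then stitch the windows together across segment borders and convert them into
// raLivenessRange_t/raLivenessSubrange_t objects for the register allocator.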
bool _isRangeDefined(PPCRecImlSegment_t* imlSegment, sint32 vGPR)
{
return (imlSegment->raDistances.reg[vGPR].usageStart != INT_MAX);
}
void PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment)
{
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
{
imlSegment->raDistances.reg[i].usageStart = INT_MAX;
imlSegment->raDistances.reg[i].usageEnd = INT_MIN;
}
// scan instructions for usage range
sint32 index = 0;
PPCImlOptimizerUsedRegisters_t gprTracking;
while (index < imlSegment->imlListCount)
{
// end loop at suffix instruction
if (PPCRecompiler_isSuffixInstruction(imlSegment->imlList + index))
break;
// get accessed GPRs
PPCRecompiler_checkRegisterUsage(NULL, imlSegment->imlList + index, &gprTracking);
for (sint32 t = 0; t < 4; t++)
{
sint32 virtualRegister = gprTracking.gpr[t];
if (virtualRegister < 0)
continue;
cemu_assert_debug(virtualRegister < PPC_REC_MAX_VIRTUAL_GPR);
imlSegment->raDistances.reg[virtualRegister].usageStart = std::min(imlSegment->raDistances.reg[virtualRegister].usageStart, index); // index before/at instruction
imlSegment->raDistances.reg[virtualRegister].usageEnd = std::max(imlSegment->raDistances.reg[virtualRegister].usageEnd, index+1); // index after instruction
}
// next instruction
index++;
}
}
void PPCRecRA_calculateLivenessRangesV2(ppcImlGenContext_t* ppcImlGenContext)
{
// for each register calculate min/max index of usage range within each segment
for (sint32 s = 0; s < ppcImlGenContext->segmentListCount; s++)
{
PPCRecRA_calculateSegmentMinMaxRanges(ppcImlGenContext, ppcImlGenContext->segmentList[s]);
}
}
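// recursively walks the segment graph and converts the raw usage windows of vGPR into linked
// subranges; the isProcessed flag guards against endless recursion in cyclic control flow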
raLivenessSubrange_t* PPCRecRA_convertToMappedRanges(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment, sint32 vGPR, raLivenessRange_t* range)
{
if (imlSegment->raDistances.isProcessed[vGPR])
{
// return already existing segment
return imlSegment->raInfo.linkedList_perVirtualGPR[vGPR];
}
imlSegment->raDistances.isProcessed[vGPR] = true;
if (_isRangeDefined(imlSegment, vGPR) == false)
return nullptr;
// create subrange
cemu_assert_debug(imlSegment->raInfo.linkedList_perVirtualGPR[vGPR] == nullptr);
raLivenessSubrange_t* subrange = PPCRecRA_createSubrange(ppcImlGenContext, range, imlSegment, imlSegment->raDistances.reg[vGPR].usageStart, imlSegment->raDistances.reg[vGPR].usageEnd);
// traverse forward
if (imlSegment->raDistances.reg[vGPR].usageEnd == RA_INTER_RANGE_END)
{
if (imlSegment->nextSegmentBranchTaken && imlSegment->nextSegmentBranchTaken->raDistances.reg[vGPR].usageStart == RA_INTER_RANGE_START)
{
subrange->subrangeBranchTaken = PPCRecRA_convertToMappedRanges(ppcImlGenContext, imlSegment->nextSegmentBranchTaken, vGPR, range);
cemu_assert_debug(subrange->subrangeBranchTaken->start.index == RA_INTER_RANGE_START);
}
if (imlSegment->nextSegmentBranchNotTaken && imlSegment->nextSegmentBranchNotTaken->raDistances.reg[vGPR].usageStart == RA_INTER_RANGE_START)
{
subrange->subrangeBranchNotTaken = PPCRecRA_convertToMappedRanges(ppcImlGenContext, imlSegment->nextSegmentBranchNotTaken, vGPR, range);
cemu_assert_debug(subrange->subrangeBranchNotTaken->start.index == RA_INTER_RANGE_START);
}
}
// traverse backward
if (imlSegment->raDistances.reg[vGPR].usageStart == RA_INTER_RANGE_START)
{
for (auto& it : imlSegment->list_prevSegments)
{
if (it->raDistances.reg[vGPR].usageEnd == RA_INTER_RANGE_END)
PPCRecRA_convertToMappedRanges(ppcImlGenContext, it, vGPR, range);
}
}
// return subrange
return subrange;
}
void PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment)
{
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
{
if( _isRangeDefined(imlSegment, i) == false )
continue;
if( imlSegment->raDistances.isProcessed[i])
continue;
raLivenessRange_t* range = PPCRecRA_createRangeBase(ppcImlGenContext, i, ppcImlGenContext->mappedRegister[i]);
PPCRecRA_convertToMappedRanges(ppcImlGenContext, imlSegment, i, range);
}
// create lookup table of ranges
raLivenessSubrange_t* vGPR2Subrange[PPC_REC_MAX_VIRTUAL_GPR];
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++)
{
vGPR2Subrange[i] = imlSegment->raInfo.linkedList_perVirtualGPR[i];
#ifndef PUBLIC_RELEASE
if (vGPR2Subrange[i] && vGPR2Subrange[i]->link_sameVirtualRegisterGPR.next != nullptr)
assert_dbg();
#endif
}
// parse instructions and convert to locations
sint32 index = 0;
PPCImlOptimizerUsedRegisters_t gprTracking;
while (index < imlSegment->imlListCount)
{
// end loop at suffix instruction
if (PPCRecompiler_isSuffixInstruction(imlSegment->imlList + index))
break;
// get accessed GPRs
PPCRecompiler_checkRegisterUsage(NULL, imlSegment->imlList + index, &gprTracking);
// handle accessed GPR
for (sint32 t = 0; t < 4; t++)
{
sint32 virtualRegister = gprTracking.gpr[t];
if (virtualRegister < 0)
continue;
bool isWrite = (t == 3);
// add location
PPCRecRA_updateOrAddSubrangeLocation(vGPR2Subrange[virtualRegister], index, isWrite == false, isWrite);
#ifndef PUBLIC_RELEASE
if (index < vGPR2Subrange[virtualRegister]->start.index)
assert_dbg();
if (index+1 > vGPR2Subrange[virtualRegister]->end.index)
assert_dbg();
#endif
}
// next instruction
index++;
}
}
void PPCRecRA_extendRangeToEndOfSegment(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment, sint32 vGPR)
{
if (_isRangeDefined(imlSegment, vGPR) == false)
{
imlSegment->raDistances.reg[vGPR].usageStart = RA_INTER_RANGE_END;
imlSegment->raDistances.reg[vGPR].usageEnd = RA_INTER_RANGE_END;
return;
}
imlSegment->raDistances.reg[vGPR].usageEnd = RA_INTER_RANGE_END;
}
void PPCRecRA_extendRangeToBeginningOfSegment(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment, sint32 vGPR)
{
if (_isRangeDefined(imlSegment, vGPR) == false)
{
imlSegment->raDistances.reg[vGPR].usageStart = RA_INTER_RANGE_START;
imlSegment->raDistances.reg[vGPR].usageEnd = RA_INTER_RANGE_START;
}
else
{
imlSegment->raDistances.reg[vGPR].usageStart = RA_INTER_RANGE_START;
}
// propagate backwards
for (auto& it : imlSegment->list_prevSegments)
{
PPCRecRA_extendRangeToEndOfSegment(ppcImlGenContext, it, vGPR);
}
}
void _PPCRecRA_connectRanges(ppcImlGenContext_t* ppcImlGenContext, sint32 vGPR, PPCRecImlSegment_t** route, sint32 routeDepth)
{
#ifndef PUBLIC_RELEASE
if (routeDepth < 2)
assert_dbg();
#endif
// extend starting range to end of segment
PPCRecRA_extendRangeToEndOfSegment(ppcImlGenContext, route[0], vGPR);
// extend all the connecting segments in both directions
for (sint32 i = 1; i < (routeDepth - 1); i++)
{
PPCRecRA_extendRangeToEndOfSegment(ppcImlGenContext, route[i], vGPR);
PPCRecRA_extendRangeToBeginningOfSegment(ppcImlGenContext, route[i], vGPR);
}
// extend the final segment towards the beginning
PPCRecRA_extendRangeToBeginningOfSegment(ppcImlGenContext, route[routeDepth-1], vGPR);
}
void _PPCRecRA_checkAndTryExtendRange(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* currentSegment, sint32 vGPR, sint32 distanceLeft, PPCRecImlSegment_t** route, sint32 routeDepth)
{
if (routeDepth >= 64)
{
forceLogDebug_printf("Recompiler RA route maximum depth exceeded for function 0x%08x\n", ppcImlGenContext->functionRef->ppcAddress);
return;
}
route[routeDepth] = currentSegment;
if (currentSegment->raDistances.reg[vGPR].usageStart == INT_MAX)
{
// measure distance to end of segment
distanceLeft -= currentSegment->imlListCount;
if (distanceLeft > 0)
{
if (currentSegment->nextSegmentBranchNotTaken)
_PPCRecRA_checkAndTryExtendRange(ppcImlGenContext, currentSegment->nextSegmentBranchNotTaken, vGPR, distanceLeft, route, routeDepth + 1);
if (currentSegment->nextSegmentBranchTaken)
_PPCRecRA_checkAndTryExtendRange(ppcImlGenContext, currentSegment->nextSegmentBranchTaken, vGPR, distanceLeft, route, routeDepth + 1);
}
return;
}
else
{
// measure distance to range
if (currentSegment->raDistances.reg[vGPR].usageStart == RA_INTER_RANGE_END)
{
if (distanceLeft < currentSegment->imlListCount)
return; // range too far away
}
else if (currentSegment->raDistances.reg[vGPR].usageStart != RA_INTER_RANGE_START && currentSegment->raDistances.reg[vGPR].usageStart > distanceLeft)
return; // out of range
// found close range -> connect ranges
_PPCRecRA_connectRanges(ppcImlGenContext, vGPR, route, routeDepth + 1);
}
}
void PPCRecRA_checkAndTryExtendRange(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* currentSegment, sint32 vGPR)
{
#ifndef PUBLIC_RELEASE
if (currentSegment->raDistances.reg[vGPR].usageEnd < 0)
assert_dbg();
#endif
// count instructions to end of initial segment
if (currentSegment->raDistances.reg[vGPR].usageEnd == RA_INTER_RANGE_START)
assert_dbg();
sint32 instructionsUntilEndOfSeg;
if (currentSegment->raDistances.reg[vGPR].usageEnd == RA_INTER_RANGE_END)
instructionsUntilEndOfSeg = 0;
else
instructionsUntilEndOfSeg = currentSegment->imlListCount - currentSegment->raDistances.reg[vGPR].usageEnd;
#ifndef PUBLIC_RELEASE
if (instructionsUntilEndOfSeg < 0)
assert_dbg();
#endif
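// heuristic: scan at most 45 instructions ahead when trying to connect nearby ranges across segments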
sint32 remainingScanDist = 45 - instructionsUntilEndOfSeg;
if (remainingScanDist <= 0)
return; // can't reach end
// also don't forget: extending is easier if we allow 'non-symmetric' branches, e.g. a register range that enters only one branch
PPCRecImlSegment_t* route[64];
route[0] = currentSegment;
if (currentSegment->nextSegmentBranchNotTaken)
{
_PPCRecRA_checkAndTryExtendRange(ppcImlGenContext, currentSegment->nextSegmentBranchNotTaken, vGPR, remainingScanDist, route, 1);
}
if (currentSegment->nextSegmentBranchTaken)
{
_PPCRecRA_checkAndTryExtendRange(ppcImlGenContext, currentSegment->nextSegmentBranchTaken, vGPR, remainingScanDist, route, 1);
}
}
void PPCRecRA_mergeCloseRangesForSegmentV2(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment)
{
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
{
if(imlSegment->raDistances.reg[i].usageStart == INT_MAX)
continue; // not used
// check and extend if possible
PPCRecRA_checkAndTryExtendRange(ppcImlGenContext, imlSegment, i);
}
#ifndef PUBLIC_RELEASE
if (imlSegment->list_prevSegments.empty() == false && imlSegment->isEnterable)
assert_dbg();
if ((imlSegment->nextSegmentBranchNotTaken != nullptr || imlSegment->nextSegmentBranchTaken != nullptr) && imlSegment->nextSegmentIsUncertain)
assert_dbg();
#endif
}
void PPCRecRA_followFlowAndExtendRanges(ppcImlGenContext_t* ppcImlGenContext, PPCRecImlSegment_t* imlSegment)
{
std::vector<PPCRecImlSegment_t*> list_segments;
list_segments.reserve(1000);
sint32 index = 0;
imlSegment->raRangeExtendProcessed = true;
list_segments.push_back(imlSegment);
while (index < list_segments.size())
{
PPCRecImlSegment_t* currentSegment = list_segments[index];
PPCRecRA_mergeCloseRangesForSegmentV2(ppcImlGenContext, currentSegment);
// follow flow
if (currentSegment->nextSegmentBranchNotTaken && currentSegment->nextSegmentBranchNotTaken->raRangeExtendProcessed == false)
{
currentSegment->nextSegmentBranchNotTaken->raRangeExtendProcessed = true;
list_segments.push_back(currentSegment->nextSegmentBranchNotTaken);
}
if (currentSegment->nextSegmentBranchTaken && currentSegment->nextSegmentBranchTaken->raRangeExtendProcessed == false)
{
currentSegment->nextSegmentBranchTaken->raRangeExtendProcessed = true;
list_segments.push_back(currentSegment->nextSegmentBranchTaken);
}
index++;
}
}
void PPCRecRA_mergeCloseRangesV2(ppcImlGenContext_t* ppcImlGenContext)
{
for (sint32 s = 0; s < ppcImlGenContext->segmentListCount; s++)
{
PPCRecImlSegment_t* imlSegment = ppcImlGenContext->segmentList[s];
if (imlSegment->list_prevSegments.empty())
{
if (imlSegment->raRangeExtendProcessed)
assert_dbg(); // should not happen
PPCRecRA_followFlowAndExtendRanges(ppcImlGenContext, imlSegment);
}
}
}
void PPCRecRA_extendRangesOutOfLoopsV2(ppcImlGenContext_t* ppcImlGenContext)
{
for (sint32 s = 0; s < ppcImlGenContext->segmentListCount; s++)
{
PPCRecImlSegment_t* imlSegment = ppcImlGenContext->segmentList[s];
auto localLoopDepth = imlSegment->loopDepth;
if( localLoopDepth <= 0 )
continue; // not inside a loop
// look for loop exit
bool hasLoopExit = false;
if (imlSegment->nextSegmentBranchTaken && imlSegment->nextSegmentBranchTaken->loopDepth < localLoopDepth)
{
hasLoopExit = true;
}
if (imlSegment->nextSegmentBranchNotTaken && imlSegment->nextSegmentBranchNotTaken->loopDepth < localLoopDepth)
{
hasLoopExit = true;
}
if(hasLoopExit == false)
continue;
// extend looping ranges into all exits (this allows the data flow analyzer to move stores out of the loop)
for (sint32 i = 0; i < PPC_REC_MAX_VIRTUAL_GPR; i++) // todo: Use dynamic maximum or list of used vGPRs so we can avoid parsing empty entries
{
if (imlSegment->raDistances.reg[i].usageEnd != RA_INTER_RANGE_END)
continue; // range not set or does not reach end of segment
if(imlSegment->nextSegmentBranchTaken)
PPCRecRA_extendRangeToBeginningOfSegment(ppcImlGenContext, imlSegment->nextSegmentBranchTaken, i);
if(imlSegment->nextSegmentBranchNotTaken)
PPCRecRA_extendRangeToBeginningOfSegment(ppcImlGenContext, imlSegment->nextSegmentBranchNotTaken, i);
}
}
}
void PPCRecRA_processFlowAndCalculateLivenessRangesV2(ppcImlGenContext_t* ppcImlGenContext)
{
// merge close ranges
PPCRecRA_mergeCloseRangesV2(ppcImlGenContext);
// extra pass to move register stores out of loops
PPCRecRA_extendRangesOutOfLoopsV2(ppcImlGenContext);
// calculate liveness ranges
for (sint32 s = 0; s < ppcImlGenContext->segmentListCount; s++)
{
PPCRecImlSegment_t* imlSegment = ppcImlGenContext->segmentList[s];
PPCRecRA_createSegmentLivenessRanges(ppcImlGenContext, imlSegment);
}
}
void PPCRecRA_analyzeSubrangeDataDependencyV2(raLivenessSubrange_t* subrange)
{
bool isRead = false;
bool isWritten = false;
bool isOverwritten = false;
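// a subrange counts as overwritten if its first access is a write, in which case the previous
// value never has to be loaded (_noLoad)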
for (auto& location : subrange->list_locations)
{
if (location.isRead)
{
isRead = true;
}
if (location.isWrite)
{
if (isRead == false)
isOverwritten = true;
isWritten = true;
}
}
subrange->_noLoad = isOverwritten;
subrange->hasStore = isWritten;
if (subrange->start.index == RA_INTER_RANGE_START)
subrange->_noLoad = true;
}
void _analyzeRangeDataFlow(raLivenessSubrange_t* subrange);
void PPCRecRA_analyzeRangeDataFlowV2(ppcImlGenContext_t* ppcImlGenContext)
{
// this function is called after _assignRegisters(), which means that all ranges are already final and won't change anymore
// first do a per-subrange pass
for (auto& range : ppcImlGenContext->raInfo.list_ranges)
{
for (auto& subrange : range->list_subranges)
{
PPCRecRA_analyzeSubrangeDataDependencyV2(subrange);
}
}
// then do a second pass where we scan along subrange flow
for (auto& range : ppcImlGenContext->raInfo.list_ranges)
{
for (auto& subrange : range->list_subranges) // todo - traversing this backwards should be faster and yield better results due to the nature of the algorithm
{
_analyzeRangeDataFlow(subrange);
}
}
}

View file

@ -0,0 +1,173 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
PPCRecImlSegment_t* PPCRecompiler_getSegmentByPPCJumpAddress(ppcImlGenContext_t* ppcImlGenContext, uint32 ppcOffset)
{
for(sint32 s=0; s<ppcImlGenContext->segmentListCount; s++)
{
if( ppcImlGenContext->segmentList[s]->isJumpDestination && ppcImlGenContext->segmentList[s]->jumpDestinationPPCAddress == ppcOffset )
{
return ppcImlGenContext->segmentList[s];
}
}
debug_printf("PPCRecompiler_getSegmentByPPCJumpAddress(): Unable to find segment (ppcOffset 0x%08x)\n", ppcOffset);
return NULL;
}
void PPCRecompilerIml_setLinkBranchNotTaken(PPCRecImlSegment_t* imlSegmentSrc, PPCRecImlSegment_t* imlSegmentDst)
{
// make sure segments aren't already linked
if (imlSegmentSrc->nextSegmentBranchNotTaken == imlSegmentDst)
return;
// add as next segment for source
if (imlSegmentSrc->nextSegmentBranchNotTaken != NULL)
assert_dbg();
imlSegmentSrc->nextSegmentBranchNotTaken = imlSegmentDst;
// add as previous segment for destination
imlSegmentDst->list_prevSegments.push_back(imlSegmentSrc);
}
void PPCRecompilerIml_setLinkBranchTaken(PPCRecImlSegment_t* imlSegmentSrc, PPCRecImlSegment_t* imlSegmentDst)
{
// make sure segments aren't already linked
if (imlSegmentSrc->nextSegmentBranchTaken == imlSegmentDst)
return;
// add as next segment for source
if (imlSegmentSrc->nextSegmentBranchTaken != NULL)
assert_dbg();
imlSegmentSrc->nextSegmentBranchTaken = imlSegmentDst;
// add as previous segment for destination
imlSegmentDst->list_prevSegments.push_back(imlSegmentSrc);
}
void PPCRecompilerIML_removeLink(PPCRecImlSegment_t* imlSegmentSrc, PPCRecImlSegment_t* imlSegmentDst)
{
if (imlSegmentSrc->nextSegmentBranchNotTaken == imlSegmentDst)
{
imlSegmentSrc->nextSegmentBranchNotTaken = NULL;
}
else if (imlSegmentSrc->nextSegmentBranchTaken == imlSegmentDst)
{
imlSegmentSrc->nextSegmentBranchTaken = NULL;
}
else
assert_dbg();
bool matchFound = false;
for (sint32 i = 0; i < imlSegmentDst->list_prevSegments.size(); i++)
{
if (imlSegmentDst->list_prevSegments[i] == imlSegmentSrc)
{
imlSegmentDst->list_prevSegments.erase(imlSegmentDst->list_prevSegments.begin()+i);
matchFound = true;
break;
}
}
if (matchFound == false)
assert_dbg();
}
/*
 * Replaces all links to segment orig with links to segment new
*/
void PPCRecompilerIML_relinkInputSegment(PPCRecImlSegment_t* imlSegmentOrig, PPCRecImlSegment_t* imlSegmentNew)
{
while (imlSegmentOrig->list_prevSegments.size() != 0)
{
PPCRecImlSegment_t* prevSegment = imlSegmentOrig->list_prevSegments[0];
if (prevSegment->nextSegmentBranchNotTaken == imlSegmentOrig)
{
PPCRecompilerIML_removeLink(prevSegment, imlSegmentOrig);
PPCRecompilerIml_setLinkBranchNotTaken(prevSegment, imlSegmentNew);
}
else if (prevSegment->nextSegmentBranchTaken == imlSegmentOrig)
{
PPCRecompilerIML_removeLink(prevSegment, imlSegmentOrig);
PPCRecompilerIml_setLinkBranchTaken(prevSegment, imlSegmentNew);
}
else
{
assert_dbg();
}
}
}
void PPCRecompilerIML_linkSegments(ppcImlGenContext_t* ppcImlGenContext)
{
for(sint32 s=0; s<ppcImlGenContext->segmentListCount; s++)
{
PPCRecImlSegment_t* imlSegment = ppcImlGenContext->segmentList[s];
bool isLastSegment = (s+1)>=ppcImlGenContext->segmentListCount;
PPCRecImlSegment_t* nextSegment = isLastSegment?NULL:ppcImlGenContext->segmentList[s+1];
// handle empty segment
if( imlSegment->imlListCount == 0 )
{
if (isLastSegment == false)
PPCRecompilerIml_setLinkBranchNotTaken(imlSegment, ppcImlGenContext->segmentList[s+1]); // continue execution to next segment
else
imlSegment->nextSegmentIsUncertain = true;
continue;
}
// check last instruction of segment
PPCRecImlInstruction_t* imlInstruction = imlSegment->imlList+(imlSegment->imlListCount-1);
if( imlInstruction->type == PPCREC_IML_TYPE_CJUMP || imlInstruction->type == PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK )
{
// find destination segment by ppc jump address
PPCRecImlSegment_t* jumpDestSegment = PPCRecompiler_getSegmentByPPCJumpAddress(ppcImlGenContext, imlInstruction->op_conditionalJump.jumpmarkAddress);
if( jumpDestSegment )
{
if (imlInstruction->op_conditionalJump.condition != PPCREC_JUMP_CONDITION_NONE)
PPCRecompilerIml_setLinkBranchNotTaken(imlSegment, nextSegment);
PPCRecompilerIml_setLinkBranchTaken(imlSegment, jumpDestSegment);
}
else
{
imlSegment->nextSegmentIsUncertain = true;
}
}
else if( imlInstruction->type == PPCREC_IML_TYPE_MACRO )
{
// currently we assume that the next segment is unknown for all macros
imlSegment->nextSegmentIsUncertain = true;
}
else
{
// all other instruction types do not branch
//imlSegment->nextSegment[0] = nextSegment;
PPCRecompilerIml_setLinkBranchNotTaken(imlSegment, nextSegment);
//imlSegment->nextSegmentIsUncertain = true;
}
}
}
void PPCRecompilerIML_isolateEnterableSegments(ppcImlGenContext_t* ppcImlGenContext)
{
sint32 initialSegmentCount = ppcImlGenContext->segmentListCount;
for (sint32 i = 0; i < ppcImlGenContext->segmentListCount; i++)
{
PPCRecImlSegment_t* imlSegment = ppcImlGenContext->segmentList[i];
if (imlSegment->list_prevSegments.empty() == false && imlSegment->isEnterable)
{
// spawn new segment at end
PPCRecompilerIml_insertSegments(ppcImlGenContext, ppcImlGenContext->segmentListCount, 1);
PPCRecImlSegment_t* entrySegment = ppcImlGenContext->segmentList[ppcImlGenContext->segmentListCount-1];
entrySegment->isEnterable = true;
entrySegment->enterPPCAddress = imlSegment->enterPPCAddress;
// create jump instruction
PPCRecompiler_pushBackIMLInstructions(entrySegment, 0, 1);
PPCRecompilerImlGen_generateNewInstruction_jumpSegment(ppcImlGenContext, entrySegment->imlList + 0);
PPCRecompilerIml_setLinkBranchTaken(entrySegment, imlSegment);
// remove enterable flag from original segment
imlSegment->isEnterable = false;
imlSegment->enterPPCAddress = 0;
}
}
}
PPCRecImlInstruction_t* PPCRecompilerIML_getLastInstruction(PPCRecImlSegment_t* imlSegment)
{
if (imlSegment->imlListCount == 0)
return nullptr;
return imlSegment->imlList + (imlSegment->imlListCount - 1);
}

File diff suppressed because it is too large

View file

@ -0,0 +1,332 @@
typedef struct
{
uint32 offset;
uint8 type;
void* extraInfo;
}x64RelocEntry_t;
typedef struct
{
uint8* codeBuffer;
sint32 codeBufferIndex;
sint32 codeBufferSize;
// cr state
sint32 activeCRRegister; // current x86 condition flags reflect this cr* register
sint32 activeCRState; // describes the way in which x86 flags map to the cr register (signed / unsigned)
// relocate offsets
x64RelocEntry_t* relocateOffsetTable;
sint32 relocateOffsetTableSize;
sint32 relocateOffsetTableCount;
}x64GenContext_t;
// Some of these are defined by winnt.h and gnu headers
#undef REG_EAX
#undef REG_ECX
#undef REG_EDX
#undef REG_EBX
#undef REG_ESP
#undef REG_EBP
#undef REG_ESI
#undef REG_EDI
#undef REG_NONE
#undef REG_RAX
#undef REG_RCX
#undef REG_RDX
#undef REG_RBX
#undef REG_RSP
#undef REG_RBP
#undef REG_RSI
#undef REG_RDI
#undef REG_R8
#undef REG_R9
#undef REG_R10
#undef REG_R11
#undef REG_R12
#undef REG_R13
#undef REG_R14
#undef REG_R15
#define REG_EAX 0
#define REG_ECX 1
#define REG_EDX 2
#define REG_EBX 3
#define REG_ESP 4 // reserved for low half of hCPU pointer
#define REG_EBP 5
#define REG_ESI 6
#define REG_EDI 7
#define REG_NONE -1
#define REG_RAX 0
#define REG_RCX 1
#define REG_RDX 2
#define REG_RBX 3
#define REG_RSP 4 // reserved for hCPU pointer
#define REG_RBP 5
#define REG_RSI 6
#define REG_RDI 7
#define REG_R8 8
#define REG_R9 9
#define REG_R10 10
#define REG_R11 11
#define REG_R12 12
#define REG_R13 13 // reserved to hold pointer to memory base? (Not decided yet)
#define REG_R14 14 // reserved as temporary register
#define REG_R15 15 // reserved for pointer to ppcRecompilerInstanceData
#define REG_AL 0
#define REG_CL 1
#define REG_DL 2
#define REG_BL 3
#define REG_AH 4
#define REG_CH 5
#define REG_DH 6
#define REG_BH 7
// reserved registers
#define REG_RESV_TEMP (REG_R14)
#define REG_RESV_HCPU (REG_RSP)
#define REG_RESV_MEMBASE (REG_R13)
#define REG_RESV_RECDATA (REG_R15)
// reserved floating-point registers
#define REG_RESV_FPR_TEMP (15)
extern sint32 x64Gen_registerMap[12];
#define tempToRealRegister(__x) (x64Gen_registerMap[__x])
#define tempToRealFPRRegister(__x) (__x)
#define reg32ToReg16(__x) (__x)
enum
{
X86_CONDITION_EQUAL, // or zero
X86_CONDITION_NOT_EQUAL, // or not zero
X86_CONDITION_SIGNED_LESS, // or not greater/equal
X86_CONDITION_SIGNED_GREATER, // or not less/equal
X86_CONDITION_SIGNED_LESS_EQUAL, // or not greater
X86_CONDITION_SIGNED_GREATER_EQUAL, // or not less
X86_CONDITION_UNSIGNED_BELOW, // or not above/equal
X86_CONDITION_UNSIGNED_ABOVE, // or not below/equal
X86_CONDITION_UNSIGNED_BELOW_EQUAL, // or not above
X86_CONDITION_UNSIGNED_ABOVE_EQUAL, // or not below
X86_CONDITION_CARRY, // carry flag must be set
X86_CONDITION_NOT_CARRY, // carry flag must not be set
X86_CONDITION_SIGN, // sign flag must be set
X86_CONDITION_NOT_SIGN, // sign flag must not be set
X86_CONDITION_PARITY, // parity flag must be set
X86_CONDITION_NONE, // no condition, jump always
};
#define PPCREC_CR_TEMPORARY (8) // never stored
#define PPCREC_CR_STATE_TYPE_UNSIGNED_ARITHMETIC (0) // for unsigned arithmetic operations (ADD, CMPI)
#define PPCREC_CR_STATE_TYPE_SIGNED_ARITHMETIC (1) // for signed arithmetic operations (ADD, CMPI)
#define PPCREC_CR_STATE_TYPE_LOGICAL (2) // for unsigned operations (CMPLI)
#define X86_RELOC_MAKE_RELATIVE (0) // make code imm relative to instruction
#define X64_RELOC_LINK_TO_PPC (1) // translate from ppc address to x86 offset
#define X64_RELOC_LINK_TO_SEGMENT (2) // link to beginning of segment
#define PPC_X64_GPR_USABLE_REGISTERS (16-4)
#define PPC_X64_FPR_USABLE_REGISTERS (16-1) // Use XMM0 - XMM14, XMM15 is the temp register
bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext);
void PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext);
void PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext_t* x64GenContext, sint32 jumpInstructionOffset, sint32 destinationOffset);
void PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions();
void PPCRecompilerX64Gen_imlInstruction_fpr_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerX64Gen_imlInstruction_fpr_name_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction);
bool PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction, bool indexed);
bool PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction, bool indexed);
void PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction);
void PPCRecompilerX64Gen_imlInstruction_fpr_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, PPCRecImlInstruction_t* imlInstruction);
// ASM gen
void x64Gen_writeU8(x64GenContext_t* x64GenContext, uint8 v);
void x64Gen_writeU16(x64GenContext_t* x64GenContext, uint32 v);
void x64Gen_writeU32(x64GenContext_t* x64GenContext, uint32 v);
void x64Emit_mov_reg32_mem32(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memOffset);
void x64Emit_mov_mem32_reg32(x64GenContext_t* x64GenContext, sint32 memBaseReg64, sint32 memOffset, sint32 srcReg);
void x64Emit_mov_mem64_reg64(x64GenContext_t* x64GenContext, sint32 memBaseReg64, sint32 memOffset, sint32 srcReg);
void x64Emit_mov_reg64_mem64(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memOffset);
void x64Emit_mov_reg64_mem32(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memOffset);
void x64Emit_mov_mem32_reg64(x64GenContext_t* x64GenContext, sint32 memBaseReg64, sint32 memOffset, sint32 srcReg);
void x64Emit_mov_reg64_mem64(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memIndexReg64, sint32 memOffset);
void x64Emit_mov_reg32_mem32(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memIndexReg64, sint32 memOffset);
void x64Emit_mov_reg64b_mem8(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memIndexReg64, sint32 memOffset);
void x64Emit_movZX_reg32_mem8(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memIndexReg64, sint32 memOffset);
void x64Emit_movZX_reg64_mem8(x64GenContext_t* x64GenContext, sint32 destReg, sint32 memBaseReg64, sint32 memOffset);
void x64Gen_movSignExtend_reg64Low32_mem8Reg64PlusReg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void x64Gen_mov_mem64Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32, sint32 srcRegister);
void x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32, sint32 srcRegister);
void x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32, sint32 srcRegister);
void x64Gen_mov_mem32Reg64_imm32(x64GenContext_t* x64GenContext, sint32 memRegister, uint32 memImmU32, uint32 dataImmU32);
void x64Gen_mov_mem64Reg64_imm32(x64GenContext_t* x64GenContext, sint32 memRegister, uint32 memImmU32, uint32 dataImmU32);
void x64Gen_mov_mem8Reg64_imm8(x64GenContext_t* x64GenContext, sint32 memRegister, uint32 memImmU32, uint8 dataImmU8);
void x64Gen_mov_reg64_imm64(x64GenContext_t* x64GenContext, sint32 destRegister, uint64 immU64);
void x64Gen_mov_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 destRegister, uint64 immU32);
void x64Gen_mov_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64);
void x64Gen_cmovcc_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, uint32 conditionType, sint32 destRegister, sint32 srcRegister);
void x64Gen_mov_reg64_reg64(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_xchg_reg64_reg64(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_movSignExtend_reg64Low32_reg64Low8(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_movZeroExtend_reg64Low32_reg64Low8(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_or_reg64Low8_mem8Reg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegister64, sint32 memImmS32);
void x64Gen_and_reg64Low8_mem8Reg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegister64, sint32 memImmS32);
void x64Gen_mov_mem8Reg64_reg64Low8(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegister64, sint32 memImmS32);
void x64Gen_lock_cmpxchg_mem32Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32, sint32 srcRegister);
void x64Gen_lock_cmpxchg_mem32Reg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegister64, sint32 memImmS32, sint32 srcRegister);
void x64Gen_add_reg64_reg64(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_add_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_add_reg64_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_add_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_sub_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_sub_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_sub_reg64_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_sub_mem32reg64_imm32(x64GenContext_t* x64GenContext, sint32 memRegister, sint32 memImmS32, uint64 immU32);
void x64Gen_sbb_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_adc_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_adc_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_dec_mem32(x64GenContext_t* x64GenContext, sint32 memoryRegister, uint32 memoryImmU32);
void x64Gen_imul_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 operandRegister);
void x64Gen_idiv_reg64Low32(x64GenContext_t* x64GenContext, sint32 operandRegister);
void x64Gen_div_reg64Low32(x64GenContext_t* x64GenContext, sint32 operandRegister);
void x64Gen_imul_reg64Low32(x64GenContext_t* x64GenContext, sint32 operandRegister);
void x64Gen_mul_reg64Low32(x64GenContext_t* x64GenContext, sint32 operandRegister);
void x64Gen_and_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_and_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_test_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_test_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_cmp_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, sint32 immS32);
void x64Gen_cmp_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_cmp_reg64Low32_mem32reg64(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 memRegister, sint32 memImmS32);
void x64Gen_or_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_or_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_xor_reg32_reg32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_xor_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_xor_reg64Low32_imm32(x64GenContext_t* x64GenContext, sint32 srcRegister, uint32 immU32);
void x64Gen_rol_reg64Low32_imm8(x64GenContext_t* x64GenContext, sint32 srcRegister, sint8 immS8);
void x64Gen_rol_reg64Low32_cl(x64GenContext_t* x64GenContext, sint32 srcRegister);
void x64Gen_rol_reg64Low16_imm8(x64GenContext_t* x64GenContext, sint32 srcRegister, sint8 immS8);
void x64Gen_rol_reg64_imm8(x64GenContext_t* x64GenContext, sint32 srcRegister, sint8 immS8);
void x64Gen_shl_reg64Low32_imm8(x64GenContext_t* x64GenContext, sint32 srcRegister, sint8 immS8);
void x64Gen_shr_reg64Low32_imm8(x64GenContext_t* x64GenContext, sint32 srcRegister, sint8 immS8);
void x64Gen_sar_reg64Low32_imm8(x64GenContext_t* x64GenContext, sint32 srcRegister, sint8 immS8);
void x64Gen_not_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister);
void x64Gen_neg_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister);
void x64Gen_cdq(x64GenContext_t* x64GenContext);
void x64Gen_bswap_reg64(x64GenContext_t* x64GenContext, sint32 destRegister);
void x64Gen_bswap_reg64Lower32bit(x64GenContext_t* x64GenContext, sint32 destRegister);
void x64Gen_bswap_reg64Lower16bit(x64GenContext_t* x64GenContext, sint32 destRegister);
void x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_bsr_reg64Low32_reg64Low32(x64GenContext_t* x64GenContext, sint32 destRegister, sint32 srcRegister);
void x64Gen_setcc_mem8(x64GenContext_t* x64GenContext, sint32 conditionType, sint32 memoryRegister, uint32 memoryImmU32);
void x64Gen_setcc_reg64b(x64GenContext_t* x64GenContext, sint32 conditionType, sint32 dataRegister);
void x64Gen_bt_mem8(x64GenContext_t* x64GenContext, sint32 memoryRegister, uint32 memoryImmU32, uint8 bitIndex);
void x64Gen_cmc(x64GenContext_t* x64GenContext);
void x64Gen_jmp_imm32(x64GenContext_t* x64GenContext, uint32 destImm32);
void x64Gen_jmp_memReg64(x64GenContext_t* x64GenContext, sint32 memRegister, uint32 immU32);
void x64Gen_jmpc_far(x64GenContext_t* x64GenContext, sint32 conditionType, sint32 relativeDest);
void x64Gen_jmpc_near(x64GenContext_t* x64GenContext, sint32 conditionType, sint32 relativeDest);
void x64Gen_push_reg64(x64GenContext_t* x64GenContext, sint32 srcRegister);
void x64Gen_pop_reg64(x64GenContext_t* x64GenContext, sint32 destRegister);
void x64Gen_jmp_reg64(x64GenContext_t* x64GenContext, sint32 srcRegister);
void x64Gen_call_reg64(x64GenContext_t* x64GenContext, sint32 srcRegister);
void x64Gen_ret(x64GenContext_t* x64GenContext);
void x64Gen_int3(x64GenContext_t* x64GenContext);
// floating-point (SIMD/SSE) gen
void x64Gen_movaps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSource);
void x64Gen_movupd_xmmReg_memReg128(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movupd_memReg128_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movddup_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movddup_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_movhlps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_movsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_movsd_memReg64_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movlpd_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc, uint8 imm8);
void x64Gen_addsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_addpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_subsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_subpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_mulsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_mulpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_mulpd_xmmReg_memReg128(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_divsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_divpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_comisd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 memoryReg, sint32 memImmS32);
void x64Gen_ucomisd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_comiss_xmmReg_mem64Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 memoryReg, sint32 memImmS32);
void x64Gen_orps_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32);
void x64Gen_xorps_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32);
void x64Gen_andps_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32);
void x64Gen_andpd_xmmReg_memReg128(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_andps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_pcmpeqd_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32);
void x64Gen_cvttpd2dq_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvttsd2si_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc);
void x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvtps2pd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_cvtpi2pd_xmmReg_mem64Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 memReg, sint32 memImmS32);
void x64Gen_cvtsd2si_reg64Low_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc);
void x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc);
void x64Gen_sqrtsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_sqrtpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_rcpss_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc);
void x64Gen_mulss_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32);
void x64Gen_movd_xmmReg_reg64Low32(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 registerSrc);
void x64Gen_movd_reg64Low32_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc);
void x64Gen_movq_xmmReg_reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 registerSrc);
void x64Gen_movq_reg64_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDst, sint32 xmmRegisterSrc);
// AVX
void x64Gen_avx_VPUNPCKHQDQ_xmm_xmm_xmm(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 srcRegisterA, sint32 srcRegisterB);
void x64Gen_avx_VUNPCKHPD_xmm_xmm_xmm(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 srcRegisterA, sint32 srcRegisterB);
void x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 srcRegisterA, sint32 srcRegisterB);
// BMI2 & MOVBE
void x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32, sint32 srcRegister);
void x64Gen_shrx_reg64_reg64_reg64(x64GenContext_t* x64GenContext, sint32 registerDst, sint32 registerA, sint32 registerB);
void x64Gen_shlx_reg64_reg64_reg64(x64GenContext_t* x64GenContext, sint32 registerDst, sint32 registerA, sint32 registerB);

View file

@ -0,0 +1,49 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerX64.h"
void _x64Gen_writeMODRMDeprecated(x64GenContext_t* x64GenContext, sint32 dataRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void _x64Gen_vex128_nds(x64GenContext_t* x64GenContext, uint8 opcodeMap, uint8 additionalOperand, uint8 pp, uint8 vex_ext, uint8 vex_r, uint8 vex_b, uint8 opcode)
{
if(vex_b != 0)
x64Gen_writeU8(x64GenContext, 0xC4); // three byte VEX, required whenever the B bit has to be encoded
else
x64Gen_writeU8(x64GenContext, 0xC5); // two byte VEX
if (vex_b != 0)
{
// second byte of the three byte form: inverted R, X and B bits plus opcode map (the opcodeMap parameter is currently unused, the map is fixed to 0F)
uint8 vex_x = 0;
x64Gen_writeU8(x64GenContext, (vex_r ? 0x00 : 0x80) | (vex_x ? 0x00 : 0x40) | (vex_b ? 0x00 : 0x20) | 1);
}
// final prefix byte: bit 7 holds inverted R in the two byte form and W in the three byte form (vex_ext covers both roles),
// followed by the inverted NDS register (vvvv), L=0 (128-bit) and the pp field
x64Gen_writeU8(x64GenContext, (vex_ext<<7) | (((~additionalOperand)&0xF)<<3) | pp);
x64Gen_writeU8(x64GenContext, opcode);
}
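// Encoding walk-through (illustrative): for VPUNPCKHQDQ xmm1, xmm2, xmm9 the caller below passes
// additionalOperand=2, pp=VEX_PP_66_0F, vex_ext=1 (dst<8), vex_r=0, vex_b=1 (srcB>=8), so this
// helper emits C4 C1 E9 6D and the caller appends the MODRM byte C9.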
#define VEX_PP_0F 0 // no SIMD prefix
#define VEX_PP_66_0F 1 // 0x66 prefix
#define VEX_PP_F3_0F 2 // 0xF3 prefix
#define VEX_PP_F2_0F 3 // 0xF2 prefix
void x64Gen_avx_VPUNPCKHQDQ_xmm_xmm_xmm(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 srcRegisterA, sint32 srcRegisterB)
{
_x64Gen_vex128_nds(x64GenContext, 0, srcRegisterA, VEX_PP_66_0F, dstRegister < 8 ? 1 : 0, (dstRegister >= 8 && srcRegisterB >= 8) ? 1 : 0, srcRegisterB < 8 ? 0 : 1, 0x6D);
x64Gen_writeU8(x64GenContext, 0xC0 + (srcRegisterB & 7) + (dstRegister & 7) * 8);
}
void x64Gen_avx_VUNPCKHPD_xmm_xmm_xmm(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 srcRegisterA, sint32 srcRegisterB)
{
_x64Gen_vex128_nds(x64GenContext, 0, srcRegisterA, VEX_PP_66_0F, dstRegister < 8 ? 1 : 0, (dstRegister >= 8 && srcRegisterB >= 8) ? 1 : 0, srcRegisterB < 8 ? 0 : 1, 0x15);
x64Gen_writeU8(x64GenContext, 0xC0 + (srcRegisterB & 7) + (dstRegister & 7) * 8);
}
void x64Gen_avx_VSUBPD_xmm_xmm_xmm(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 srcRegisterA, sint32 srcRegisterB)
{
_x64Gen_vex128_nds(x64GenContext, 0, srcRegisterA, VEX_PP_66_0F, dstRegister < 8 ? 1 : 0, (dstRegister >= 8 && srcRegisterB >= 8) ? 1 : 0, srcRegisterB < 8 ? 0 : 1, 0x5C);
x64Gen_writeU8(x64GenContext, 0xC0 + (srcRegisterB & 7) + (dstRegister & 7) * 8);
}
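// Usage sketch (ctx is a hypothetical x64GenContext_t*): x64Gen_avx_VSUBPD_xmm_xmm_xmm(ctx, 0, 1, 2)
// emits C5 F1 5C C2, i.e. VSUBPD xmm0, xmm1, xmm2 via the compact two byte VEX form.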

View file

@ -0,0 +1,80 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerX64.h"
void _x64Gen_writeMODRMDeprecated(x64GenContext_t* x64GenContext, sint32 dataRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32);
void x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32)
{
// MOVBE <dstReg64> (low dword), DWORD [<reg64> + <reg64> + <imm64>]
// compose REX prefix: REX.R for the destination, REX.B for the base register, REX.X for the index register
uint8 rex = 0x40;
if( dstRegister >= 8 )
rex |= 0x04; // REX.R
if( memRegisterA64 >= 8 )
rex |= 0x01; // REX.B
if( memRegisterB64 >= 8 )
rex |= 0x02; // REX.X
if( rex != 0x40 )
x64Gen_writeU8(x64GenContext, rex);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x38);
x64Gen_writeU8(x64GenContext, 0xF0);
_x64Gen_writeMODRMDeprecated(x64GenContext, dstRegister, memRegisterA64, memRegisterB64, memImmS32);
}
void x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext_t* x64GenContext, sint32 dstRegister, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32)
{
// MOVBE <dstReg64> (low word), WORD [<reg64> + <reg64> + <imm64>]
// note: Unlike the 32bit version this instruction does not set the upper 32bits of the 64bit register to 0
x64Gen_writeU8(x64GenContext, 0x66); // 16bit prefix
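// the 0x66 operand-size prefix is a legacy prefix and must precede the REX byte emitted by the 32bit variant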
x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, dstRegister, memRegisterA64, memRegisterB64, memImmS32);
}
void x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext_t* x64GenContext, sint32 memRegisterA64, sint32 memRegisterB64, sint32 memImmS32, sint32 srcRegister)
{
// MOVBE DWORD [<reg64> + <reg64> + <imm64>], <srcReg64> (low dword)
// compose REX prefix: REX.R for the source, REX.B for the base register, REX.X for the index register
uint8 rex = 0x40;
if( srcRegister >= 8 )
rex |= 0x04; // REX.R
if( memRegisterA64 >= 8 )
rex |= 0x01; // REX.B
if( memRegisterB64 >= 8 )
rex |= 0x02; // REX.X
if( rex != 0x40 )
x64Gen_writeU8(x64GenContext, rex);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x38);
x64Gen_writeU8(x64GenContext, 0xF1);
_x64Gen_writeMODRMDeprecated(x64GenContext, srcRegister, memRegisterA64, memRegisterB64, memImmS32);
}
void x64Gen_shrx_reg64_reg64_reg64(x64GenContext_t* x64GenContext, sint32 registerDst, sint32 registerA, sint32 registerB)
{
// SHRX reg64, reg64, reg64 (VEX.LZ.F2.0F38.W1)
x64Gen_writeU8(x64GenContext, 0xC4); // three byte VEX
x64Gen_writeU8(x64GenContext, 0xE2 - ((registerDst >= 8) ? 0x80 : 0) - ((registerA >= 8) ? 0x20 : 0)); // inverted R/B bits + opcode map 0F38
x64Gen_writeU8(x64GenContext, 0xFB - registerB * 8); // W=1, inverted vvvv selects the shift-amount register, pp=F2
x64Gen_writeU8(x64GenContext, 0xF7);
x64Gen_writeU8(x64GenContext, 0xC0 + (registerDst & 7) * 8 + (registerA & 7)); // MODRM: reg=registerDst, rm=registerA
}
void x64Gen_shlx_reg64_reg64_reg64(x64GenContext_t* x64GenContext, sint32 registerDst, sint32 registerA, sint32 registerB)
{
// SHLX reg64, reg64, reg64 (VEX.LZ.66.0F38.W1); identical to SHRX except pp=66
x64Gen_writeU8(x64GenContext, 0xC4);
x64Gen_writeU8(x64GenContext, 0xE2 - ((registerDst >= 8) ? 0x80 : 0) - ((registerA >= 8) ? 0x20 : 0));
x64Gen_writeU8(x64GenContext, 0xF9 - registerB * 8);
x64Gen_writeU8(x64GenContext, 0xF7);
x64Gen_writeU8(x64GenContext, 0xC0 + (registerDst & 7) * 8 + (registerA & 7));
}
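// Usage sketch (ctx is a hypothetical x64GenContext_t*): x64Gen_shrx_reg64_reg64_reg64(ctx, 0, 1, 2)
// emits C4 E2 EB F7 C1, i.e. SHRX RAX, RCX, RDX (RAX = RCX >> (RDX & 63)).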

File diff suppressed because it is too large

File diff suppressed because it is too large

View file

@ -0,0 +1,752 @@
#include "PPCRecompiler.h"
#include "PPCRecompilerIml.h"
#include "PPCRecompilerX64.h"
void x64Gen_genSSEVEXPrefix2(x64GenContext_t* x64GenContext, sint32 xmmRegister1, sint32 xmmRegister2, bool use64BitMode)
{
// note: despite the name this emits a REX prefix (xmmRegister1 -> REX.B, xmmRegister2 -> REX.R, use64BitMode -> REX.W)
if( xmmRegister1 < 8 && xmmRegister2 < 8 && use64BitMode == false )
return; // no prefix needed
uint8 v = 0x40;
if( xmmRegister1 >= 8 )
v |= 0x01; // REX.B
if( xmmRegister2 >= 8 )
v |= 0x04; // REX.R
if( use64BitMode )
v |= 0x08; // REX.W
x64Gen_writeU8(x64GenContext, v);
}
void x64Gen_genSSEVEXPrefix1(x64GenContext_t* x64GenContext, sint32 xmmRegister, bool use64BitMode)
{
// single-register variant; here use64BitMode sets REX.B, callers pass true when the memory base register is R8-R15
if( xmmRegister < 8 && use64BitMode == false )
return; // no prefix needed
uint8 v = 0x40;
if( use64BitMode )
v |= 0x01; // REX.B
if( xmmRegister >= 8 )
v |= 0x04; // REX.R
x64Gen_writeU8(x64GenContext, v);
}
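// Example (illustrative): x64Gen_genSSEVEXPrefix2(ctx, 9, 1, false) writes 0x41 (REX.B set), so a
// following 0F 28 MOVAPS with MODRM byte 0xC9 encodes xmm9 as the source and xmm1 as the destination.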
void x64Gen_movaps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSource)
{
// SSE
// copy xmm register
// MOVAPS <xmm>, <xmm>
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSource, xmmRegisterDest, false); // tested
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x28); // alternative encoding: 0x29, source and destination register are exchanged
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSource&7));
}
void x64Gen_movupd_xmmReg_memReg128(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
// move two doubles from memory into xmm register
// MOVUPD <xmm>, [<reg>+<imm>]
if( memRegister == REG_ESP )
{
// todo: Short form of instruction if memImmU32 is 0 or in -128 to 127 range
// 66 0F 10 84 E4 23 01 00 00
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegister, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x10);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegister&7)*8);
x64Gen_writeU8(x64GenContext, 0xE4);
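// SIB byte 0xE4 encodes scale 8 with no index register and RSP as base; without an index the scale field is ignored, so this is equivalent to the 0x24 form used elsewhere in this file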
x64Gen_writeU32(x64GenContext, memImmU32);
}
else if( memRegister == REG_NONE )
{
assert_dbg();
//x64Gen_writeU8(x64GenContext, 0x66);
//x64Gen_writeU8(x64GenContext, 0x0F);
//x64Gen_writeU8(x64GenContext, 0x10);
//x64Gen_writeU8(x64GenContext, 0x05+(xmmRegister&7)*8);
//x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_movupd_memReg128_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
// write two doubles from xmm register to memory
// MOVUPD [<reg>+<imm>], <xmm>
if( memRegister == REG_ESP )
{
// todo: Short form of instruction if memImmU32 is 0 or in -128 to 127 range
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegister, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x11);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegister&7)*8);
x64Gen_writeU8(x64GenContext, 0xE4);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else if( memRegister == REG_NONE )
{
assert_dbg();
//x64Gen_writeU8(x64GenContext, 0x66);
//x64Gen_writeU8(x64GenContext, 0x0F);
//x64Gen_writeU8(x64GenContext, 0x11);
//x64Gen_writeU8(x64GenContext, 0x05+(xmmRegister&7)*8);
//x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_movddup_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE3
// move one double from memory into lower and upper half of a xmm register
if( memRegister == REG_RSP )
{
// MOVDDUP <xmm>, [<reg>+<imm>]
// todo: Short form of instruction if memImmU32 is 0 or in -128 to 127 range
x64Gen_writeU8(x64GenContext, 0xF2);
if( xmmRegister >= 8 )
x64Gen_writeU8(x64GenContext, 0x44);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x12);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegister&7)*8);
x64Gen_writeU8(x64GenContext, 0xE4);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else if( memRegister == REG_R15 )
{
// MOVDDUP <xmm>, [<reg>+<imm>]
// todo: Short form of instruction if memImmU32 is 0 or in -128 to 127 range
// F2 41 0F 12 87 - 44 33 22 11
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegister, true);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x12);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegister&7)*8);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else if( memRegister == REG_NONE )
{
// MOVDDUP <xmm>, [<imm>]
// 36 F2 0F 12 05 - 00 00 00 00
assert_dbg();
//x64Gen_writeU8(x64GenContext, 0x36);
//x64Gen_writeU8(x64GenContext, 0xF2);
//x64Gen_writeU8(x64GenContext, 0x0F);
//x64Gen_writeU8(x64GenContext, 0x12);
//x64Gen_writeU8(x64GenContext, 0x05+(xmmRegister&7)*8);
//x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_movddup_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE3
// move low double from xmm register into lower and upper half of a different xmm register
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x12);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_movhlps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE1
// move high double of source xmm register into lower half of destination xmm register, upper half unchanged
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x12);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_movsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// move lower double from xmm register into lower half of a different xmm register, leave other half untouched
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x10); // alternative encoding: 0x11, src and dest exchanged
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_movsd_memReg64_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
// move lower 64bits (double) of xmm register to memory location
if( memRegister == REG_NONE )
{
// MOVSD [<imm>], <xmm>
// F2 0F 11 05 - 45 23 01 00
assert_dbg();
//x64Gen_writeU8(x64GenContext, 0xF2);
//x64Gen_genSSEVEXPrefix(x64GenContext, xmmRegister, 0, false);
//x64Gen_writeU8(x64GenContext, 0x0F);
//x64Gen_writeU8(x64GenContext, 0x11);
//x64Gen_writeU8(x64GenContext, 0x05+xmmRegister*8);
//x64Gen_writeU32(x64GenContext, memImmU32);
}
else if( memRegister == REG_RSP )
{
// MOVSD [RSP+<imm>], <xmm>
// F2 0F 11 84 24 - 33 22 11 00
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, 0, xmmRegister, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x11);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegister&7)*8);
x64Gen_writeU8(x64GenContext, 0x24);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_movlpd_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
// move one double from memory into lower half of a xmm register, leave upper half unchanged
if( memRegister == REG_NONE )
{
// MOVLPD <xmm>, [<imm>]
//x64Gen_writeU8(x64GenContext, 0x66);
//x64Gen_writeU8(x64GenContext, 0x0F);
//x64Gen_writeU8(x64GenContext, 0x12);
//x64Gen_writeU8(x64GenContext, 0x05+(xmmRegister&7)*8);
//x64Gen_writeU32(x64GenContext, memImmU32);
assert_dbg();
}
else if( memRegister == REG_RSP )
{
// MOVLPD <xmm>, [<reg64>+<imm>]
// 66 0F 12 84 24 - 33 22 11 00
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, 0, xmmRegister, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x12);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegister&7)*8);
x64Gen_writeU8(x64GenContext, 0x24);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_unpcklpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x14);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_unpckhpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x15);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_shufpd_xmmReg_xmmReg_imm8(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc, uint8 imm8)
{
// SSE2
// shuffled copy from source to destination; imm8 bit0 selects which qword of the destination becomes the low result, bit1 which qword of the source becomes the high result
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0xC6);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
x64Gen_writeU8(x64GenContext, imm8);
}
void x64Gen_addsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// add bottom double of two xmm registers, leave upper quadword unchanged
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false); // untested
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x58);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_addpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// add both doubles of two xmm registers
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x58);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_subsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// subtract bottom double of two xmm registers, leave upper quadword unchanged
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5C);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_subpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// subtract both doubles of two xmm registers
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false); // untested
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5C);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_mulsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// multiply bottom double of two xmm registers, leave upper quadword unchanged
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x59);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_mulpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// multiply both doubles of two xmm registers
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false); // untested
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x59);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_mulpd_xmmReg_memReg128(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
if (memRegister == REG_NONE)
{
assert_dbg();
}
else if (memRegister == REG_R14)
{
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_writeU8(x64GenContext, (xmmRegister < 8) ? 0x41 : 0x45);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x59);
x64Gen_writeU8(x64GenContext, 0x86 + (xmmRegister & 7) * 8);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_divsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// divide bottom double of two xmm registers, leave upper quadword unchanged
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5E);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_divpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// divide bottom and top double of two xmm registers
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5E);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_comisd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// compare bottom doubles (ordered compare, raises #IA on any NaN)
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false); // untested
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2F);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_comisd_xmmReg_mem64Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 memoryReg, sint32 memImmS32)
{
// SSE2
// compare bottom double with double from memory location
if( memoryReg == REG_R15 )
{
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegisterDest, true);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2F);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegisterDest&7)*8);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
assert_dbg();
}
void x64Gen_ucomisd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// compare bottom doubles (unordered compare, raises #IA only on signaling NaN)
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2E);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_comiss_xmmReg_mem64Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 memoryReg, sint32 memImmS32)
{
// SSE1
// compare bottom float with float from memory location
if (memoryReg == REG_R15)
{
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegisterDest, true);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2F);
x64Gen_writeU8(x64GenContext, 0x87 + (xmmRegisterDest & 7) * 8);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
assert_dbg();
}
void x64Gen_orps_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32)
{
// SSE1
// or xmm register with 128 bit value from memory
if( memReg == REG_R15 )
{
x64Gen_genSSEVEXPrefix2(x64GenContext, memReg, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x56);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegisterDest&7)*8);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
assert_dbg();
}
void x64Gen_xorps_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32)
{
// SSE1
// xor xmm register with 128 bit value from memory
if( memReg == REG_R15 )
{
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegisterDest, true); // todo: should be x64Gen_genSSEVEXPrefix2() with memReg?
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x57);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegisterDest&7)*8);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
assert_dbg();
}
void x64Gen_andpd_xmmReg_memReg128(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE2
if (memRegister == REG_NONE)
{
assert_dbg();
}
else if (memRegister == REG_R14)
{
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_writeU8(x64GenContext, (xmmRegister < 8) ? 0x41 : 0x45);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x54);
x64Gen_writeU8(x64GenContext, 0x86 + (xmmRegister & 7) * 8);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_andps_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32)
{
// SSE1
// and xmm register with 128 bit value from memory
if( memReg == REG_R15 )
{
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegisterDest, true); // todo: should be x64Gen_genSSEVEXPrefix2() with memReg?
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x54);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegisterDest&7)*8);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
assert_dbg();
}
void x64Gen_andps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE1
// and xmm register with xmm register
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x54);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_pcmpeqd_xmmReg_mem128Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, uint32 memReg, uint32 memImmS32)
{
// SSE2
// doubleword integer compare
if( memReg == REG_R15 )
{
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegisterDest, true);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x76);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegisterDest&7)*8);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
assert_dbg();
}
void x64Gen_cvttpd2dq_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// convert two doubles into two 32-bit integers in bottom part of xmm register, reset upper 64 bits of destination register
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0xE6);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvttsd2si_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc)
{
// SSE2
// convert double to truncated integer in general purpose register
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, registerDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2C);
x64Gen_writeU8(x64GenContext, 0xC0+(registerDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvtsd2ss_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// converts bottom 64bit double to bottom 32bit single
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5A);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvtpd2ps_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// converts two 64bit doubles to two 32bit singles in bottom half of register
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5A);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvtps2pd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// converts two 32bit singles to two 64bit doubles
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5A);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvtss2sd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// converts bottom 32bit single to bottom 64bit double
x64Gen_writeU8(x64GenContext, 0xF3);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x5A);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvtpi2pd_xmmReg_mem64Reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 memReg, sint32 memImmS32)
{
// SSE2
// converts two signed 32bit integers to two doubles
if( memReg == REG_RSP )
{
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix1(x64GenContext, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2A);
x64Gen_writeU8(x64GenContext, 0x84+(xmmRegisterDest&7)*8);
x64Gen_writeU8(x64GenContext, 0x24);
x64Gen_writeU32(x64GenContext, (uint32)memImmS32);
}
else
{
assert_dbg();
}
}
void x64Gen_cvtsd2si_reg64Low_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc)
{
// SSE2
// converts bottom 64bit double to 32bit signed integer in general purpose register, rounded according to the current MXCSR rounding mode
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, registerDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2D);
x64Gen_writeU8(x64GenContext, 0xC0+(registerDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_cvttsd2si_reg64Low_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc)
{
// SSE2
// converts bottom 64bit double to 32bit signed integer in general purpose register, always truncate
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, registerDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x2C);
x64Gen_writeU8(x64GenContext, 0xC0+(registerDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_sqrtsd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// calculates square root of bottom double
x64Gen_writeU8(x64GenContext, 0xF2);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x51);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_sqrtpd_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE2
// calculates square root of bottom and top double
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x51);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_rcpss_xmmReg_xmmReg(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 xmmRegisterSrc)
{
// SSE1
// approximates reciprocal of bottom 32bit single
x64Gen_writeU8(x64GenContext, 0xF3);
x64Gen_genSSEVEXPrefix2(x64GenContext, xmmRegisterSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x53);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(xmmRegisterSrc&7));
}
void x64Gen_mulss_xmmReg_memReg64(x64GenContext_t* x64GenContext, sint32 xmmRegister, sint32 memRegister, uint32 memImmU32)
{
// SSE1
if( memRegister == REG_NONE )
{
assert_dbg();
}
else if( memRegister == REG_R15 )
{
x64Gen_writeU8(x64GenContext, 0xF3);
x64Gen_writeU8(x64GenContext, (xmmRegister<8)?0x41:0x45);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x59);
x64Gen_writeU8(x64GenContext, 0x87+(xmmRegister&7)*8);
x64Gen_writeU32(x64GenContext, memImmU32);
}
else
{
assert_dbg();
}
}
void x64Gen_movd_xmmReg_reg64Low32(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 registerSrc)
{
// SSE2
// copy low 32bit of general purpose register into xmm register
// MOVD <xmm>, <reg32>
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, registerSrc, xmmRegisterDest, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x6E); // alternative encoding: 0x7E, source and destination operands are exchanged
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(registerSrc&7));
}
void x64Gen_movd_reg64Low32_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDest, sint32 xmmRegisterSrc)
{
// SSE2
// copy low 32bit of xmm register into general purpose register
// MOVD <reg32>, <xmm>
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, registerDest, xmmRegisterSrc, false);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x7E); // alternative encoding: 0x6E, source and destination operands are exchanged
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterSrc&7)*8+(registerDest&7));
}
void x64Gen_movq_xmmReg_reg64(x64GenContext_t* x64GenContext, sint32 xmmRegisterDest, sint32 registerSrc)
{
// SSE2
// copy general purpose register into xmm register
// MOVQ <xmm>, <reg64>
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, registerSrc, xmmRegisterDest, true);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x6E); // alternative encoding: 0x7E, source and destination operands are exchanged
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterDest&7)*8+(registerSrc&7));
}
void x64Gen_movq_reg64_xmmReg(x64GenContext_t* x64GenContext, sint32 registerDst, sint32 xmmRegisterSrc)
{
// SSE2
// copy xmm register into general purpose register
// MOVQ <reg64>, <xmm>
x64Gen_writeU8(x64GenContext, 0x66);
x64Gen_genSSEVEXPrefix2(x64GenContext, registerDst, xmmRegisterSrc, true);
x64Gen_writeU8(x64GenContext, 0x0F);
x64Gen_writeU8(x64GenContext, 0x7E);
x64Gen_writeU8(x64GenContext, 0xC0+(xmmRegisterSrc&7)*8+(registerDst&7));
}
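// Usage sketch (ctx is a hypothetical x64GenContext_t*): x64Gen_movq_reg64_xmmReg(ctx, 0, 2)
// emits 66 48 0F 7E D0, i.e. MOVQ RAX, XMM2 (the REX.W form of the 0F 7E store direction).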

View file

@ -0,0 +1,360 @@
template<uint8 op0, bool rex64Bit = false>
class x64_opc_1byte
{
public:
static void emitBytes(x64GenContext_t* x64GenContext)
{
// write out op0
x64Gen_writeU8(x64GenContext, op0);
}
static constexpr bool isRevOrder()
{
return false;
}
static constexpr bool hasRex64BitPrefix()
{
return rex64Bit;
}
};
template<uint8 op0, bool rex64Bit = false>
class x64_opc_1byte_rev
{
public:
static void emitBytes(x64GenContext_t* x64GenContext)
{
// write out op0
x64Gen_writeU8(x64GenContext, op0);
}
static constexpr bool isRevOrder()
{
return true;
}
static constexpr bool hasRex64BitPrefix()
{
return rex64Bit;
}
};
template<uint8 op0, uint8 op1, bool rex64Bit = false>
class x64_opc_2byte
{
public:
static void emitBytes(x64GenContext_t* x64GenContext)
{
x64Gen_writeU8(x64GenContext, op0);
x64Gen_writeU8(x64GenContext, op1);
}
static constexpr bool isRevOrder()
{
return false;
}
static constexpr bool hasRex64BitPrefix()
{
return rex64Bit;
}
};
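// Example (hedged): x64_opc_1byte<0x8B> would describe the load form MOV r32, r/m32 and
// x64_opc_1byte<0x8B, true> its REX.W 64-bit variant, while x64_opc_1byte_rev fits
// store-direction opcodes such as 0x89 where the MODRM operand roles are reversed
// relative to the argument order.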
enum class MODRM_OPR_TYPE
{
REG,
MEM
};
class x64MODRM_opr_reg64
{
public:
x64MODRM_opr_reg64(uint8 reg)
{
this->reg = reg;
}
static constexpr MODRM_OPR_TYPE getType()
{
return MODRM_OPR_TYPE::REG;
}
const uint8 getReg() const
{
return reg;
}
private:
uint8 reg;
};
class x64MODRM_opr_memReg64
{
public:
x64MODRM_opr_memReg64(uint8 reg)
{
this->reg = reg;
this->offset = 0;
}
x64MODRM_opr_memReg64(uint8 reg, sint32 offset)
{
this->reg = reg;
this->offset = offset;
}
static constexpr MODRM_OPR_TYPE getType()
{
return MODRM_OPR_TYPE::MEM;
}
const uint8 getBaseReg() const
{
return reg;
}
const uint32 getOffset() const
{
return (uint32)offset;
}
static constexpr bool hasBaseReg()
{
return true;
}
static constexpr bool hasIndexReg()
{
return false;
}
private:
uint8 reg;
sint32 offset;
};
class x64MODRM_opr_memRegPlusReg
{
public:
x64MODRM_opr_memRegPlusReg(uint8 regBase, uint8 regIndex)
{
if ((regIndex & 7) == 4)
{
// cant encode RSP/R12 in index register, switch with base register
// this only works if the scaler is 1
std::swap(regBase, regIndex);
cemu_assert((regIndex & 7) != 4); // the register swapped into the index slot must not be RSP/R12 either
}
this->regBase = regBase;
this->regIndex = regIndex;
this->offset = 0;
}
x64MODRM_opr_memRegPlusReg(uint8 regBase, uint8 regIndex, sint32 offset)
{
if ((regIndex & 7) == 4)
{
std::swap(regBase, regIndex);
cemu_assert((regIndex & 7) != 4);
}
this->regBase = regBase;
this->regIndex = regIndex;
this->offset = offset;
}
static constexpr MODRM_OPR_TYPE getType()
{
return MODRM_OPR_TYPE::MEM;
}
const uint8 getBaseReg() const
{
return regBase;
}
const uint8 getIndexReg() const
{
return regIndex;
}
const uint32 getOffset() const
{
return (uint32)offset;
}
static constexpr bool hasBaseReg()
{
return true;
}
static constexpr bool hasIndexReg()
{
return true;
}
private:
uint8 regBase;
uint8 regIndex; // multiplied by scaler which is fixed to 1
sint32 offset;
};
template<class opcodeBytes, typename TA, typename TB>
void _x64Gen_writeMODRM_internal(x64GenContext_t* x64GenContext, TA opA, TB opB)
{
static_assert(TA::getType() == MODRM_OPR_TYPE::REG);
x64Gen_checkBuffer(x64GenContext);
// REX prefix
// 0100 WRXB
if constexpr (TA::getType() == MODRM_OPR_TYPE::REG && TB::getType() == MODRM_OPR_TYPE::REG)
{
if (opA.getReg() & 8 || opB.getReg() & 8 || opcodeBytes::hasRex64BitPrefix())
{
// opA -> REX.R
// opB -> REX.B
x64Gen_writeU8(x64GenContext, 0x40 | ((opA.getReg() & 8) ? (1 << 2) : 0) | ((opB.getReg() & 8) ? (1 << 0) : 0) | (opcodeBytes::hasRex64BitPrefix() ? (1 << 3) : 0));
}
}
else if constexpr (TA::getType() == MODRM_OPR_TYPE::REG && TB::getType() == MODRM_OPR_TYPE::MEM)
{
if constexpr (opB.hasBaseReg() && opB.hasIndexReg())
{
if (opA.getReg() & 8 || opB.getBaseReg() & 8 || opB.getIndexReg() & 8 || opcodeBytes::hasRex64BitPrefix())
{
// opA -> REX.R
// baseReg -> REX.B
// indexReg -> REX.X
x64Gen_writeU8(x64GenContext, 0x40 | ((opA.getReg() & 8) ? (1 << 2) : 0) | ((opB.getBaseReg() & 8) ? (1 << 0) : 0) | ((opB.getIndexReg() & 8) ? (1 << 1) : 0) | (opcodeBytes::hasRex64BitPrefix() ? (1 << 3) : 0));
}
}
else if constexpr (opB.hasBaseReg())
{
if (opA.getReg() & 8 || opB.getBaseReg() & 8 || opcodeBytes::hasRex64BitPrefix())
{
// opA -> REX.R
// baseReg -> REX.B
x64Gen_writeU8(x64GenContext, 0x40 | ((opA.getReg() & 8) ? (1 << 2) : 0) | ((opB.getBaseReg() & 8) ? (1 << 0) : 0) | (opcodeBytes::hasRex64BitPrefix() ? (1 << 3) : 0));
}
}
else
{
if (opA.getReg() & 8 || opcodeBytes::hasRex64BitPrefix())
{
// todo - verify
// opA -> REX.R
x64Gen_writeU8(x64GenContext, 0x40 | ((opA.getReg() & 8) ? (1 << 2) : 0) | (opcodeBytes::hasRex64BitPrefix() ? (1 << 3) : 0));
}
}
}
// opcode
opcodeBytes::emitBytes(x64GenContext);
// modrm byte
if constexpr (TA::getType() == MODRM_OPR_TYPE::REG && TB::getType() == MODRM_OPR_TYPE::REG)
{
// reg, reg
x64Gen_writeU8(x64GenContext, 0xC0 + (opB.getReg() & 7) + ((opA.getReg() & 7) << 3));
}
else if constexpr (TA::getType() == MODRM_OPR_TYPE::REG && TB::getType() == MODRM_OPR_TYPE::MEM)
{
if constexpr (TB::hasBaseReg() == false) // todo - also check for index reg and secondary sib reg
{
// form: [offset]
// instruction is just offset
cemu_assert(false);
}
else if constexpr (TB::hasIndexReg())
{
// form: [base+index*scaler+offset], scaler is currently fixed to 1
cemu_assert((opB.getIndexReg() & 7) != 4); // RSP not allowed as index register
const uint32 offset = opB.getOffset();
if (offset == 0 && (opB.getBaseReg() & 7) != 5) // RBP/R13 has special meaning in no-offset encoding
{
// [form: index*1+base]
x64Gen_writeU8(x64GenContext, 0x00 + (4) + ((opA.getReg() & 7) << 3));
// SIB byte
x64Gen_writeU8(x64GenContext, ((opB.getIndexReg()&7) << 3) + (opB.getBaseReg() & 7));
}
else if (offset == (uint32)(sint32)(sint8)offset)
{
// [form: index*1+base+sbyte]
x64Gen_writeU8(x64GenContext, 0x40 + (4) + ((opA.getReg() & 7) << 3));
// SIB byte
x64Gen_writeU8(x64GenContext, ((opB.getIndexReg() & 7) << 3) + (opB.getBaseReg() & 7));
x64Gen_writeU8(x64GenContext, (uint8)offset);
}
else
{
// [form: index*1+base+sdword]
x64Gen_writeU8(x64GenContext, 0x80 + (4) + ((opA.getReg() & 7) << 3));
// SIB byte
x64Gen_writeU8(x64GenContext, ((opB.getIndexReg() & 7) << 3) + (opB.getBaseReg() & 7));
x64Gen_writeU32(x64GenContext, (uint32)offset);
}
}
else
{
// form: [baseReg + offset]
const uint32 offset = opB.getOffset();
if (offset == 0 && (opB.getBaseReg() & 7) != 5) // RBP/R13 has special meaning in no-offset encoding
{
// form: [baseReg]
// if base reg is RSP/R12 we need to use SIB form of instruction
if ((opB.getBaseReg() & 7) == 4)
{
x64Gen_writeU8(x64GenContext, 0x00 + (4) + ((opA.getReg() & 7) << 3));
// SIB byte [form: none*1+base]
x64Gen_writeU8(x64GenContext, (4 << 3) + (opB.getBaseReg() & 7));
}
else
{
x64Gen_writeU8(x64GenContext, 0x00 + (opB.getBaseReg() & 7) + ((opA.getReg() & 7) << 3));
}
}
else if (offset == (uint32)(sint32)(sint8)offset)
{
// form: [baseReg+sbyte]
// if base reg is RSP/R12 we need to use SIB form of instruction
if ((opB.getBaseReg() & 7) == 4)
{
x64Gen_writeU8(x64GenContext, 0x40 + (4) + ((opA.getReg() & 7) << 3));
// SIB byte [form: none*1+base]
x64Gen_writeU8(x64GenContext, (4 << 3) + (opB.getBaseReg() & 7));
}
else
{
x64Gen_writeU8(x64GenContext, 0x40 + (opB.getBaseReg() & 7) + ((opA.getReg() & 7) << 3));
}
x64Gen_writeU8(x64GenContext, (uint8)offset);
}
else
{
// form: [baseReg+sdword]
// if base reg is RSP/R12 we need to use SIB form of instruction
if ((opB.getBaseReg() & 7) == 4)
{
x64Gen_writeU8(x64GenContext, 0x80 + (4) + ((opA.getReg() & 7) << 3));
// SIB byte [form: none*1+base]
x64Gen_writeU8(x64GenContext, (4 << 3) + (opB.getBaseReg() & 7));
}
else
{
x64Gen_writeU8(x64GenContext, 0x80 + (opB.getBaseReg() & 7) + ((opA.getReg() & 7) << 3));
}
x64Gen_writeU32(x64GenContext, (uint32)offset);
}
}
}
else
{
assert_dbg();
}
}
template<class opcodeBytes, typename TA, typename TB>
void x64Gen_writeMODRM_dyn(x64GenContext_t* x64GenContext, TA opLeft, TB opRight)
{
if constexpr (opcodeBytes::isRevOrder())
_x64Gen_writeMODRM_internal<opcodeBytes, TB, TA>(x64GenContext, opRight, opLeft);
else
_x64Gen_writeMODRM_internal<opcodeBytes, TA, TB>(x64GenContext, opLeft, opRight);
}
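// Usage sketch (hedged; assumes REG_RAX=0 and REG_RBP=5 as register constants):
// x64Gen_writeMODRM_dyn<x64_opc_1byte<0x03, true>>(ctx, x64MODRM_opr_reg64(REG_RAX), x64MODRM_opr_memReg64(REG_RBP, 0x10));
// would emit 48 03 45 10, i.e. ADD RAX, [RBP+0x10] (REX.W, opcode 03, MODRM with 8-bit displacement).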