#include "Cafe/HW/Espresso/PPCState.h" #include "Cafe/HW/Espresso/Interpreter/PPCInterpreterInternal.h" #include "Cafe/HW/Espresso/Interpreter/PPCInterpreterHelper.h" #include "../PPCRecompiler.h" #include "../PPCRecompilerIml.h" #include "BackendX64.h" #include "Cafe/OS/libs/coreinit/coreinit_Time.h" #include "util/MemMapper/MemMapper.h" #include "Common/cpu_features.h" static x86Assembler64::GPR32 _reg32(IMLReg physReg) { cemu_assert_debug(physReg.GetRegFormat() == IMLRegFormat::I32); IMLRegID regId = physReg.GetRegID(); cemu_assert_debug(regId < 16); return (x86Assembler64::GPR32)regId; } static uint32 _reg64(IMLReg physReg) { cemu_assert_debug(physReg.GetRegFormat() == IMLRegFormat::I64); IMLRegID regId = physReg.GetRegID(); cemu_assert_debug(regId < 16); return regId; } uint32 _regF64(IMLReg physReg) { cemu_assert_debug(physReg.GetRegFormat() == IMLRegFormat::F64); IMLRegID regId = physReg.GetRegID(); cemu_assert_debug(regId >= IMLArchX86::PHYSREG_FPR_BASE && regId < IMLArchX86::PHYSREG_FPR_BASE+16); regId -= IMLArchX86::PHYSREG_FPR_BASE; return regId; } static x86Assembler64::GPR8_REX _reg8(IMLReg physReg) { cemu_assert_debug(physReg.GetRegFormat() == IMLRegFormat::I32); // for now these are represented as 32bit return (x86Assembler64::GPR8_REX)physReg.GetRegID(); } static x86Assembler64::GPR32 _reg32_from_reg8(x86Assembler64::GPR8_REX regId) { return (x86Assembler64::GPR32)regId; } static x86Assembler64::GPR8_REX _reg8_from_reg32(x86Assembler64::GPR32 regId) { return (x86Assembler64::GPR8_REX)regId; } static x86Assembler64::GPR8_REX _reg8_from_reg64(uint32 regId) { return (x86Assembler64::GPR8_REX)regId; } static x86Assembler64::GPR64 _reg64_from_reg32(x86Assembler64::GPR32 regId) { return (x86Assembler64::GPR64)regId; } X86Cond _x86Cond(IMLCondition imlCond) { switch (imlCond) { case IMLCondition::EQ: return X86_CONDITION_Z; case IMLCondition::NEQ: return X86_CONDITION_NZ; case IMLCondition::UNSIGNED_GT: return X86_CONDITION_NBE; case IMLCondition::UNSIGNED_LT: return X86_CONDITION_B; case IMLCondition::SIGNED_GT: return X86_CONDITION_NLE; case IMLCondition::SIGNED_LT: return X86_CONDITION_L; default: break; } cemu_assert_suspicious(); return X86_CONDITION_Z; } /* * Remember current instruction output offset for reloc * The instruction generated after this method has been called will be adjusted */ void PPCRecompilerX64Gen_rememberRelocatableOffset(x64GenContext_t* x64GenContext, void* extraInfo = nullptr) { x64GenContext->relocateOffsetTable2.emplace_back(x64GenContext->emitter->GetWriteIndex(), extraInfo); } void PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext_t* x64GenContext, sint32 jumpInstructionOffset, sint32 destinationOffset) { uint8* instructionData = x64GenContext->emitter->GetBufferPtr() + jumpInstructionOffset; if (instructionData[0] == 0x0F && (instructionData[1] >= 0x80 && instructionData[1] <= 0x8F)) { // far conditional jump *(uint32*)(instructionData + 2) = (destinationOffset - (jumpInstructionOffset + 6)); } else if (instructionData[0] >= 0x70 && instructionData[0] <= 0x7F) { // short conditional jump sint32 distance = (sint32)((destinationOffset - (jumpInstructionOffset + 2))); cemu_assert_debug(distance >= -128 && distance <= 127); *(uint8*)(instructionData + 1) = (uint8)distance; } else if (instructionData[0] == 0xE9) { *(uint32*)(instructionData + 1) = (destinationOffset - (jumpInstructionOffset + 5)); } else if (instructionData[0] == 0xEB) { sint32 distance = (sint32)((destinationOffset - (jumpInstructionOffset + 2))); cemu_assert_debug(distance >= 
-128 && distance <= 127); *(uint8*)(instructionData + 1) = (uint8)distance; } else { assert_dbg(); } } void* ATTR_MS_ABI PPCRecompiler_virtualHLE(PPCInterpreter_t* hCPU, uint32 hleFuncId) { void* prevRSPTemp = hCPU->rspTemp; if( hleFuncId == 0xFFD0 ) { hCPU->remainingCycles -= 500; // let subtract about 500 cycles for each HLE call hCPU->gpr[3] = 0; PPCInterpreter_nextInstruction(hCPU); return hCPU; } else { auto hleCall = PPCInterpreter_getHLECall(hleFuncId); cemu_assert(hleCall != nullptr); hleCall(hCPU); } hCPU->rspTemp = prevRSPTemp; return PPCInterpreter_getCurrentInstance(); } void ATTR_MS_ABI PPCRecompiler_getTBL(PPCInterpreter_t* hCPU, uint32 gprIndex) { uint64 coreTime = coreinit::OSGetSystemTime(); hCPU->gpr[gprIndex] = (uint32)(coreTime&0xFFFFFFFF); } void ATTR_MS_ABI PPCRecompiler_getTBU(PPCInterpreter_t* hCPU, uint32 gprIndex) { uint64 coreTime = coreinit::OSGetSystemTime(); hCPU->gpr[gprIndex] = (uint32)((coreTime>>32)&0xFFFFFFFF); } bool PPCRecompilerX64Gen_imlInstruction_macro(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { if (imlInstruction->operation == PPCREC_IML_MACRO_B_TO_REG) { uint32 branchDstReg = _reg32(imlInstruction->op_macro.paramReg); if(X86_REG_RDX != branchDstReg) x64Gen_mov_reg64_reg64(x64GenContext, X86_REG_RDX, branchDstReg); // potential optimization: Use branchDstReg directly if possible instead of moving to RDX/EDX // JMP [offset+RDX*(8/4)+R15] x64Gen_writeU8(x64GenContext, 0x41); x64Gen_writeU8(x64GenContext, 0xFF); x64Gen_writeU8(x64GenContext, 0xA4); x64Gen_writeU8(x64GenContext, 0x57); x64Gen_writeU32(x64GenContext, (uint32)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable)); return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_BL ) { // MOV DWORD [SPR_LinkRegister], newLR uint32 newLR = imlInstruction->op_macro.param + 4; x64Gen_mov_mem32Reg64_imm32(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.LR), newLR); // remember new instruction pointer in RDX uint32 newIP = imlInstruction->op_macro.param2; x64Gen_mov_reg64Low32_imm32(x64GenContext, X86_REG_RDX, newIP); // since RDX is constant we can use JMP [R15+const_offset] if jumpTableOffset+RDX*2 does not exceed the 2GB boundary uint64 lookupOffset = (uint64)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable) + (uint64)newIP * 2ULL; if (lookupOffset >= 0x80000000ULL) { // JMP [offset+RDX*(8/4)+R15] x64Gen_writeU8(x64GenContext, 0x41); x64Gen_writeU8(x64GenContext, 0xFF); x64Gen_writeU8(x64GenContext, 0xA4); x64Gen_writeU8(x64GenContext, 0x57); x64Gen_writeU32(x64GenContext, (uint32)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable)); } else { x64Gen_writeU8(x64GenContext, 0x41); x64Gen_writeU8(x64GenContext, 0xFF); x64Gen_writeU8(x64GenContext, 0xA7); x64Gen_writeU32(x64GenContext, (uint32)lookupOffset); } return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_B_FAR ) { // remember new instruction pointer in RDX uint32 newIP = imlInstruction->op_macro.param2; x64Gen_mov_reg64Low32_imm32(x64GenContext, X86_REG_RDX, newIP); // Since RDX is constant we can use JMP [R15+const_offset] if jumpTableOffset+RDX*2 does not exceed the 2GB boundary uint64 lookupOffset = (uint64)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable) + (uint64)newIP * 2ULL; if (lookupOffset >= 0x80000000ULL) { // JMP [offset+RDX*(8/4)+R15] x64Gen_writeU8(x64GenContext, 0x41); x64Gen_writeU8(x64GenContext, 0xFF); 
x64Gen_writeU8(x64GenContext, 0xA4); x64Gen_writeU8(x64GenContext, 0x57); x64Gen_writeU32(x64GenContext, (uint32)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable)); } else { x64Gen_writeU8(x64GenContext, 0x41); x64Gen_writeU8(x64GenContext, 0xFF); x64Gen_writeU8(x64GenContext, 0xA7); x64Gen_writeU32(x64GenContext, (uint32)lookupOffset); } return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_LEAVE ) { uint32 currentInstructionAddress = imlInstruction->op_macro.param; // remember PC value in REG_EDX x64Gen_mov_reg64Low32_imm32(x64GenContext, X86_REG_RDX, currentInstructionAddress); uint32 newIP = 0; // special value for recompiler exit uint64 lookupOffset = (uint64)&(((PPCRecompilerInstanceData_t*)NULL)->ppcRecompilerDirectJumpTable) + (uint64)newIP * 2ULL; // JMP [R15+offset] x64Gen_writeU8(x64GenContext, 0x41); x64Gen_writeU8(x64GenContext, 0xFF); x64Gen_writeU8(x64GenContext, 0xA7); x64Gen_writeU32(x64GenContext, (uint32)lookupOffset); return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_DEBUGBREAK ) { x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, imlInstruction->op_macro.param2); x64Gen_int3(x64GenContext); return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_COUNT_CYCLES ) { uint32 cycleCount = imlInstruction->op_macro.param; x64Gen_sub_mem32reg64_imm32(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), cycleCount); return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_HLE ) { uint32 ppcAddress = imlInstruction->op_macro.param; uint32 funcId = imlInstruction->op_macro.param2; //x64Gen_int3(x64GenContext); // update instruction pointer x64Gen_mov_mem32Reg64_imm32(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, instructionPointer), ppcAddress); //// save hCPU (RSP) //x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)&ppcRecompilerX64_hCPUTemp); //x64Emit_mov_mem64_reg64(x64GenContext, REG_RESV_TEMP, 0, REG_RSP); // set parameters x64Gen_mov_reg64_reg64(x64GenContext, X86_REG_RCX, X86_REG_RSP); x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RDX, funcId); // restore stackpointer from executionContext/hCPU->rspTemp x64Emit_mov_reg64_mem64(x64GenContext, X86_REG_RSP, REG_RESV_HCPU, offsetof(PPCInterpreter_t, rspTemp)); //x64Emit_mov_reg64_mem64(x64GenContext, REG_RSP, REG_R14, 0); //x64Gen_int3(x64GenContext); // reserve space on stack for call parameters x64Gen_sub_reg64_imm32(x64GenContext, X86_REG_RSP, 8*11); // must be uneven number in order to retain stack 0x10 alignment x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RBP, 0); // call HLE function x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RAX, (uint64)PPCRecompiler_virtualHLE); x64Gen_call_reg64(x64GenContext, X86_REG_RAX); // restore RSP to hCPU (from RAX, result of PPCRecompiler_virtualHLE) //x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (uint64)&ppcRecompilerX64_hCPUTemp); //x64Emit_mov_reg64_mem64Reg64(x64GenContext, REG_RSP, REG_RESV_TEMP, 0); x64Gen_mov_reg64_reg64(x64GenContext, X86_REG_RSP, X86_REG_RAX); // MOV R15, ppcRecompilerInstanceData x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_R15, (uint64)ppcRecompilerInstanceData); // MOV R13, memory_base x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_R13, (uint64)memory_base); // check if cycles where decreased beyond zero, if yes -> leave recompiler x64Gen_bt_mem8(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), 31); // check if negative sint32 jumpInstructionOffset1 = x64GenContext->emitter->GetWriteIndex(); 
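// CF now holds the sign bit of remainingCycles; the NOT_CARRY jump below skips the recompiler-exit path as long as cycles remain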
x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NOT_CARRY, 0); //x64Gen_int3(x64GenContext); //x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RDX, ppcAddress); x64Emit_mov_reg64_mem32(x64GenContext, X86_REG_RDX, X86_REG_RSP, offsetof(PPCInterpreter_t, instructionPointer)); // set EAX to 0 (we assume that ppcRecompilerDirectJumpTable[0] will be a recompiler escape function) x64Gen_xor_reg32_reg32(x64GenContext, X86_REG_RAX, X86_REG_RAX); // ADD RAX, R15 (R15 -> Pointer to ppcRecompilerInstanceData x64Gen_add_reg64_reg64(x64GenContext, X86_REG_RAX, X86_REG_R15); //// JMP [recompilerCallTable+EAX/4*8] //x64Gen_int3(x64GenContext); x64Gen_jmp_memReg64(x64GenContext, X86_REG_RAX, (uint32)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable)); PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1, x64GenContext->emitter->GetWriteIndex()); // check if instruction pointer was changed // assign new instruction pointer to EAX x64Emit_mov_reg64_mem32(x64GenContext, X86_REG_RAX, X86_REG_RSP, offsetof(PPCInterpreter_t, instructionPointer)); // remember instruction pointer in REG_EDX x64Gen_mov_reg64_reg64(x64GenContext, X86_REG_RDX, X86_REG_RAX); // EAX *= 2 x64Gen_add_reg64_reg64(x64GenContext, X86_REG_RAX, X86_REG_RAX); // ADD RAX, R15 (R15 -> Pointer to ppcRecompilerInstanceData x64Gen_add_reg64_reg64(x64GenContext, X86_REG_RAX, X86_REG_R15); // JMP [ppcRecompilerDirectJumpTable+RAX/4*8] x64Gen_jmp_memReg64(x64GenContext, X86_REG_RAX, (uint32)offsetof(PPCRecompilerInstanceData_t, ppcRecompilerDirectJumpTable)); return true; } else if( imlInstruction->operation == PPCREC_IML_MACRO_MFTB ) { // according to MS ABI the caller needs to save: // RAX, RCX, RDX, R8, R9, R10, R11 uint32 ppcAddress = imlInstruction->op_macro.param; uint32 sprId = imlInstruction->op_macro.param2&0xFFFF; uint32 gprIndex = (imlInstruction->op_macro.param2>>16)&0x1F; // update instruction pointer x64Gen_mov_mem32Reg64_imm32(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, instructionPointer), ppcAddress); // set parameters x64Gen_mov_reg64_reg64(x64GenContext, X86_REG_RCX, X86_REG_RSP); x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RDX, gprIndex); // restore stackpointer to original RSP x64Emit_mov_reg64_mem64(x64GenContext, X86_REG_RSP, REG_RESV_HCPU, offsetof(PPCInterpreter_t, rspTemp)); // push hCPU on stack x64Gen_push_reg64(x64GenContext, X86_REG_RCX); // reserve space on stack for call parameters x64Gen_sub_reg64_imm32(x64GenContext, X86_REG_RSP, 8*11 + 8); x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RBP, 0); // call function if( sprId == SPR_TBL ) x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RAX, (uint64)PPCRecompiler_getTBL); else if( sprId == SPR_TBU ) x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_RAX, (uint64)PPCRecompiler_getTBU); else assert_dbg(); x64Gen_call_reg64(x64GenContext, X86_REG_RAX); // restore hCPU from stack x64Gen_add_reg64_imm32(x64GenContext, X86_REG_RSP, 8 * 11 + 8); x64Gen_pop_reg64(x64GenContext, X86_REG_RSP); // MOV R15, ppcRecompilerInstanceData x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_R15, (uint64)ppcRecompilerInstanceData); // MOV R13, memory_base x64Gen_mov_reg64_imm64(x64GenContext, X86_REG_R13, (uint64)memory_base); return true; } else { debug_printf("Unknown recompiler macro operation %d\n", imlInstruction->operation); assert_dbg(); } return false; } /* * Load from memory */ bool PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* 
imlInstruction, bool indexed) { cemu_assert_debug(imlInstruction->op_storeLoad.registerData.GetRegFormat() == IMLRegFormat::I32); cemu_assert_debug(imlInstruction->op_storeLoad.registerMem.GetRegFormat() == IMLRegFormat::I32); if (indexed) cemu_assert_debug(imlInstruction->op_storeLoad.registerMem2.GetRegFormat() == IMLRegFormat::I32); IMLRegID realRegisterData = imlInstruction->op_storeLoad.registerData.GetRegID(); IMLRegID realRegisterMem = imlInstruction->op_storeLoad.registerMem.GetRegID(); IMLRegID realRegisterMem2 = PPC_REC_INVALID_REGISTER; if( indexed ) realRegisterMem2 = imlInstruction->op_storeLoad.registerMem2.GetRegID(); if( indexed && realRegisterMem == realRegisterMem2 ) { return false; } if( indexed && realRegisterData == realRegisterMem2 ) { // for indexed memory access realRegisterData must not be the same register as the second memory register, // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2 sint32 temp = realRegisterMem; realRegisterMem = realRegisterMem2; realRegisterMem2 = temp; } bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; bool switchEndian = imlInstruction->op_storeLoad.flags2.swapEndian; if( imlInstruction->op_storeLoad.copyWidth == 32 ) { //if( indexed ) // PPCRecompilerX64Gen_crConditionFlags_forget(PPCRecFunction, ppcImlGenContext, x64GenContext); if (indexed) { x64Gen_lea_reg64Low32_reg64Low32PlusReg64Low32(x64GenContext, REG_RESV_TEMP, realRegisterMem, realRegisterMem2); } if( g_CPUFeatures.x86.movbe && switchEndian ) { if (indexed) { x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, X86_REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); //if (indexed && realRegisterMem != realRegisterData) // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else { x64Gen_movBEZeroExtend_reg64_mem32Reg64PlusReg64(x64GenContext, realRegisterData, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); } } else { if (indexed) { x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, X86_REG_R13, REG_RESV_TEMP, imlInstruction->op_storeLoad.immS32); //if (realRegisterMem != realRegisterData) // x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); if (switchEndian) x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); } else { x64Emit_mov_reg32_mem32(x64GenContext, realRegisterData, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); if (switchEndian) x64Gen_bswap_reg64Lower32bit(x64GenContext, realRegisterData); } } } else if( imlInstruction->op_storeLoad.copyWidth == 16 ) { if (indexed) { x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } if(g_CPUFeatures.x86.movbe && switchEndian ) { x64Gen_movBEZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); if( indexed && realRegisterMem != realRegisterData ) x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else { x64Gen_movZeroExtend_reg64Low16_mem16Reg64PlusReg64(x64GenContext, realRegisterData, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); if( indexed && realRegisterMem != realRegisterData ) x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); if( switchEndian ) x64Gen_rol_reg64Low16_imm8(x64GenContext, realRegisterData, 8); } if( signExtend ) x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, 
realRegisterData, realRegisterData); else x64Gen_movZeroExtend_reg64Low32_reg64Low16(x64GenContext, realRegisterData, realRegisterData); } else if( imlInstruction->op_storeLoad.copyWidth == 8 ) { if( indexed ) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); if( signExtend ) x64Gen_movSignExtend_reg64Low32_mem8Reg64PlusReg64(x64GenContext, realRegisterData, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); else x64Emit_movZX_reg32_mem8(x64GenContext, realRegisterData, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32); if( indexed && realRegisterMem != realRegisterData ) x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else return false; return true; } /* * Write to memory */ bool PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, bool indexed) { cemu_assert_debug(imlInstruction->op_storeLoad.registerData.GetRegFormat() == IMLRegFormat::I32); cemu_assert_debug(imlInstruction->op_storeLoad.registerMem.GetRegFormat() == IMLRegFormat::I32); if (indexed) cemu_assert_debug(imlInstruction->op_storeLoad.registerMem2.GetRegFormat() == IMLRegFormat::I32); IMLRegID realRegisterData = imlInstruction->op_storeLoad.registerData.GetRegID(); IMLRegID realRegisterMem = imlInstruction->op_storeLoad.registerMem.GetRegID(); IMLRegID realRegisterMem2 = PPC_REC_INVALID_REGISTER; if (indexed) realRegisterMem2 = imlInstruction->op_storeLoad.registerMem2.GetRegID(); if (indexed && realRegisterMem == realRegisterMem2) { return false; } if (indexed && realRegisterData == realRegisterMem2) { // for indexed memory access realRegisterData must not be the same register as the second memory register, // this can easily be fixed by swapping the logic of realRegisterMem and realRegisterMem2 sint32 temp = realRegisterMem; realRegisterMem = realRegisterMem2; realRegisterMem2 = temp; } bool signExtend = imlInstruction->op_storeLoad.flags2.signExtend; bool swapEndian = imlInstruction->op_storeLoad.flags2.swapEndian; if (imlInstruction->op_storeLoad.copyWidth == 32) { uint32 valueRegister; if ((swapEndian == false || g_CPUFeatures.x86.movbe) && realRegisterMem != realRegisterData) { valueRegister = realRegisterData; } else { x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); valueRegister = REG_RESV_TEMP; } if (!g_CPUFeatures.x86.movbe && swapEndian) x64Gen_bswap_reg64Lower32bit(x64GenContext, valueRegister); if (indexed) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); if (g_CPUFeatures.x86.movbe && swapEndian) x64Gen_movBETruncate_mem32Reg64PlusReg64_reg64(x64GenContext, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); else x64Gen_movTruncate_mem32Reg64PlusReg64_reg64(x64GenContext, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, valueRegister); if (indexed) x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else if (imlInstruction->op_storeLoad.copyWidth == 16) { x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); if (swapEndian) x64Gen_rol_reg64Low16_imm8(x64GenContext, REG_RESV_TEMP, 8); if (indexed) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_movTruncate_mem16Reg64PlusReg64_reg64(x64GenContext, X86_REG_R13, realRegisterMem, imlInstruction->op_storeLoad.immS32, REG_RESV_TEMP); if 
(indexed) x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); // todo: Optimize this, e.g. by using MOVBE } else if (imlInstruction->op_storeLoad.copyWidth == 8) { if (indexed && realRegisterMem == realRegisterData) { x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, realRegisterData); realRegisterData = REG_RESV_TEMP; } if (indexed) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); x64Gen_movTruncate_mem8Reg64PlusReg64_reg64(x64GenContext, REG_RESV_MEMBASE, realRegisterMem, imlInstruction->op_storeLoad.immS32, realRegisterData); if (indexed) x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, realRegisterMem, realRegisterMem2); } else return false; return true; } bool PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regBoolOut = _reg32_from_reg8(_reg8(imlInstruction->op_atomic_compare_store.regBoolOut)); auto regEA = _reg32(imlInstruction->op_atomic_compare_store.regEA); auto regVal = _reg32(imlInstruction->op_atomic_compare_store.regWriteValue); auto regCmp = _reg32(imlInstruction->op_atomic_compare_store.regCompareValue); // make sure non of the regs are in EAX if (regEA == X86_REG_EAX || regBoolOut == X86_REG_EAX || regVal == X86_REG_EAX || regCmp == X86_REG_EAX) { printf("x86: atomic_cmp_store cannot emit due to EAX already being in use\n"); return false; } x64GenContext->emitter->XCHG_qq(REG_RESV_TEMP, X86_REG_RAX); x64GenContext->emitter->MOV_dd(X86_REG_EAX, regCmp); x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regBoolOut), _reg32_from_reg8(regBoolOut)); // zero bytes unaffected by SETcc x64GenContext->emitter->LockPrefix(); x64GenContext->emitter->CMPXCHG_dd_l(REG_RESV_MEMBASE, 0, _reg64_from_reg32(regEA), 1, regVal); x64GenContext->emitter->SETcc_b(X86Cond::X86_CONDITION_Z, regBoolOut); x64GenContext->emitter->XCHG_qq(REG_RESV_TEMP, X86_REG_RAX); return true; } bool PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regR = _reg32(imlInstruction->op_r_r.regR); auto regA = _reg32(imlInstruction->op_r_r.regA); if (imlInstruction->operation == PPCREC_IML_OP_ASSIGN) { // registerResult = registerA if (regR != regA) x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, regR, regA); } else if (imlInstruction->operation == PPCREC_IML_OP_ENDIAN_SWAP) { if (regA != regR) x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, regR, regA); // if movbe is available we can move and swap in a single instruction? 
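// for now the endian swap is always emitted as a separate BSWAP after the register copy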
x64Gen_bswap_reg64Lower32bit(x64GenContext, regR); } else if( imlInstruction->operation == PPCREC_IML_OP_ASSIGN_S8_TO_S32 ) { x64Gen_movSignExtend_reg64Low32_reg64Low8(x64GenContext, regR, regA); } else if (imlInstruction->operation == PPCREC_IML_OP_ASSIGN_S16_TO_S32) { x64Gen_movSignExtend_reg64Low32_reg64Low16(x64GenContext, regR, reg32ToReg16(regA)); } else if( imlInstruction->operation == PPCREC_IML_OP_NOT ) { // copy register content if different registers if( regR != regA ) x64Gen_mov_reg64_reg64(x64GenContext, regR, regA); x64Gen_not_reg64Low32(x64GenContext, regR); } else if (imlInstruction->operation == PPCREC_IML_OP_NEG) { // copy register content if different registers if (regR != regA) x64Gen_mov_reg64_reg64(x64GenContext, regR, regA); x64Gen_neg_reg64Low32(x64GenContext, regR); } else if( imlInstruction->operation == PPCREC_IML_OP_CNTLZW ) { // count leading zeros // LZCNT instruction (part of SSE4, CPUID.80000001H:ECX.ABM[Bit 5]) if(g_CPUFeatures.x86.lzcnt) { x64Gen_lzcnt_reg64Low32_reg64Low32(x64GenContext, regR, regA); } else { x64Gen_test_reg64Low32_reg64Low32(x64GenContext, regA, regA); sint32 jumpInstructionOffset1 = x64GenContext->emitter->GetWriteIndex(); x64Gen_jmpc_near(x64GenContext, X86_CONDITION_EQUAL, 0); x64Gen_bsr_reg64Low32_reg64Low32(x64GenContext, regR, regA); x64Gen_neg_reg64Low32(x64GenContext, regR); x64Gen_add_reg64Low32_imm32(x64GenContext, regR, 32-1); sint32 jumpInstructionOffset2 = x64GenContext->emitter->GetWriteIndex(); x64Gen_jmpc_near(x64GenContext, X86_CONDITION_NONE, 0); PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset1, x64GenContext->emitter->GetWriteIndex()); x64Gen_mov_reg64Low32_imm32(x64GenContext, regR, 32); PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset2, x64GenContext->emitter->GetWriteIndex()); } } else if( imlInstruction->operation == PPCREC_IML_OP_DCBZ ) { if( regR != regA ) { x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, regA); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, regR); x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, ~0x1F); x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE); for(sint32 f=0; f<0x20; f+=8) x64Gen_mov_mem64Reg64_imm32(x64GenContext, REG_RESV_TEMP, f, 0); } else { // calculate effective address x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, regA); x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, ~0x1F); x64Gen_add_reg64_reg64(x64GenContext, REG_RESV_TEMP, REG_RESV_MEMBASE); for(sint32 f=0; f<0x20; f+=8) x64Gen_mov_mem64Reg64_imm32(x64GenContext, REG_RESV_TEMP, f, 0); } } else { debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation); return false; } return true; } bool PPCRecompilerX64Gen_imlInstruction_r_s32(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regR = _reg32(imlInstruction->op_r_immS32.regR); if( imlInstruction->operation == PPCREC_IML_OP_ASSIGN ) { x64Gen_mov_reg64Low32_imm32(x64GenContext, regR, (uint32)imlInstruction->op_r_immS32.immS32); } else if( imlInstruction->operation == PPCREC_IML_OP_LEFT_ROTATE ) { cemu_assert_debug((imlInstruction->op_r_immS32.immS32 & 0x80) == 0); x64Gen_rol_reg64Low32_imm8(x64GenContext, regR, (uint8)imlInstruction->op_r_immS32.immS32); } else { debug_printf("PPCRecompilerX64Gen_imlInstruction_r_s32(): Unsupported operation 0x%x\n", imlInstruction->operation); return false; } return true; 
} bool PPCRecompilerX64Gen_imlInstruction_conditional_r_s32(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { cemu_assert_unimplemented(); //if (imlInstruction->operation == PPCREC_IML_OP_ASSIGN) //{ // // registerResult = immS32 (conditional) // if (imlInstruction->crRegister != PPC_REC_INVALID_REGISTER) // { // assert_dbg(); // } // x64Gen_mov_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, (uint32)imlInstruction->op_conditional_r_s32.immS32); // uint8 crBitIndex = imlInstruction->op_conditional_r_s32.crRegisterIndex * 4 + imlInstruction->op_conditional_r_s32.crBitIndex; // x64Gen_bt_mem8(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, cr) + crBitIndex * sizeof(uint8), 0); // if (imlInstruction->op_conditional_r_s32.bitMustBeSet) // x64Gen_cmovcc_reg64Low32_reg64Low32(x64GenContext, X86_CONDITION_CARRY, imlInstruction->op_conditional_r_s32.registerIndex, REG_RESV_TEMP); // else // x64Gen_cmovcc_reg64Low32_reg64Low32(x64GenContext, X86_CONDITION_NOT_CARRY, imlInstruction->op_conditional_r_s32.registerIndex, REG_RESV_TEMP); // return true; //} return false; } bool PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto rRegResult = _reg32(imlInstruction->op_r_r_r.regR); auto rRegOperand1 = _reg32(imlInstruction->op_r_r_r.regA); auto rRegOperand2 = _reg32(imlInstruction->op_r_r_r.regB); if (imlInstruction->operation == PPCREC_IML_OP_ADD) { // registerResult = registerOperand1 + registerOperand2 if( (rRegResult == rRegOperand1) || (rRegResult == rRegOperand2) ) { // be careful not to overwrite the operand before we use it if( rRegResult == rRegOperand1 ) x64Gen_add_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); else x64Gen_add_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand1); } else { // copy operand1 to destination register before doing addition x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); } } else if( imlInstruction->operation == PPCREC_IML_OP_SUB ) { if( rRegOperand1 == rRegOperand2 ) { // result = operand1 - operand1 -> 0 x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegResult); } else if( rRegResult == rRegOperand1 ) { // result = result - operand2 x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); } else if ( rRegResult == rRegOperand2 ) { // result = operand1 - result x64Gen_neg_reg64Low32(x64GenContext, rRegResult); x64Gen_add_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand1); } else { x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1); x64Gen_sub_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); } } else if (imlInstruction->operation == PPCREC_IML_OP_OR || imlInstruction->operation == PPCREC_IML_OP_AND || imlInstruction->operation == PPCREC_IML_OP_XOR) { if (rRegResult == rRegOperand2) std::swap(rRegOperand1, rRegOperand2); if (rRegResult != rRegOperand1) x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1); if (imlInstruction->operation == PPCREC_IML_OP_OR) x64Gen_or_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); else if (imlInstruction->operation == PPCREC_IML_OP_AND) x64Gen_and_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); else x64Gen_xor_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2); } else if( 
imlInstruction->operation == PPCREC_IML_OP_MULTIPLY_SIGNED )
	{
		// registerResult = registerOperand1 * registerOperand2
		if( (rRegResult == rRegOperand1) || (rRegResult == rRegOperand2) )
		{
			// be careful not to overwrite the operand before we use it
			if( rRegResult == rRegOperand1 )
				x64Gen_imul_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2);
			else
				x64Gen_imul_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand1);
		}
		else
		{
			// copy operand1 to destination register before doing multiplication
			x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1);
			// multiply by operand2
			x64Gen_imul_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegOperand2);
		}
	}
	else if( imlInstruction->operation == PPCREC_IML_OP_SLW || imlInstruction->operation == PPCREC_IML_OP_SRW )
	{
		// registerResult = registerOperand1(rA) shifted left/right by registerOperand2(rB) (up to 63 bits)
		if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SRW)
		{
			// use BMI2 SHRX if available
			x64Gen_shrx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
		}
		else if (g_CPUFeatures.x86.bmi2 && imlInstruction->operation == PPCREC_IML_OP_SLW)
		{
			// use BMI2 SHLX if available
			x64Gen_shlx_reg64_reg64_reg64(x64GenContext, rRegResult, rRegOperand1, rRegOperand2);
			x64Gen_and_reg64Low32_reg64Low32(x64GenContext, rRegResult, rRegResult); // trim result to 32bit
		}
		else
		{
			// lazy and slow way to do shift by register without relying on ECX/CL or BMI2
			x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegOperand1);
			for (sint32 b = 0; b < 6; b++)
			{
				x64Gen_test_reg64Low32_imm32(x64GenContext, rRegOperand2, (1 << b));
				sint32 jumpInstructionOffset = x64GenContext->emitter->GetWriteIndex();
				x64Gen_jmpc_near(x64GenContext, X86_CONDITION_EQUAL, 0); // jump if bit not set
				if (b == 5)
				{
					// shift amount bit 5 set -> shift by 32 or more, result is zero
					x64Gen_xor_reg64Low32_reg64Low32(x64GenContext, REG_RESV_TEMP, REG_RESV_TEMP);
				}
				else
				{
					if (imlInstruction->operation == PPCREC_IML_OP_SLW)
						x64Gen_shl_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, (1 << b));
					else
						x64Gen_shr_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, (1 << b));
				}
				PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset, x64GenContext->emitter->GetWriteIndex());
			}
			x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, REG_RESV_TEMP);
		}
	}
	else if( imlInstruction->operation == PPCREC_IML_OP_LEFT_ROTATE )
	{
		// todo: Use BMI2 rotate if available
		// check if CL/ECX/RCX is available
		if( rRegResult != X86_REG_RCX && rRegOperand1 != X86_REG_RCX && rRegOperand2 != X86_REG_RCX )
		{
			// swap operand 2 with RCX
			x64Gen_xchg_reg64_reg64(x64GenContext, X86_REG_RCX, rRegOperand2);
			// move operand 1 to temp register
			x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegOperand1);
			// rotate
			x64Gen_rol_reg64Low32_cl(x64GenContext, REG_RESV_TEMP);
			// undo swap operand 2 with RCX
			x64Gen_xchg_reg64_reg64(x64GenContext, X86_REG_RCX, rRegOperand2);
			// copy to result register
			x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, REG_RESV_TEMP);
		}
		else
		{
			x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegOperand1);
			// lazy and slow way to do rotate by register without relying on ECX/CL
			for(sint32 b=0; b<5; b++)
			{
				x64Gen_test_reg64Low32_imm32(x64GenContext, rRegOperand2, (1<<b));
				sint32 jumpInstructionOffset = x64GenContext->emitter->GetWriteIndex();
				x64Gen_jmpc_near(x64GenContext, X86_CONDITION_EQUAL, 0); // jump if bit not set
				x64Gen_rol_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, (1<<b));
				PPCRecompilerX64Gen_redirectRelativeJump(x64GenContext, jumpInstructionOffset, x64GenContext->emitter->GetWriteIndex());
			}
			x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, REG_RESV_TEMP);
		}
	}
	else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S || imlInstruction->operation
== PPCREC_IML_OP_RIGHT_SHIFT_U || imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) { // x86's shift and rotate instruction have the shift amount hardwired to the CL register // since our register allocator doesn't support instruction based fixed phys registers yet // we'll instead have to temporarily shuffle registers around // we use BMI2's shift instructions until the RA can assign fixed registers if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) { x64Gen_sarx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); } else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) { x64Gen_shrx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); } else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) { x64Gen_shlx_reg32_reg32_reg32(x64GenContext, rRegResult, rRegOperand1, rRegOperand2); } //auto rResult = _reg32(rRegResult); //auto rOp2 = _reg8_from_reg32(_reg32(rRegOperand2)); //if (rRegResult == rRegOperand2) //{ // if (rRegResult != rRegOperand1) // DEBUG_BREAK; // cannot handle yet (we use rRegResult as a temporary reg, but its not possible if it is shared with op2) //} //if(rRegOperand1 != rRegResult) // x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, rRegOperand1); //cemu_assert_debug(rRegOperand1 != X86_REG_ECX); //if (rRegOperand2 == X86_REG_ECX) //{ // if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) // x64GenContext->emitter->SAR_d_CL(rResult); // else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) // x64GenContext->emitter->SHR_d_CL(rResult); // else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) // x64GenContext->emitter->SHL_d_CL(rResult); // else // cemu_assert_unimplemented(); //} //else //{ // auto rRegResultOrg = rRegResult; // if (rRegResult == X86_REG_ECX) // { // x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegResult); // rRegResult = REG_RESV_TEMP; // rResult = _reg32(rRegResult); // } // // x64Gen_xchg_reg64_reg64(x64GenContext, X86_REG_RCX, rRegOperand2); // // if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S) // x64GenContext->emitter->SAR_d_CL(rResult); // else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U) // x64GenContext->emitter->SHR_d_CL(rResult); // else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT) // x64GenContext->emitter->SHL_d_CL(rResult); // else // cemu_assert_unimplemented(); // x64Gen_xchg_reg64_reg64(x64GenContext, X86_REG_RCX, rRegOperand2); // // move result back if it was in ECX // if (rRegResultOrg == X86_REG_ECX) // { // x64Gen_mov_reg64_reg64(x64GenContext, rRegResultOrg, REG_RESV_TEMP); // } //} } else if( imlInstruction->operation == PPCREC_IML_OP_DIVIDE_SIGNED || imlInstruction->operation == PPCREC_IML_OP_DIVIDE_UNSIGNED ) { x64Emit_mov_mem32_reg32(x64GenContext, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]), X86_REG_EAX); x64Emit_mov_mem32_reg32(x64GenContext, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[1]), X86_REG_EDX); // mov operand 2 to temp register x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegOperand2); // mov operand1 to EAX x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, X86_REG_EAX, rRegOperand1); // sign or zero extend EAX to EDX:EAX based on division sign mode if( imlInstruction->operation == PPCREC_IML_OP_DIVIDE_SIGNED ) x64Gen_cdq(x64GenContext); else x64Gen_xor_reg64Low32_reg64Low32(x64GenContext, X86_REG_EDX, X86_REG_EDX); // make sure we avoid division by zero x64Gen_test_reg64Low32_reg64Low32(x64GenContext, 
REG_RESV_TEMP, REG_RESV_TEMP); x64Gen_jmpc_near(x64GenContext, X86_CONDITION_EQUAL, 3); // divide if( imlInstruction->operation == PPCREC_IML_OP_DIVIDE_SIGNED ) x64Gen_idiv_reg64Low32(x64GenContext, REG_RESV_TEMP); else x64Gen_div_reg64Low32(x64GenContext, REG_RESV_TEMP); // result of division is now stored in EAX, move it to result register if( rRegResult != X86_REG_EAX ) x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, X86_REG_EAX); // restore EAX / EDX if( rRegResult != X86_REG_RAX ) x64Emit_mov_reg64_mem32(x64GenContext, X86_REG_EAX, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0])); if( rRegResult != X86_REG_RDX ) x64Emit_mov_reg64_mem32(x64GenContext, X86_REG_EDX, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[1])); } else if( imlInstruction->operation == PPCREC_IML_OP_MULTIPLY_HIGH_SIGNED || imlInstruction->operation == PPCREC_IML_OP_MULTIPLY_HIGH_UNSIGNED ) { x64Emit_mov_mem32_reg32(x64GenContext, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0]), X86_REG_EAX); x64Emit_mov_mem32_reg32(x64GenContext, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[1]), X86_REG_EDX); // mov operand 2 to temp register x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, rRegOperand2); // mov operand1 to EAX x64Gen_mov_reg64Low32_reg64Low32(x64GenContext, X86_REG_EAX, rRegOperand1); if( imlInstruction->operation == PPCREC_IML_OP_MULTIPLY_HIGH_SIGNED ) { // zero extend EAX to EDX:EAX x64Gen_xor_reg64Low32_reg64Low32(x64GenContext, X86_REG_EDX, X86_REG_EDX); } else { // sign extend EAX to EDX:EAX x64Gen_cdq(x64GenContext); } // multiply if( imlInstruction->operation == PPCREC_IML_OP_MULTIPLY_HIGH_SIGNED ) x64Gen_imul_reg64Low32(x64GenContext, REG_RESV_TEMP); else x64Gen_mul_reg64Low32(x64GenContext, REG_RESV_TEMP); // result of multiplication is now stored in EDX:EAX, move it to result register if( rRegResult != X86_REG_EDX ) x64Gen_mov_reg64_reg64(x64GenContext, rRegResult, X86_REG_EDX); // restore EAX / EDX if( rRegResult != X86_REG_RAX ) x64Emit_mov_reg64_mem32(x64GenContext, X86_REG_EAX, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[0])); if( rRegResult != X86_REG_RDX ) x64Emit_mov_reg64_mem32(x64GenContext, X86_REG_EDX, X86_REG_RSP, (uint32)offsetof(PPCInterpreter_t, temporaryGPR[1])); } else { debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r_r(): Unsupported operation 0x%x\n", imlInstruction->operation); return false; } return true; } bool PPCRecompilerX64Gen_imlInstruction_r_r_r_carry(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regR = _reg32(imlInstruction->op_r_r_r_carry.regR); auto regA = _reg32(imlInstruction->op_r_r_r_carry.regA); auto regB = _reg32(imlInstruction->op_r_r_r_carry.regB); auto regCarry = _reg32(imlInstruction->op_r_r_r_carry.regCarry); cemu_assert_debug(regCarry != regR && regCarry != regA); switch (imlInstruction->operation) { case PPCREC_IML_OP_ADD: if (regB == regR) std::swap(regB, regA); if (regR != regA) x64GenContext->emitter->MOV_dd(regR, regA); x64GenContext->emitter->XOR_dd(regCarry, regCarry); x64GenContext->emitter->ADD_dd(regR, regB); x64GenContext->emitter->SETcc_b(X86_CONDITION_B, _reg8_from_reg32(regCarry)); // below condition checks carry flag break; case PPCREC_IML_OP_ADD_WITH_CARRY: // assumes that carry is already correctly initialized as 0 or 1 if (regB == regR) std::swap(regB, regA); if (regR != regA) x64GenContext->emitter->MOV_dd(regR, regA); x64GenContext->emitter->BT_du8(regCarry, 
0); // copy carry register to x86 carry flag x64GenContext->emitter->ADC_dd(regR, regB); x64GenContext->emitter->SETcc_b(X86_CONDITION_B, _reg8_from_reg32(regCarry)); break; default: cemu_assert_unimplemented(); return false; } return true; } bool PPCRecompilerX64Gen_imlInstruction_compare(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regR = _reg8(imlInstruction->op_compare.regR); auto regA = _reg32(imlInstruction->op_compare.regA); auto regB = _reg32(imlInstruction->op_compare.regB); X86Cond cond = _x86Cond(imlInstruction->op_compare.cond); x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regR), _reg32_from_reg8(regR)); // zero bytes unaffected by SETcc x64GenContext->emitter->CMP_dd(regA, regB); x64GenContext->emitter->SETcc_b(cond, regR); return true; } bool PPCRecompilerX64Gen_imlInstruction_compare_s32(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regR = _reg8(imlInstruction->op_compare_s32.regR); auto regA = _reg32(imlInstruction->op_compare_s32.regA); sint32 imm = imlInstruction->op_compare_s32.immS32; X86Cond cond = _x86Cond(imlInstruction->op_compare_s32.cond); x64GenContext->emitter->XOR_dd(_reg32_from_reg8(regR), _reg32_from_reg8(regR)); // zero bytes unaffected by SETcc x64GenContext->emitter->CMP_di32(regA, imm); x64GenContext->emitter->SETcc_b(cond, regR); return true; } bool PPCRecompilerX64Gen_imlInstruction_cjump2(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, IMLSegment* imlSegment) { auto regBool = _reg8(imlInstruction->op_conditional_jump.registerBool); bool mustBeTrue = imlInstruction->op_conditional_jump.mustBeTrue; x64GenContext->emitter->TEST_bb(regBool, regBool); PPCRecompilerX64Gen_rememberRelocatableOffset(x64GenContext, imlSegment->nextSegmentBranchTaken); x64GenContext->emitter->Jcc_j32(mustBeTrue ? 
X86_CONDITION_NZ : X86_CONDITION_Z, 0);
	return true;
}

bool PPCRecompilerX64Gen_imlInstruction_jump2(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction, IMLSegment* imlSegment)
{
	PPCRecompilerX64Gen_rememberRelocatableOffset(x64GenContext, imlSegment->nextSegmentBranchTaken);
	x64GenContext->emitter->JMP_j32(0);
	return true;
}

bool PPCRecompilerX64Gen_imlInstruction_r_r_s32(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{
	auto regR = _reg32(imlInstruction->op_r_r_s32.regR);
	auto regA = _reg32(imlInstruction->op_r_r_s32.regA);
	uint32 immS32 = imlInstruction->op_r_r_s32.immS32;
	if( imlInstruction->operation == PPCREC_IML_OP_ADD )
	{
		uint32 immU32 = (uint32)imlInstruction->op_r_r_s32.immS32;
		if(regR != regA)
			x64Gen_mov_reg64_reg64(x64GenContext, regR, regA);
		x64Gen_add_reg64Low32_imm32(x64GenContext, regR, (uint32)immU32);
	}
	else if (imlInstruction->operation == PPCREC_IML_OP_SUB)
	{
		if (regR != regA)
			x64Gen_mov_reg64_reg64(x64GenContext, regR, regA);
		x64Gen_sub_reg64Low32_imm32(x64GenContext, regR, immS32);
	}
	else if (imlInstruction->operation == PPCREC_IML_OP_AND || imlInstruction->operation == PPCREC_IML_OP_OR || imlInstruction->operation == PPCREC_IML_OP_XOR)
	{
		if (regR != regA)
			x64Gen_mov_reg64_reg64(x64GenContext, regR, regA);
		if (imlInstruction->operation == PPCREC_IML_OP_AND)
			x64Gen_and_reg64Low32_imm32(x64GenContext, regR, immS32);
		else if (imlInstruction->operation == PPCREC_IML_OP_OR)
			x64Gen_or_reg64Low32_imm32(x64GenContext, regR, immS32);
		else // XOR
			x64Gen_xor_reg64Low32_imm32(x64GenContext, regR, immS32);
	}
	else if( imlInstruction->operation == PPCREC_IML_OP_RLWIMI )
	{
		// registerResult = (rotl32(registerA, SH) & mask) | (registerResult & ~mask)
		uint32 vImm = (uint32)imlInstruction->op_r_r_s32.immS32;
		uint32 mb = (vImm>>0)&0xFF;
		uint32 me = (vImm>>8)&0xFF;
		uint32 sh = (vImm>>16)&0xFF;
		uint32 mask = ppc_mask(mb, me);
		// copy rS to temporary register
		x64Gen_mov_reg64_reg64(x64GenContext, REG_RESV_TEMP, regA);
		// rotate the rS copy
		if( sh )
			x64Gen_rol_reg64Low32_imm8(x64GenContext, REG_RESV_TEMP, (uint8)sh&0x1F);
		// AND destination register with inverted mask
		x64Gen_and_reg64Low32_imm32(x64GenContext, regR, ~mask);
		// AND temporary rS register with mask
		x64Gen_and_reg64Low32_imm32(x64GenContext, REG_RESV_TEMP, mask);
		// OR result with temporary
		x64Gen_or_reg64Low32_reg64Low32(x64GenContext, regR, REG_RESV_TEMP);
	}
	else if( imlInstruction->operation == PPCREC_IML_OP_MULTIPLY_SIGNED )
	{
		// registerResult = registerOperand * immS32
		sint32 immS32 = (uint32)imlInstruction->op_r_r_s32.immS32;
		x64Gen_mov_reg64_imm64(x64GenContext, REG_RESV_TEMP, (sint64)immS32); // todo: Optimize
		if( regR != regA )
			x64Gen_mov_reg64_reg64(x64GenContext, regR, regA);
		x64Gen_imul_reg64Low32_reg64Low32(x64GenContext, regR, REG_RESV_TEMP);
	}
	else if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT || imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U || imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_S)
	{
		if( regA != regR )
			x64Gen_mov_reg64_reg64(x64GenContext, regR, regA);
		if (imlInstruction->operation == PPCREC_IML_OP_LEFT_SHIFT)
			x64Gen_shl_reg64Low32_imm8(x64GenContext, regR, imlInstruction->op_r_r_s32.immS32);
		else if (imlInstruction->operation == PPCREC_IML_OP_RIGHT_SHIFT_U)
			x64Gen_shr_reg64Low32_imm8(x64GenContext, regR, imlInstruction->op_r_r_s32.immS32);
		else // RIGHT_SHIFT_S
			x64Gen_sar_reg64Low32_imm8(x64GenContext, regR, imlInstruction->op_r_r_s32.immS32);
	}
	else
	{
debug_printf("PPCRecompilerX64Gen_imlInstruction_r_r_s32(): Unsupported operation 0x%x\n", imlInstruction->operation); return false; } return true; } bool PPCRecompilerX64Gen_imlInstruction_r_r_s32_carry(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { auto regR = _reg32(imlInstruction->op_r_r_s32_carry.regR); auto regA = _reg32(imlInstruction->op_r_r_s32_carry.regA); sint32 immS32 = imlInstruction->op_r_r_s32_carry.immS32; auto regCarry = _reg32(imlInstruction->op_r_r_s32_carry.regCarry); cemu_assert_debug(regCarry != regR && regCarry != regA); switch (imlInstruction->operation) { case PPCREC_IML_OP_ADD: x64GenContext->emitter->XOR_dd(regCarry, regCarry); if (regR != regA) x64GenContext->emitter->MOV_dd(regR, regA); x64GenContext->emitter->ADD_di32(regR, immS32); x64GenContext->emitter->SETcc_b(X86_CONDITION_B, _reg8_from_reg32(regCarry)); break; case PPCREC_IML_OP_ADD_WITH_CARRY: // assumes that carry is already correctly initialized as 0 or 1 if (regR != regA) x64GenContext->emitter->MOV_dd(regR, regA); x64GenContext->emitter->BT_du8(regCarry, 0); // copy carry register to x86 carry flag x64GenContext->emitter->ADC_di32(regR, immS32); x64GenContext->emitter->SETcc_b(X86_CONDITION_B, _reg8_from_reg32(regCarry)); break; default: cemu_assert_unimplemented(); return false; } return true; } bool PPCRecompilerX64Gen_imlInstruction_conditionalJumpCycleCheck(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { // some tests (all performed on a i7-4790K) // 1) DEC [mem] + JNS has significantly worse performance than BT + JNC (probably due to additional memory write and direct dependency) // 2) CMP [mem], 0 + JG has about equal (or slightly worse) performance than BT + JNC // BT x64Gen_bt_mem8(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, remainingCycles), 31); // check if negative cemu_assert_debug(x64GenContext->currentSegment->GetBranchTaken()); PPCRecompilerX64Gen_rememberRelocatableOffset(x64GenContext, x64GenContext->currentSegment->GetBranchTaken()); x64Gen_jmpc_far(x64GenContext, X86_CONDITION_CARRY, 0); return true; } void PPCRecompilerX64Gen_imlInstruction_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) { uint32 name = imlInstruction->op_r_name.name; if (imlInstruction->op_r_name.regR.GetBaseFormat() == IMLRegFormat::I64) { auto regR = _reg64(imlInstruction->op_r_name.regR); if (name >= PPCREC_NAME_R0 && name < PPCREC_NAME_R0 + 32) { x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, gpr) + sizeof(uint32) * (name - PPCREC_NAME_R0)); } else if (name >= PPCREC_NAME_SPR0 && name < PPCREC_NAME_SPR0 + 999) { sint32 sprIndex = (name - PPCREC_NAME_SPR0); if (sprIndex == SPR_LR) x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.LR)); else if (sprIndex == SPR_CTR) x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.CTR)); else if (sprIndex == SPR_XER) x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.XER)); else if (sprIndex >= SPR_UGQR0 && sprIndex <= SPR_UGQR7) { sint32 memOffset = offsetof(PPCInterpreter_t, spr.UGQR) + sizeof(PPCInterpreter_t::spr.UGQR[0]) * (sprIndex - SPR_UGQR0); x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, memOffset); } else assert_dbg(); } 
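// the remaining names map to individual PPCInterpreter_t fields (temporary GPRs, XER bits, CR bits and the lwarx/stwcx. reservation)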
else if (name >= PPCREC_NAME_TEMPORARY && name < PPCREC_NAME_TEMPORARY + 4)
		{
			x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, temporaryGPR_reg) + sizeof(uint32) * (name - PPCREC_NAME_TEMPORARY));
		}
		else if (name == PPCREC_NAME_XER_CA)
		{
			x64Emit_movZX_reg64_mem8(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, xer_ca));
		}
		else if (name == PPCREC_NAME_XER_SO)
		{
			x64Emit_movZX_reg64_mem8(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, xer_so));
		}
		else if (name >= PPCREC_NAME_CR && name <= PPCREC_NAME_CR_LAST)
		{
			x64Emit_movZX_reg64_mem8(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, cr) + (name - PPCREC_NAME_CR));
		}
		else if (name == PPCREC_NAME_CPU_MEMRES_EA)
		{
			x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, reservedMemAddr));
		}
		else if (name == PPCREC_NAME_CPU_MEMRES_VAL)
		{
			x64Emit_mov_reg64_mem32(x64GenContext, regR, X86_REG_RSP, offsetof(PPCInterpreter_t, reservedMemValue));
		}
		else
			assert_dbg();
	}
	else if (imlInstruction->op_r_name.regR.GetBaseFormat() == IMLRegFormat::F64)
	{
		auto regR = _regF64(imlInstruction->op_r_name.regR);
		if (name >= PPCREC_NAME_FPR0 && name < (PPCREC_NAME_FPR0 + 32))
		{
			x64Gen_movupd_xmmReg_memReg128(x64GenContext, regR, X86_REG_ESP, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * (name - PPCREC_NAME_FPR0));
		}
		else if (name >= PPCREC_NAME_TEMPORARY_FPR0 && name < (PPCREC_NAME_TEMPORARY_FPR0 + 8))
		{
			x64Gen_movupd_xmmReg_memReg128(x64GenContext, regR, X86_REG_ESP, offsetof(PPCInterpreter_t, temporaryFPR) + sizeof(FPR_t) * (name - PPCREC_NAME_TEMPORARY_FPR0));
		}
		else
		{
			cemu_assert_debug(false);
		}
	}
	else
		DEBUG_BREAK;
}

void PPCRecompilerX64Gen_imlInstruction_name_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction)
{
	uint32 name = imlInstruction->op_r_name.name;
	if (imlInstruction->op_r_name.regR.GetBaseFormat() == IMLRegFormat::I64)
	{
		auto regR = _reg64(imlInstruction->op_r_name.regR);
		if (name >= PPCREC_NAME_R0 && name < PPCREC_NAME_R0 + 32)
		{
			x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, gpr) + sizeof(uint32) * (name - PPCREC_NAME_R0), regR);
		}
		else if (name >= PPCREC_NAME_SPR0 && name < PPCREC_NAME_SPR0 + 999)
		{
			uint32 sprIndex = (name - PPCREC_NAME_SPR0);
			if (sprIndex == SPR_LR)
				x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.LR), regR);
			else if (sprIndex == SPR_CTR)
				x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.CTR), regR);
			else if (sprIndex == SPR_XER)
				x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, spr.XER), regR);
			else if (sprIndex >= SPR_UGQR0 && sprIndex <= SPR_UGQR7)
			{
				sint32 memOffset = offsetof(PPCInterpreter_t, spr.UGQR) + sizeof(PPCInterpreter_t::spr.UGQR[0]) * (sprIndex - SPR_UGQR0);
				x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, memOffset, regR);
			}
			else
				assert_dbg();
		}
		else if (name >= PPCREC_NAME_TEMPORARY && name < PPCREC_NAME_TEMPORARY + 4)
		{
			x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, temporaryGPR_reg) + sizeof(uint32) * (name - PPCREC_NAME_TEMPORARY), regR);
		}
		else if (name == PPCREC_NAME_XER_CA)
		{
			x64GenContext->emitter->MOV_bb_l(X86_REG_RSP, offsetof(PPCInterpreter_t, xer_ca), X86_REG_NONE, 0, _reg8_from_reg64(regR));
		}
		else if (name == PPCREC_NAME_XER_SO)
		{
			x64GenContext->emitter->MOV_bb_l(X86_REG_RSP, offsetof(PPCInterpreter_t, xer_so), X86_REG_NONE, 0, _reg8_from_reg64(regR));
} else if (name >= PPCREC_NAME_CR && name <= PPCREC_NAME_CR_LAST) { x64GenContext->emitter->MOV_bb_l(X86_REG_RSP, offsetof(PPCInterpreter_t, cr) + (name - PPCREC_NAME_CR), X86_REG_NONE, 0, _reg8_from_reg64(regR)); } else if (name == PPCREC_NAME_CPU_MEMRES_EA) { x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, reservedMemAddr), regR); } else if (name == PPCREC_NAME_CPU_MEMRES_VAL) { x64Emit_mov_mem32_reg64(x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, reservedMemValue), regR); } else assert_dbg(); } else if (imlInstruction->op_r_name.regR.GetBaseFormat() == IMLRegFormat::F64) { auto regR = _regF64(imlInstruction->op_r_name.regR); uint32 name = imlInstruction->op_r_name.name; if (name >= PPCREC_NAME_FPR0 && name < (PPCREC_NAME_FPR0 + 32)) { x64Gen_movupd_memReg128_xmmReg(x64GenContext, regR, X86_REG_ESP, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * (name - PPCREC_NAME_FPR0)); } else if (name >= PPCREC_NAME_TEMPORARY_FPR0 && name < (PPCREC_NAME_TEMPORARY_FPR0 + 8)) { x64Gen_movupd_memReg128_xmmReg(x64GenContext, regR, X86_REG_ESP, offsetof(PPCInterpreter_t, temporaryFPR) + sizeof(FPR_t) * (name - PPCREC_NAME_TEMPORARY_FPR0)); } else { cemu_assert_debug(false); } } else DEBUG_BREAK; } //void PPCRecompilerX64Gen_imlInstruction_fpr_r_name(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) //{ // uint32 name = imlInstruction->op_r_name.name; // uint32 fprReg = _regF64(imlInstruction->op_r_name.regR); // if (name >= PPCREC_NAME_FPR0 && name < (PPCREC_NAME_FPR0 + 32)) // { // x64Gen_movupd_xmmReg_memReg128(x64GenContext, fprReg, X86_REG_ESP, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * (name - PPCREC_NAME_FPR0)); // } // else if (name >= PPCREC_NAME_TEMPORARY_FPR0 || name < (PPCREC_NAME_TEMPORARY_FPR0 + 8)) // { // x64Gen_movupd_xmmReg_memReg128(x64GenContext, fprReg, X86_REG_ESP, offsetof(PPCInterpreter_t, temporaryFPR) + sizeof(FPR_t) * (name - PPCREC_NAME_TEMPORARY_FPR0)); // } // else // { // cemu_assert_debug(false); // } //} // //void PPCRecompilerX64Gen_imlInstruction_fpr_name_r(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext, x64GenContext_t* x64GenContext, IMLInstruction* imlInstruction) //{ // uint32 name = imlInstruction->op_r_name.name; // uint32 fprReg = _regF64(imlInstruction->op_r_name.regR); // if (name >= PPCREC_NAME_FPR0 && name < (PPCREC_NAME_FPR0 + 32)) // { // x64Gen_movupd_memReg128_xmmReg(x64GenContext, fprReg, X86_REG_ESP, offsetof(PPCInterpreter_t, fpr) + sizeof(FPR_t) * (name - PPCREC_NAME_FPR0)); // } // else if (name >= PPCREC_NAME_TEMPORARY_FPR0 && name < (PPCREC_NAME_TEMPORARY_FPR0 + 8)) // { // x64Gen_movupd_memReg128_xmmReg(x64GenContext, fprReg, X86_REG_ESP, offsetof(PPCInterpreter_t, temporaryFPR) + sizeof(FPR_t) * (name - PPCREC_NAME_TEMPORARY_FPR0)); // } // else // { // cemu_assert_debug(false); // } //} uint8* codeMemoryBlock = nullptr; sint32 codeMemoryBlockIndex = 0; sint32 codeMemoryBlockSize = 0; std::mutex mtx_allocExecutableMemory; uint8* PPCRecompilerX86_allocateExecutableMemory(sint32 size) { std::lock_guard lck(mtx_allocExecutableMemory); if( codeMemoryBlockIndex+size > codeMemoryBlockSize ) { // allocate new block codeMemoryBlockSize = std::max(1024*1024*4, size+1024); // 4MB (or more if the function is larger than 4MB) codeMemoryBlockIndex = 0; codeMemoryBlock = (uint8*)MemMapper::AllocateMemory(nullptr, codeMemoryBlockSize, MemMapper::PAGE_PERMISSION::P_RWX); } uint8* codeMem = codeMemoryBlock + 
	codeMemoryBlockIndex += size;
	// pad to 4 byte alignment
	while (codeMemoryBlockIndex & 3)
	{
		codeMemoryBlock[codeMemoryBlockIndex] = 0x90;
		codeMemoryBlockIndex++;
	}
	return codeMem;
}

bool PPCRecompiler_generateX64Code(PPCRecFunction_t* PPCRecFunction, ppcImlGenContext_t* ppcImlGenContext)
{
	x64GenContext_t x64GenContext{};
	// generate iml instruction code
	bool codeGenerationFailed = false;
	for (IMLSegment* segIt : ppcImlGenContext->segmentList2)
	{
		x64GenContext.currentSegment = segIt;
		segIt->x64Offset = x64GenContext.emitter->GetWriteIndex();
		for(size_t i=0; i<segIt->imlList.size(); i++)
		{
			IMLInstruction* imlInstruction = segIt->imlList.data() + i;
			if( imlInstruction->type == PPCREC_IML_TYPE_R_NAME )
			{
				PPCRecompilerX64Gen_imlInstruction_r_name(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_NAME_R )
			{
				PPCRecompilerX64Gen_imlInstruction_name_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_R_R )
			{
				if( PPCRecompilerX64Gen_imlInstruction_r_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false )
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_R_S32)
			{
				if (PPCRecompilerX64Gen_imlInstruction_r_s32(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_CONDITIONAL_R_S32)
			{
				if (PPCRecompilerX64Gen_imlInstruction_conditional_r_s32(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_R_R_S32)
			{
				if (PPCRecompilerX64Gen_imlInstruction_r_r_s32(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_R_R_S32_CARRY)
			{
				if (PPCRecompilerX64Gen_imlInstruction_r_r_s32_carry(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_R_R_R)
			{
				if (PPCRecompilerX64Gen_imlInstruction_r_r_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_R_R_R_CARRY)
			{
				if (PPCRecompilerX64Gen_imlInstruction_r_r_r_carry(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_COMPARE)
			{
				PPCRecompilerX64Gen_imlInstruction_compare(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_COMPARE_S32)
			{
				PPCRecompilerX64Gen_imlInstruction_compare_s32(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_CONDITIONAL_JUMP)
			{
				if (PPCRecompilerX64Gen_imlInstruction_cjump2(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, segIt) == false)
					codeGenerationFailed = true;
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_JUMP)
			{
				if (PPCRecompilerX64Gen_imlInstruction_jump2(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, segIt) == false)
					codeGenerationFailed = true;
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_CJUMP_CYCLE_CHECK )
			{
				PPCRecompilerX64Gen_imlInstruction_conditionalJumpCycleCheck(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
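			// handlers that return false signal an instruction form they cannot encode; any failure aborts code generation for this function further below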
			else if( imlInstruction->type == PPCREC_IML_TYPE_MACRO )
			{
				if( PPCRecompilerX64Gen_imlInstruction_macro(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_LOAD )
			{
				if( PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, false) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_LOAD_INDEXED )
			{
				if( PPCRecompilerX64Gen_imlInstruction_load(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, true) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_STORE )
			{
				if( PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, false) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_STORE_INDEXED )
			{
				if( PPCRecompilerX64Gen_imlInstruction_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, true) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_ATOMIC_CMP_STORE)
			{
				if (!PPCRecompilerX64Gen_imlInstruction_atomic_cmp_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction))
					codeGenerationFailed = true;
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_NO_OP )
			{
				// no op
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD )
			{
				if( PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, false) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_LOAD_INDEXED )
			{
				if( PPCRecompilerX64Gen_imlInstruction_fpr_load(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, true) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_STORE )
			{
				if( PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, false) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_STORE_INDEXED )
			{
				if( PPCRecompilerX64Gen_imlInstruction_fpr_store(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction, true) == false )
				{
					codeGenerationFailed = true;
				}
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R )
			{
				PPCRecompilerX64Gen_imlInstruction_fpr_r_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R_R )
			{
				PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_R_R_R_R )
			{
				PPCRecompilerX64Gen_imlInstruction_fpr_r_r_r_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if( imlInstruction->type == PPCREC_IML_TYPE_FPR_R )
			{
				PPCRecompilerX64Gen_imlInstruction_fpr_r(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else if (imlInstruction->type == PPCREC_IML_TYPE_FPR_COMPARE)
			{
				PPCRecompilerX64Gen_imlInstruction_fpr_compare(PPCRecFunction, ppcImlGenContext, &x64GenContext, imlInstruction);
			}
			else
			{
				debug_printf("PPCRecompiler_generateX64Code(): Unsupported iml type 0x%x\n", imlInstruction->type);
				assert_dbg();
			}
		}
	}
	// handle failed code generation
	if( codeGenerationFailed )
	{
		return false;
	}
	// allocate executable memory
	uint8* executableMemory = PPCRecompilerX86_allocateExecutableMemory(x64GenContext.emitter->GetBuffer().size_bytes());
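	// now that the final base address is known, patch the relative jump displacements recorded during emission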
	size_t baseAddress = (size_t)executableMemory;
	// fix relocs
	for(auto& relocIt : x64GenContext.relocateOffsetTable2)
	{
		// search for segment that starts with this offset
		uint32 ppcOffset = (uint32)(size_t)relocIt.extraInfo;
		uint32 x64Offset = 0xFFFFFFFF;

		IMLSegment* destSegment = (IMLSegment*)relocIt.extraInfo;
		x64Offset = destSegment->x64Offset;

		uint32 relocBase = relocIt.offset;
		uint8* relocInstruction = x64GenContext.emitter->GetBufferPtr()+relocBase;
		if( relocInstruction[0] == 0x0F && (relocInstruction[1] >= 0x80 && relocInstruction[1] <= 0x8F) )
		{
			// Jcc relativeImm32
			sint32 distanceNearJump = (sint32)((baseAddress + x64Offset) - (baseAddress + relocBase + 2));
			if (distanceNearJump >= -128 && distanceNearJump < 127) // disabled
			{
				// convert to near Jcc
				*(uint8*)(relocInstruction + 0) = (uint8)(relocInstruction[1]-0x80 + 0x70);
				// patch offset
				*(uint8*)(relocInstruction + 1) = (uint8)distanceNearJump;
				// replace unused 4 bytes with NOP instruction
				relocInstruction[2] = 0x0F;
				relocInstruction[3] = 0x1F;
				relocInstruction[4] = 0x40;
				relocInstruction[5] = 0x00;
			}
			else
			{
				// patch offset
				*(uint32*)(relocInstruction + 2) = (uint32)((baseAddress + x64Offset) - (baseAddress + relocBase + 6));
			}
		}
		else if( relocInstruction[0] == 0xE9 )
		{
			// JMP relativeImm32
			*(uint32*)(relocInstruction+1) = (uint32)((baseAddress+x64Offset)-(baseAddress+relocBase+5));
		}
		else
			assert_dbg();
	}

	// copy code to executable memory
	std::span<uint8> codeBuffer = x64GenContext.emitter->GetBuffer();
	memcpy(executableMemory, codeBuffer.data(), codeBuffer.size_bytes());
	// set code
	PPCRecFunction->x86Code = executableMemory;
	PPCRecFunction->x86Size = codeBuffer.size_bytes();
	return true;
}

void PPCRecompilerX64Gen_generateEnterRecompilerCode()
{
	x64GenContext_t x64GenContext{};

	// start of recompiler entry function
	x64Gen_push_reg64(&x64GenContext, X86_REG_RAX);
	x64Gen_push_reg64(&x64GenContext, X86_REG_RCX);
	x64Gen_push_reg64(&x64GenContext, X86_REG_RDX);
	x64Gen_push_reg64(&x64GenContext, X86_REG_RBX);
	x64Gen_push_reg64(&x64GenContext, X86_REG_RBP);
	x64Gen_push_reg64(&x64GenContext, X86_REG_RDI);
	x64Gen_push_reg64(&x64GenContext, X86_REG_RSI);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R8);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R9);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R10);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R11);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R12);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R13);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R14);
	x64Gen_push_reg64(&x64GenContext, X86_REG_R15);

	// 000000007775EF04 | E8 00 00 00 00      call +0x00
	x64Gen_writeU8(&x64GenContext, 0xE8);
	x64Gen_writeU8(&x64GenContext, 0x00);
	x64Gen_writeU8(&x64GenContext, 0x00);
	x64Gen_writeU8(&x64GenContext, 0x00);
	x64Gen_writeU8(&x64GenContext, 0x00);
	//000000007775EF09 | 48 83 04 24 05      add qword ptr ss:[rsp],5
	x64Gen_writeU8(&x64GenContext, 0x48);
	x64Gen_writeU8(&x64GenContext, 0x83);
	x64Gen_writeU8(&x64GenContext, 0x04);
	x64Gen_writeU8(&x64GenContext, 0x24);
	uint32 jmpPatchOffset = x64GenContext.emitter->GetWriteIndex();
	x64Gen_writeU8(&x64GenContext, 0); // skip the distance until after the JMP
	x64Emit_mov_mem64_reg64(&x64GenContext, X86_REG_RDX, offsetof(PPCInterpreter_t, rspTemp), X86_REG_RSP);

	// MOV RSP, RDX (ppc interpreter instance)
	x64Gen_mov_reg64_reg64(&x64GenContext, X86_REG_RSP, X86_REG_RDX);
	// MOV R15, ppcRecompilerInstanceData
	x64Gen_mov_reg64_imm64(&x64GenContext, X86_REG_R15, (uint64)ppcRecompilerInstanceData);
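	// register conventions inside recompiled code: RSP holds the PPCInterpreter_t instance, R15 the recompiler instance data and R13 the PPC memory base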
	// MOV R13, memory_base
	x64Gen_mov_reg64_imm64(&x64GenContext, X86_REG_R13, (uint64)memory_base);

	//JMP recFunc
	x64Gen_jmp_reg64(&x64GenContext, X86_REG_RCX); // call argument 1

	x64GenContext.emitter->GetBuffer()[jmpPatchOffset] = (x64GenContext.emitter->GetWriteIndex() - (jmpPatchOffset - 4));

	//recompilerExit1:
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R15);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R14);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R13);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R12);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R11);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R10);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R9);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_R8);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RSI);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RDI);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RBP);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RBX);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RDX);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RCX);
	x64Gen_pop_reg64(&x64GenContext, X86_REG_RAX);
	// RET
	x64Gen_ret(&x64GenContext);

	uint8* executableMemory = PPCRecompilerX86_allocateExecutableMemory(x64GenContext.emitter->GetBuffer().size_bytes());
	// copy code to executable memory
	memcpy(executableMemory, x64GenContext.emitter->GetBuffer().data(), x64GenContext.emitter->GetBuffer().size_bytes());
	PPCRecompiler_enterRecompilerCode = (void ATTR_MS_ABI (*)(uint64,uint64))executableMemory;
}

void* PPCRecompilerX64Gen_generateLeaveRecompilerCode()
{
	x64GenContext_t x64GenContext{};

	// update instruction pointer
	// LR is in EDX
	x64Emit_mov_mem32_reg32(&x64GenContext, X86_REG_RSP, offsetof(PPCInterpreter_t, instructionPointer), X86_REG_EDX);
	// MOV RSP, [ppcRecompilerX64_rspTemp]
	x64Emit_mov_reg64_mem64(&x64GenContext, X86_REG_RSP, REG_RESV_HCPU, offsetof(PPCInterpreter_t, rspTemp));
	// RET
	x64Gen_ret(&x64GenContext);

	uint8* executableMemory = PPCRecompilerX86_allocateExecutableMemory(x64GenContext.emitter->GetBuffer().size_bytes());
	// copy code to executable memory
	memcpy(executableMemory, x64GenContext.emitter->GetBuffer().data(), x64GenContext.emitter->GetBuffer().size_bytes());
	return executableMemory;
}

void PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions()
{
	PPCRecompilerX64Gen_generateEnterRecompilerCode();
	PPCRecompiler_leaveRecompilerCode_unvisited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode();
	PPCRecompiler_leaveRecompilerCode_visited = (void ATTR_MS_ABI (*)())PPCRecompilerX64Gen_generateLeaveRecompilerCode();
	cemu_assert_debug(PPCRecompiler_leaveRecompilerCode_unvisited != PPCRecompiler_leaveRecompilerCode_visited);
}
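// Usage sketch (illustrative only, not the actual call sites): PPCRecompilerX64Gen_generateRecompilerInterfaceFunctions()
// is expected to run once during recompiler initialization. Afterwards, recompiled code is entered via
// PPCRecompiler_enterRecompilerCode(entryPoint, hCPU), where the first argument (RCX under the MS x64 ABI) is the
// address of the recompiled function and the second (RDX) is the PPCInterpreter_t instance consumed by the entry trampoline above.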