Enable ASLR

Vestral, 2025-03-15 10:02:46 +09:00 (committed by Megamouse)
parent 68e7f4e820
commit e2df71d87c
8 changed files with 84 additions and 69 deletions


@@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.28)
 project(rpcs3 LANGUAGES C CXX)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
     if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 11)
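
With CMAKE_POSITION_INDEPENDENT_CODE enabled the executable is linked as a position-independent binary, so the loader is free to place code and data somewhere different on each run. A minimal standalone probe (hypothetical illustration, not part of this commit) that makes the effect of PIE plus ASLR visible:

    #include <cstdio>

    static int g_probe;         // data-segment address probe
    static void code_probe() {} // code-segment address probe

    int main()
    {
        // Build as PIE (e.g. g++ -fPIE -pie probe.cpp) and run twice:
        // with ASLR active, both addresses change from run to run.
        std::printf("code:   %p\n", reinterpret_cast<void*>(&code_probe));
        std::printf("global: %p\n", static_cast<void*>(&g_probe));
    }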


@@ -344,15 +344,7 @@ jit_runtime_base& asmjit::get_global_runtime()
 {
     custom_runtime() noexcept
     {
-        // Search starting in first 2 GiB of memory
-        for (u64 addr = size;; addr += size)
-        {
-            if (auto ptr = utils::memory_reserve(size, reinterpret_cast<void*>(addr)))
-            {
-                m_pos.raw() = static_cast<uchar*>(ptr);
-                break;
-            }
-        }
+        ensure(m_pos.raw() = static_cast<uchar*>(utils::memory_reserve(size)));
         // Initialize "end" pointer
         m_max = m_pos + size;
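
The deleted loop probed fixed addresses upward from size, keeping the JIT arena in the first GiBs of address space (per the removed comment) next to the old 0x10000 image base. Under ASLR there is no such neighborhood to aim for, so the runtime now takes whatever address the OS returns, with ensure asserting that the reservation succeeded. A rough sketch of an address-hint-free reservation, assuming utils::memory_reserve(size) wraps the usual primitives (hypothetical stand-in, not RPCS3 code):

    #include <cstddef>
    #ifdef _WIN32
    #include <windows.h>
    #else
    #include <sys/mman.h>
    #endif

    // With a null address hint the kernel picks any free region,
    // which is exactly the behaviour ASLR expects.
    void* reserve_anywhere(std::size_t size)
    {
    #ifdef _WIN32
        return VirtualAlloc(nullptr, size, MEM_RESERVE, PAGE_NOACCESS);
    #else
        void* ptr = mmap(nullptr, size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        return ptr == MAP_FAILED ? nullptr : ptr;
    #endif
    }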


@@ -5,13 +5,12 @@ if(MSVC)
     add_compile_definitions(
         _CRT_SECURE_NO_DEPRECATE=1 _CRT_NON_CONFORMING_SWPRINTFS=1 _SCL_SECURE_NO_WARNINGS=1
         NOMINMAX _ENABLE_EXTENDED_ALIGNED_STORAGE=1 _HAS_EXCEPTIONS=0)
-    add_link_options(/DYNAMICBASE:NO /BASE:0x10000 /FIXED)
+    add_link_options(/DYNAMICBASE:YES)
     #TODO: Some of these could be cleaned up
     add_compile_options(/wd4805) # Comparing boolean and int
     add_compile_options(/wd4804) # Using integer operators with booleans
     add_compile_options(/wd4200) # Zero-sized array in struct/union
-    add_link_options(/ignore:4281) # Undesirable base address 0x10000
     # MSVC 2017 uses iterator as base class internally, causing a lot of warning spam
     add_compile_definitions(_SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING=1)
@@ -19,8 +18,6 @@ if(MSVC)
     # Increase stack limit to 8 MB
     add_link_options(/STACK:8388608,1048576)
 else()
-    # Some distros have the compilers set to use PIE by default, but RPCS3 doesn't work with PIE, so we need to disable it.
-    check_cxx_compiler_flag("-no-pie" HAS_NO_PIE)
     check_cxx_compiler_flag("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
     check_cxx_compiler_flag("-msse -msse2 -mcx16" COMPILER_X86)
     if (APPLE)
@@ -99,15 +96,6 @@ else()
     if(NOT APPLE AND NOT WIN32)
         # This hides our LLVM from mesa's LLVM, otherwise we get some unresolvable conflicts.
         add_link_options(-Wl,--exclude-libs,ALL)
-        if(HAS_NO_PIE)
-            add_link_options(-no-pie)
-        endif()
-    elseif(APPLE)
-        if (CMAKE_OSX_ARCHITECTURES MATCHES "x86_64")
-            add_link_options(-Wl,-image_base,0x10000 -Wl,-pagezero_size,0x10000)
-            add_link_options(-Wl,-no_pie)
-        endif()
     elseif(WIN32)
         add_compile_definitions(__STDC_FORMAT_MACROS=1)
@@ -116,11 +104,6 @@ else()
         # Increase stack limit to 8 MB
         add_link_options(-Wl,--stack -Wl,8388608)
-        # For arm64 windows, the image base cannot be below 4GB or the OS rejects the binary without much explanation.
-        if(COMPILER_X86)
-            add_link_options(-Wl,--image-base,0x10000)
-        endif()
     endif()
     # Specify C++ library to use as standard C++ when using clang (not required on linux due to GNU)


@@ -1902,8 +1902,9 @@ auto gen_ghc_cpp_trampoline(ppu_intrp_func_t fn_target)
         // Take second ghc arg
         c.mov(args[0], x86::rbp);
         c.mov(args[2].r32(), x86::dword_ptr(args[0], ::offset32(&ppu_thread::cia)));
-        c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
-        c.jmp(fn_target);
+        c.movabs(args[1], reinterpret_cast<u64>(&vm::g_base_addr));
+        c.add(args[2], x86::qword_ptr(args[1]));
+        c.jmp(Imm(fn_target));
     };
 }
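
This hunk introduces the two-instruction rewrite that recurs through the rest of the commit. x86-64 only permits a 64-bit absolute address inside a memory operand for the special accumulator mov (moffs) forms; the memory operands of add, cmp and lea are limited to a sign-extended 32-bit displacement. With the old fixed 0x10000 image base every global sat in the low 2 GiB, so the absolute operand happened to encode; under ASLR the global can live anywhere in the 64-bit address space. The fix is to materialize the address first with movabs (a 64-bit immediate is always encodable) and dereference through the register; the jump target is likewise wrapped in Imm(...) so asmjit receives it as an explicit immediate. A sketch of the pattern, using the operands from the hunk above:

    // Before: absolute memory operand; only encodable while
    // &vm::g_base_addr fits in a signed 32-bit displacement.
    c.add(args[2], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));

    // After: load the full 64-bit address into a scratch register,
    // then use an ordinary register-indirect memory operand.
    c.movabs(args[1], reinterpret_cast<u64>(&vm::g_base_addr)); // imm64 move
    c.add(args[2], x86::qword_ptr(args[1]));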


@@ -220,7 +220,8 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
     c.mov(x86::qword_ptr(args[0], ::offset32(&ppu_thread::hv_ctx, &rpcs3::hypervisor_context_t::regs)), x86::rsp);
     // Initialize args
-    c.mov(x86::r13, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
+    c.movabs(x86::r13, reinterpret_cast<u64>(&vm::g_exec_addr));
+    c.mov(x86::r13, x86::qword_ptr(x86::r13));
     c.mov(x86::rbp, args[0]);
     c.mov(x86::edx, x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia))); // Load PC
@@ -232,7 +233,8 @@ const auto ppu_gateway = build_function_asm<void(*)(ppu_thread*)>("ppu_gateway",
     c.shl(x86::edx, 13);
     c.mov(x86::r12d, x86::edx); // Load relocation base
-    c.mov(x86::rbx, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_base_addr)));
+    c.movabs(x86::rbx, reinterpret_cast<u64>(&vm::g_base_addr));
+    c.mov(x86::rbx, x86::qword_ptr(x86::rbx));
     c.mov(x86::r14, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 0))); // Load some registers
     c.mov(x86::rsi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 1)));
     c.mov(x86::rdi, x86::qword_ptr(x86::rbp, ::offset32(&ppu_thread::gpr, 2)));
@@ -3164,8 +3166,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
     // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
     c.push(x86::rbp);
+    c.push(x86::r13);
     c.push(x86::r14);
-    c.sub(x86::rsp, 40);
+    c.sub(x86::rsp, 48);
 #ifdef _WIN32
     if (!s_tsx_avx)
     {
@@ -3176,14 +3179,16 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
     // Prepare registers
     build_swap_rdx_with(c, args, x86::r10);
-    c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
+    c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
+    c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
     c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
     c.and_(x86::rbp, -128);
     c.prefetchw(x86::byte_ptr(x86::rbp, 0));
     c.prefetchw(x86::byte_ptr(x86::rbp, 64));
     c.movzx(args[0].r32(), args[0].r16());
     c.shr(args[0].r32(), 1);
-    c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
+    c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
+    c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
     c.and_(x86::r11, -128 / 2);
     c.and_(args[0].r32(), 63);
@@ -3217,7 +3222,8 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
     {
         build_get_tsc(c);
         c.sub(x86::rax, stamp0);
-        c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
+        c.movabs(x86::r13, reinterpret_cast<u64>(&g_rtm_tx_limit2));
+        c.cmp(x86::rax, x86::qword_ptr(x86::r13));
         c.jae(fall);
     });
@@ -3342,8 +3348,9 @@ const auto ppu_stcx_accurate_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime
         c.vzeroupper();
     }
-    c.add(x86::rsp, 40);
+    c.add(x86::rsp, 48);
     c.pop(x86::r14);
+    c.pop(x86::r13);
     c.pop(x86::rbp);
     maybe_flush_lbr(c);
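
A note on 40 becoming 48: at function entry rsp is 8 modulo 16 (the call pushed a return address), and the spilled vector registers want 16-byte alignment. r13 is callee-saved, so using it as scratch for the g_rtm_tx_limit2 address adds a third push; the scratch area therefore grows by 8 bytes to restore alignment (8 + 3*8 + 48 = 80, a multiple of 16, just as the old 8 + 2*8 + 40 = 64 was).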
@@ -4179,7 +4186,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
     // 2 7MB overlay files -> 14GB
     // The growth in memory requirements of LLVM is not linear with file size of course
     // But these estimates should hopefully protect RPCS3 in the coming years
     // Especially when thread count is on the rise with each CPU generation
     atomic_t<u32> file_size_limit = static_cast<u32>(std::clamp<u64>(utils::aligned_div<u64>(utils::get_total_memory(), 2000), 65536, u32{umax}));
     const u32 software_thread_limit = std::min<u32>(g_cfg.core.llvm_threads ? g_cfg.core.llvm_threads : u32{umax}, ::size32(file_queue));
@@ -4301,8 +4308,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
     if (!src && !Emu.klic.empty() && src.open(path))
     {
         src = decrypt_self(src, reinterpret_cast<u8*>(&Emu.klic[0]));
         if (src)
         {
             ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path);
@@ -4333,7 +4340,7 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
     {
         if (value)
         {
             // Allow at least one file, make 0 the "memory unavailable" sign value for atomic waiting efficiency
             const u32 new_val = static_cast<u32>(utils::sub_saturate<u64>(value, file_size));
             restore_mem = value - new_val;
             value = new_val;
@@ -4506,8 +4513,8 @@ extern void ppu_precompile(std::vector<std::string>& dir_queue, std::vector<ppu_
     if (!src && !Emu.klic.empty() && src.open(path))
     {
         src = decrypt_self(src, reinterpret_cast<u8*>(&Emu.klic[0]));
         if (src)
         {
             ppu_log.error("Possible missed KLIC for precompilation of '%s', please report to developers.", path);
         }
@@ -5079,7 +5086,8 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
     code_size_until_jump = buf_end - buf_start;
     c.add(x86::edx, seg0);
-    c.mov(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_exec_addr)));
+    c.movabs(x86::rax, reinterpret_cast<u64>(&vm::g_exec_addr));
+    c.mov(x86::rax, x86::qword_ptr(x86::rax));
     c.mov(x86::dword_ptr(x86::rbp, ::offset32(&ppu_thread::cia)), x86::edx);
     c.mov(x86::rax, x86::qword_ptr(x86::rax, x86::rdx, 1, 0)); // Load call target
@@ -5340,7 +5348,7 @@ bool ppu_initialize(const ppu_module<lv2_obj>& info, bool check_only, u64 file_s
     sha1_update(&ctx, reinterpret_cast<const u8*>(addrs.data()), addrs.size() * sizeof(be_t<u32>));
     }
     part.jit_bounds = std::move(local_jit_bounds);
     local_jit_bounds = std::make_shared<std::pair<u32, u32>>(u32{umax}, 0);
 }


@@ -2770,14 +2770,17 @@ void spu_recompiler::FREST(spu_opcode_t op)
     const u64 fraction_lut_addr = reinterpret_cast<u64>(spu_frest_fraction_lut);
     const u64 exponent_lut_addr = reinterpret_cast<u64>(spu_frest_exponent_lut);
+    c->movabs(*arg0, fraction_lut_addr);
+    c->movabs(*arg1, exponent_lut_addr);
     for (u32 index = 0; index < 4; index++)
     {
         c->pextrd(*qw0, v_fraction, index);
-        c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2));
+        c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2));
         c->pinsrd(v_fraction, *qw1, index);
         c->pextrd(*qw0, v_exponent, index);
-        c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2));
+        c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2));
         c->pinsrd(v_exponent, *qw1, index);
     }
@@ -2810,14 +2813,17 @@ void spu_recompiler::FRSQEST(spu_opcode_t op)
     const u64 fraction_lut_addr = reinterpret_cast<u64>(spu_frsqest_fraction_lut);
     const u64 exponent_lut_addr = reinterpret_cast<u64>(spu_frsqest_exponent_lut);
+    c->movabs(*arg0, fraction_lut_addr);
+    c->movabs(*arg1, exponent_lut_addr);
     for (u32 index = 0; index < 4; index++)
     {
         c->pextrd(*qw0, v_fraction, index);
-        c->mov(*qw1, asmjit::x86::dword_ptr(fraction_lut_addr, *qw0, 2));
+        c->mov(*qw1, asmjit::x86::dword_ptr(*arg0, *qw0, 2));
         c->pinsrd(v_fraction, *qw1, index);
         c->pextrd(*qw0, v_exponent, index);
-        c->mov(*qw1, asmjit::x86::dword_ptr(exponent_lut_addr, *qw0, 2));
+        c->mov(*qw1, asmjit::x86::dword_ptr(*arg1, *qw0, 2));
         c->pinsrd(v_exponent, *qw1, index);
     }
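
The same encoding limit as in the trampoline applies here, with one extra wrinkle: dword_ptr(fraction_lut_addr, *qw0, 2) bakes the table's absolute address into the displacement of every load, and a SIB-addressed operand cannot carry a 64-bit displacement at all once the LUTs relocate above the low 2 GiB. Hoisting each base into a register with a single movabs before the loop both fixes the encoding and avoids re-emitting the constant for all four lanes.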


@@ -628,6 +628,8 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
     //}
     // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
+    c.push(x86::rbp);
+    c.push(x86::rbx);
 #ifdef _WIN32
     c.sub(x86::rsp, 168);
     if (s_tsx_avx)
@@ -648,17 +650,21 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
         c.movups(x86::oword_ptr(x86::rsp, 128), x86::xmm14);
         c.movups(x86::oword_ptr(x86::rsp, 144), x86::xmm15);
     }
+#else
+    c.sub(x86::rsp, 40);
 #endif
     // Prepare registers
     build_swap_rdx_with(c, args, x86::r10);
-    c.mov(args[1], x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
+    c.movabs(args[1], reinterpret_cast<u64>(&vm::g_sudo_addr));
+    c.mov(args[1], x86::qword_ptr(args[1]));
     c.lea(args[1], x86::qword_ptr(args[1], args[0]));
     c.prefetchw(x86::byte_ptr(args[1], 0));
     c.prefetchw(x86::byte_ptr(args[1], 64));
     c.and_(args[0].r32(), 0xff80);
     c.shr(args[0].r32(), 1);
-    c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
+    c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
+    c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
     // Prepare data
     if (s_tsx_avx)
@@ -703,7 +709,8 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
         c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx) - ::offset32(&spu_thread::rdata)), 1);
         build_get_tsc(c);
         c.sub(x86::rax, stamp0);
-        c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
+        c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
+        c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
         c.jae(fall);
     });
@@ -853,8 +860,13 @@ const auto spu_putllc_tx = build_function_asm<u64(*)(u32 raddr, u64 rtime, void*
         c.movups(x86::xmm15, x86::oword_ptr(x86::rsp, 144));
     }
     c.add(x86::rsp, 168);
+#else
+    c.add(x86::rsp, 40);
 #endif
+    c.pop(x86::rbx);
+    c.pop(x86::rbp);
     if (s_tsx_avx)
     {
         c.vzeroupper();
@@ -884,8 +896,10 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
     //}
     // Create stack frame if necessary (Windows ABI has only 6 volatile vector registers)
-#ifdef _WIN32
+    c.push(x86::rbp);
+    c.push(x86::rbx);
     c.sub(x86::rsp, 40);
+#ifdef _WIN32
     if (!s_tsx_avx)
     {
         c.movups(x86::oword_ptr(x86::rsp, 0), x86::xmm6);
@@ -894,7 +908,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
 #endif
     // Prepare registers
     build_swap_rdx_with(c, args, x86::r10);
-    c.mov(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
+    c.movabs(x86::r11, reinterpret_cast<u64>(&vm::g_sudo_addr));
+    c.mov(x86::r11, x86::qword_ptr(x86::r11));
     c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
     c.prefetchw(x86::byte_ptr(x86::r11, 0));
     c.prefetchw(x86::byte_ptr(x86::r11, 64));
@@ -921,7 +936,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
     c.and_(args[0].r32(), 0xff80);
     c.shr(args[0].r32(), 1);
-    c.lea(args[1], x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
+    c.movabs(args[1], reinterpret_cast<u64>(+vm::g_reservations));
+    c.lea(args[1], x86::qword_ptr(args[1], args[0]));
     // Alloc args[0] to stamp0
     const auto stamp0 = args[0];
@@ -933,7 +949,8 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
         c.add(x86::qword_ptr(args[3]), 1);
         build_get_tsc(c);
         c.sub(x86::rax, stamp0);
-        c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit2)));
+        c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit2));
+        c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
         c.jae(fall);
     });
@@ -986,6 +1003,10 @@ const auto spu_putlluc_tx = build_function_asm<u64(*)(u32 raddr, const void* rda
         c.vzeroupper();
     }
+    c.add(x86::rsp, 40);
+    c.pop(x86::rbx);
+    c.pop(x86::rbp);
     maybe_flush_lbr(c);
     c.ret();
 #else
@@ -1023,11 +1044,13 @@ const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cp
     // Prepare registers
     build_swap_rdx_with(c, args, x86::r10);
-    c.mov(x86::rbp, x86::qword_ptr(reinterpret_cast<u64>(&vm::g_sudo_addr)));
+    c.movabs(x86::rbp, reinterpret_cast<u64>(&vm::g_sudo_addr));
+    c.mov(x86::rbp, x86::qword_ptr(x86::rbp));
     c.lea(x86::rbp, x86::qword_ptr(x86::rbp, args[0]));
     c.and_(args[0].r32(), 0xff80);
     c.shr(args[0].r32(), 1);
-    c.lea(x86::r11, x86::qword_ptr(reinterpret_cast<u64>(+vm::g_reservations), args[0]));
+    c.movabs(x86::r11, reinterpret_cast<u64>(+vm::g_reservations));
+    c.lea(x86::r11, x86::qword_ptr(x86::r11, args[0]));
     // Alloc args[0] to stamp0
     const auto stamp0 = args[0];
@@ -1039,7 +1062,8 @@ const auto spu_getllar_tx = build_function_asm<u64(*)(u32 raddr, void* rdata, cp
         c.add(x86::qword_ptr(args[2], ::offset32(&spu_thread::ftx)), 1);
         build_get_tsc(c);
         c.sub(x86::rax, stamp0);
-        c.cmp(x86::rax, x86::qword_ptr(reinterpret_cast<u64>(&g_rtm_tx_limit1)));
+        c.movabs(x86::rbx, reinterpret_cast<u64>(&g_rtm_tx_limit1));
+        c.cmp(x86::rax, x86::qword_ptr(x86::rbx));
         c.jae(fall);
     });
@@ -4443,7 +4467,7 @@ bool spu_thread::is_exec_code(u32 addr, std::span<const u8> ls_ptr, u32 base_add
     // Detect "invalid" relative branches
    // Branch offsets that, although are the only way to get X code address using relative address
     // Rely on overflow/underflow of SPU memory bounds
     // Thus they would behave differently if SPU LS memory size was to increase (evolving the CELL architecture was the original plan)
     // Making them highly unlikely to be valid code
     if (rel < 0)
@@ -4664,7 +4688,7 @@ bool spu_thread::process_mfc_cmd()
     // Add to chance if previous wait was long enough
     const u32 add_count = zero_count == 3 && total_wait >= 40 ? (total_wait - 39) * 40
         : zero_count == 2 && total_wait >= 11 ? (total_wait - 10) * 40
         : zero_count == 1 && total_wait >= 8 ? (total_wait - 7) * 40
         : zero_count == 0 && total_wait >= 6 ? (total_wait - 5) * 40
         : 0;
@@ -5002,7 +5026,7 @@ bool spu_thread::process_mfc_cmd()
     if (group->spurs_running == max_run - 1)
     {
         // Try to let another thread slip in and take over execution
         thread_ctrl::wait_for(300);
         // Update value
@@ -5027,7 +5051,7 @@ bool spu_thread::process_mfc_cmd()
     if (spurs_last_task_timestamp)
     {
         const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
         spurs_average_task_duration -= avg_entry;
         spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
         spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
         spurs_last_task_timestamp = 0;
@@ -5048,7 +5072,7 @@ bool spu_thread::process_mfc_cmd()
     }
     max_run = group->max_run;
     prev_running = group->spurs_running.fetch_op([max_run](u32& x)
     {
         if (x < max_run)
@@ -5113,7 +5137,7 @@ bool spu_thread::process_mfc_cmd()
     if (spurs_last_task_timestamp)
     {
         const u64 avg_entry = spurs_average_task_duration / spurs_task_count_to_calculate;
         spurs_average_task_duration -= avg_entry;
         spurs_average_task_duration += std::min<u64>(45'000, current - spurs_last_task_timestamp);
         spu_log.trace("duration: %d, avg=%d", current - spurs_last_task_timestamp, spurs_average_task_duration / spurs_task_count_to_calculate);
         spurs_last_task_timestamp = 0;


@@ -96,10 +96,9 @@
       <IgnoreImportLibrary>true</IgnoreImportLibrary>
       <LinkIncremental>false</LinkIncremental>
       <OutputFile>$(OutDir)\rpcs3.exe</OutputFile>
-      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <RandomizedBaseAddress>true</RandomizedBaseAddress>
       <SubSystem>Windows</SubSystem>
       <SuppressStartupBanner>true</SuppressStartupBanner>
-      <BaseAddress>0x10000</BaseAddress>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
     </Link>
     <Midl>
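
For the MSBuild project this is the same switch flip as in the CMake hunk above: <RandomizedBaseAddress>true</RandomizedBaseAddress> maps to the linker's /DYNAMICBASE, and dropping <BaseAddress>0x10000</BaseAddress> removes the /BASE:0x10000 argument.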
@@ -156,10 +155,11 @@
       <GenerateDebugInformation>Debug</GenerateDebugInformation>
       <IgnoreImportLibrary>true</IgnoreImportLibrary>
       <OutputFile>$(OutDir)\rpcs3d.exe</OutputFile>
-      <RandomizedBaseAddress>false</RandomizedBaseAddress>
+      <RandomizedBaseAddress>true</RandomizedBaseAddress>
       <SubSystem>Windows</SubSystem>
       <SuppressStartupBanner>true</SuppressStartupBanner>
-      <BaseAddress>0x10000</BaseAddress>
+      <BaseAddress>
+      </BaseAddress>
       <EntryPointSymbol>mainCRTStartup</EntryPointSymbol>
     </Link>
     <Midl>
@@ -2173,4 +2173,4 @@
     <UserProperties MocDir=".\QTGeneratedFiles\$(ConfigurationName)" Qt5Version_x0020_x64="$(DefaultQtVersion)" RccDir=".\QTGeneratedFiles" UicDir=".\QTGeneratedFiles" />
   </VisualStudio>
 </ProjectExtensions>
 </Project>