JitArm64: Increase farcode & nearcode cache size

This is a JitArm64 version of 219610d8a0.

Due to limitations on how far you can jump with a single AArch64 branch
instruction, going above the former limit of 128 MiB of code (counting
nearcode and farcode combined) requires a bit of restructuring. With the
restructuring in place, the limit now is 256 MiB. See the new large
comment in Jit.h for a description of the new memory layout.
Author: JosJuice
Date: 2024-03-24 11:49:47 +01:00
Parent: b6f0e8876e
Commit: e8154a529f
3 changed files with 148 additions and 47 deletions

View file

@ -82,6 +82,10 @@ public:
} }
bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); } bool IsInSpace(const u8* ptr) const { return ptr >= region && ptr < (region + region_size); }
bool IsInSpaceOrChildSpace(const u8* ptr) const
{
return ptr >= region && ptr < (region + total_region_size);
}
void WriteProtect(bool allow_execute) void WriteProtect(bool allow_execute)
{ {
Common::WriteProtectMemory(region, region_size, allow_execute); Common::WriteProtectMemory(region, region_size, allow_execute);
@ -106,7 +110,7 @@ public:
bool HasChildren() const { return region_size != total_region_size; } bool HasChildren() const { return region_size != total_region_size; }
u8* AllocChildCodeSpace(size_t child_size) u8* AllocChildCodeSpace(size_t child_size)
{ {
ASSERT_MSG(DYNA_REC, child_size < GetSpaceLeft(), "Insufficient space for child allocation."); ASSERT_MSG(DYNA_REC, child_size <= GetSpaceLeft(), "Insufficient space for child allocation.");
u8* child_region = region + region_size - child_size; u8* child_region = region + region_size - child_size;
region_size -= child_size; region_size -= child_size;
ResetCodePtr(); ResetCodePtr();

View file

@ -4,6 +4,7 @@
#include "Core/PowerPC/JitArm64/Jit.h" #include "Core/PowerPC/JitArm64/Jit.h"
#include <cstdio> #include <cstdio>
#include <optional>
#include "Common/Arm64Emitter.h" #include "Common/Arm64Emitter.h"
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
@ -29,13 +30,13 @@
using namespace Arm64Gen; using namespace Arm64Gen;
constexpr size_t CODE_SIZE = 1024 * 1024 * 32; constexpr size_t NEAR_CODE_SIZE = 1024 * 1024 * 64;
// We use a bigger farcode size for JitArm64 than Jit64, because JitArm64 always emits farcode // We use a bigger farcode size for JitArm64 than Jit64, because JitArm64 always emits farcode
// for the slow path of each loadstore instruction. Jit64 postpones emitting farcode until the // for the slow path of each loadstore instruction. Jit64 postpones emitting farcode until the
// farcode actually is needed, saving it from having to emit farcode for most instructions. // farcode actually is needed, saving it from having to emit farcode for most instructions.
// TODO: Perhaps implement something similar to Jit64. But using more RAM isn't much of a problem. // TODO: Perhaps implement something similar to Jit64. But using more RAM isn't much of a problem.
constexpr size_t FARCODE_SIZE = 1024 * 1024 * 64; constexpr size_t FAR_CODE_SIZE = 1024 * 1024 * 64;
constexpr size_t FARCODE_SIZE_MMU = 1024 * 1024 * 64; constexpr size_t TOTAL_CODE_SIZE = NEAR_CODE_SIZE * 2 + FAR_CODE_SIZE * 2;
JitArm64::JitArm64(Core::System& system) : JitBase(system), m_float_emit(this) JitArm64::JitArm64(Core::System& system) : JitBase(system), m_float_emit(this)
{ {
@ -49,9 +50,18 @@ void JitArm64::Init()
RefreshConfig(); RefreshConfig();
const size_t child_code_size = jo.memcheck ? FARCODE_SIZE_MMU : FARCODE_SIZE; // We want the regions to be laid out in this order in memory:
AllocCodeSpace(CODE_SIZE + child_code_size); // m_far_code_0, m_near_code_0, m_near_code_1, m_far_code_1.
AddChildCodeSpace(&m_far_code, child_code_size); // AddChildCodeSpace grabs space from the end of the parent region,
// so we have to call AddChildCodeSpace in reverse order.
AllocCodeSpace(TOTAL_CODE_SIZE);
AddChildCodeSpace(&m_far_code_1, FAR_CODE_SIZE);
AddChildCodeSpace(&m_near_code_1, NEAR_CODE_SIZE);
AddChildCodeSpace(&m_near_code_0, NEAR_CODE_SIZE);
AddChildCodeSpace(&m_far_code_0, FAR_CODE_SIZE);
ASSERT(m_far_code_0.GetCodeEnd() == m_near_code_0.GetCodePtr());
ASSERT(m_near_code_0.GetCodeEnd() == m_near_code_1.GetCodePtr());
ASSERT(m_near_code_1.GetCodeEnd() == m_far_code_1.GetCodePtr());
jo.optimizeGatherPipe = true; jo.optimizeGatherPipe = true;
SetBlockLinkingEnabled(true); SetBlockLinkingEnabled(true);
@ -66,9 +76,7 @@ void JitArm64::Init()
InitBLROptimization(); InitBLROptimization();
GenerateAsm(); GenerateAsmAndResetFreeMemoryRanges();
ResetFreeMemoryRanges();
} }
void JitArm64::SetBlockLinkingEnabled(bool enabled) void JitArm64::SetBlockLinkingEnabled(bool enabled)
@ -113,7 +121,7 @@ bool JitArm64::HandleFault(uintptr_t access_address, SContext* ctx)
success = HandleStackFault(); success = HandleStackFault();
// If the fault is in JIT code space, look for fastmem areas. // If the fault is in JIT code space, look for fastmem areas.
if (!success && IsInSpace(reinterpret_cast<u8*>(ctx->CTX_PC))) if (!success && IsInSpaceOrChildSpace(reinterpret_cast<u8*>(ctx->CTX_PC)))
{ {
auto& memory = m_system.GetMemory(); auto& memory = m_system.GetMemory();
if (memory.IsAddressInFastmemArea(reinterpret_cast<u8*>(access_address))) if (memory.IsAddressInFastmemArea(reinterpret_cast<u8*>(access_address)))
@ -153,22 +161,47 @@ void JitArm64::ClearCache()
blocks.Clear(); blocks.Clear();
blocks.ClearRangesToFree(); blocks.ClearRangesToFree();
const Common::ScopedJITPageWriteAndNoExecute enable_jit_page_writes; const Common::ScopedJITPageWriteAndNoExecute enable_jit_page_writes;
ClearCodeSpace(); m_far_code_0.ClearCodeSpace();
m_far_code.ClearCodeSpace(); m_near_code_0.ClearCodeSpace();
m_near_code_1.ClearCodeSpace();
m_far_code_1.ClearCodeSpace();
RefreshConfig(); RefreshConfig();
GenerateAsmAndResetFreeMemoryRanges();
}
void JitArm64::GenerateAsmAndResetFreeMemoryRanges()
{
SetCodePtr(m_near_code_1.GetWritableCodePtr(), m_near_code_1.GetWritableCodeEnd());
m_far_code.SetCodePtr(m_far_code_1.GetWritableCodePtr(), m_far_code_1.GetWritableCodeEnd());
const u8* routines_near_start = GetCodePtr();
const u8* routines_far_start = m_far_code.GetCodePtr();
GenerateAsm(); GenerateAsm();
ResetFreeMemoryRanges(); const u8* routines_near_end = GetCodePtr();
const u8* routines_far_end = m_far_code.GetCodePtr();
ResetFreeMemoryRanges(routines_near_end - routines_near_start,
routines_far_end - routines_far_start);
} }
void JitArm64::ResetFreeMemoryRanges() void JitArm64::ResetFreeMemoryRanges(size_t routines_near_size, size_t routines_far_size)
{ {
// Set the near and far code regions as unused. // Set the near and far code regions as unused.
m_free_ranges_near.clear(); m_free_ranges_far_0.clear();
m_free_ranges_near.insert(GetWritableCodePtr(), GetWritableCodeEnd()); m_free_ranges_far_0.insert(m_far_code_0.GetWritableCodePtr() + routines_near_size,
m_free_ranges_far.clear(); m_far_code_0.GetWritableCodeEnd());
m_free_ranges_far.insert(m_far_code.GetWritableCodePtr(), m_far_code.GetWritableCodeEnd()); m_free_ranges_near_0.clear();
m_free_ranges_near_0.insert(m_near_code_0.GetWritableCodePtr(),
m_near_code_0.GetWritableCodeEnd());
m_free_ranges_near_1.clear();
m_free_ranges_near_1.insert(m_near_code_1.GetWritableCodePtr() + routines_near_size,
m_near_code_1.GetWritableCodeEnd());
m_free_ranges_far_1.clear();
m_free_ranges_far_1.insert(m_far_code_1.GetWritableCodePtr() + routines_far_size,
m_far_code_1.GetWritableCodeEnd());
} }
void JitArm64::Shutdown() void JitArm64::Shutdown()
@ -889,11 +922,17 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
++last_fastmem_area; ++last_fastmem_area;
m_fault_to_handler.erase(first_fastmem_area, last_fastmem_area); m_fault_to_handler.erase(first_fastmem_area, last_fastmem_area);
m_free_ranges_near.insert(range.first, range.second); if (range.first < m_near_code_0.GetCodeEnd())
m_free_ranges_near_0.insert(range.first, range.second);
else
m_free_ranges_near_1.insert(range.first, range.second);
} }
for (auto range : blocks.GetRangesToFreeFar()) for (auto range : blocks.GetRangesToFreeFar())
{ {
m_free_ranges_far.insert(range.first, range.second); if (range.first < m_far_code_0.GetCodeEnd())
m_free_ranges_far_0.insert(range.first, range.second);
else
m_free_ranges_far_1.insert(range.first, range.second);
} }
blocks.ClearRangesToFree(); blocks.ClearRangesToFree();
@ -939,7 +978,7 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
return; return;
} }
if (SetEmitterStateToFreeCodeRegion()) if (std::optional<size_t> code_region_index = SetEmitterStateToFreeCodeRegion())
{ {
u8* near_start = GetWritableCodePtr(); u8* near_start = GetWritableCodePtr();
u8* far_start = m_far_code.GetWritableCodePtr(); u8* far_start = m_far_code.GetWritableCodePtr();
@ -952,10 +991,16 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
// Mark the memory regions that this code block uses as used in the local rangesets. // Mark the memory regions that this code block uses as used in the local rangesets.
u8* near_end = GetWritableCodePtr(); u8* near_end = GetWritableCodePtr();
if (near_start != near_end) if (near_start != near_end)
m_free_ranges_near.erase(near_start, near_end); {
(code_region_index == 0 ? m_free_ranges_near_0 : m_free_ranges_near_1)
.erase(near_start, near_end);
}
u8* far_end = m_far_code.GetWritableCodePtr(); u8* far_end = m_far_code.GetWritableCodePtr();
if (far_start != far_end) if (far_start != far_end)
m_free_ranges_far.erase(far_start, far_end); {
(code_region_index == 0 ? m_free_ranges_far_0 : m_free_ranges_far_1)
.erase(far_start, far_end);
}
// Store the used memory regions in the block so we know what to mark as unused when the // Store the used memory regions in the block so we know what to mark as unused when the
// block gets invalidated. // block gets invalidated.
@ -984,27 +1029,52 @@ void JitArm64::Jit(u32 em_address, bool clear_cache_and_retry_on_failure)
exit(-1); exit(-1);
} }
bool JitArm64::SetEmitterStateToFreeCodeRegion() std::optional<size_t> JitArm64::SetEmitterStateToFreeCodeRegion()
{ {
// Find the largest free memory blocks and set code emitters to point at them. // Find some large free memory blocks and set code emitters to point at them. If we can't find
// If we can't find a free block return false instead, which will trigger a JIT cache clear. // free blocks, return std::nullopt instead, which will trigger a JIT cache clear.
auto free_near = m_free_ranges_near.by_size_begin(); const auto free_near_0 = m_free_ranges_near_0.by_size_begin();
if (free_near == m_free_ranges_near.by_size_end()) const auto free_near_1 = m_free_ranges_near_1.by_size_begin();
{ const auto free_far_0 = m_free_ranges_far_0.by_size_begin();
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in near code region."); const auto free_far_1 = m_free_ranges_far_1.by_size_begin();
return false;
}
SetCodePtr(free_near.from(), free_near.to());
auto free_far = m_free_ranges_far.by_size_begin(); const size_t free_near_1_size = free_near_1.to() - free_near_1.from();
if (free_far == m_free_ranges_far.by_size_end()) const size_t free_far_1_size = free_far_1.to() - free_far_1.from();
{ const size_t free_1_smallest_size = std::min(free_near_1_size, free_far_1_size);
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in far code region.");
return false;
}
m_far_code.SetCodePtr(free_far.from(), free_far.to());
return true; if (free_1_smallest_size >= 1024 * 1024)
{
// Don't use region 0 unless region 1 is getting full. This improves cache friendliness.
SetCodePtr(free_near_1.from(), free_near_1.to());
m_far_code.SetCodePtr(free_far_1.from(), free_far_1.to());
return std::make_optional(1);
}
const size_t free_near_0_size = free_near_0.to() - free_near_0.from();
const size_t free_far_0_size = free_far_0.to() - free_far_0.from();
const size_t free_0_smallest_size = std::min(free_near_0_size, free_far_0_size);
if (free_0_smallest_size == 0 && free_1_smallest_size == 0)
{
if (free_near_0_size == 0 && free_near_1_size == 0)
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in near code regions.");
else if (free_far_0_size == 0 && free_far_1_size == 0)
WARN_LOG_FMT(DYNA_REC, "Failed to find free memory region in far code regions.");
return std::nullopt;
}
if (free_0_smallest_size > free_1_smallest_size)
{
SetCodePtr(free_near_0.from(), free_near_0.to());
m_far_code.SetCodePtr(free_far_0.from(), free_far_0.to());
return std::make_optional(0);
}
else
{
SetCodePtr(free_near_1.from(), free_near_1.to());
m_far_code.SetCodePtr(free_far_1.from(), free_far_1.to());
return std::make_optional(1);
}
} }
bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC) bool JitArm64::DoJit(u32 em_address, JitBlock* b, u32 nextPC)

View file

@ -5,6 +5,7 @@
#include <cstddef> #include <cstddef>
#include <map> #include <map>
#include <optional>
#include <tuple> #include <tuple>
#include <rangeset/rangesizeset.h> #include <rangeset/rangesizeset.h>
@ -285,14 +286,16 @@ protected:
void Trace(); void Trace();
// Finds a free memory region and sets the near and far code emitters to point at that region. // Finds a free memory region and sets the near and far code emitters to point at that region.
// Returns false if no free memory region can be found for either of the two. // On success, returns the index of the memory region (either 0 or 1).
bool SetEmitterStateToFreeCodeRegion(); // If either near code or far code is full, returns std::nullopt.
std::optional<size_t> SetEmitterStateToFreeCodeRegion();
void DoDownCount(); void DoDownCount();
void Cleanup(); void Cleanup();
void ResetStack(); void ResetStack();
void ResetFreeMemoryRanges(); void GenerateAsmAndResetFreeMemoryRanges();
void ResetFreeMemoryRanges(size_t routines_near_size, size_t routines_far_size);
void IntializeSpeculativeConstants(); void IntializeSpeculativeConstants();
@ -372,6 +375,28 @@ protected:
Arm64Gen::ARM64FloatEmitter m_float_emit; Arm64Gen::ARM64FloatEmitter m_float_emit;
// Because B instructions can't jump farther than +/- 128 MiB, code memory is allocated like this:
//
// m_far_code_0: x MiB of unused space, followed by 64 - x MiB of far code
// m_near_code_0: 64 MiB of near code
// m_near_code_1: x MiB of asm routines, followed by 64 - x MiB of near code
// m_far_code_1: 64 MiB of far code
//
// This ensures that:
//
// * Any code in m_near_code_0 can reach any code in m_far_code_0, and vice versa
// * Any code in m_near_code_1 can reach any code in m_far_code_1, and vice versa
// * Any near code can reach any near code
// * Any code can reach any asm routine
//
// m_far_code_0 and m_far_code_1 can't reach each other, but that isn't needed, because all blocks
// have their entry points in near code.
Arm64Gen::ARM64CodeBlock m_near_code_0;
Arm64Gen::ARM64CodeBlock m_near_code_1;
Arm64Gen::ARM64CodeBlock m_far_code_0;
Arm64Gen::ARM64CodeBlock m_far_code_1;
Arm64Gen::ARM64CodeBlock m_far_code; Arm64Gen::ARM64CodeBlock m_far_code;
bool m_in_far_code = false; bool m_in_far_code = false;
@ -380,6 +405,8 @@ protected:
u8* m_near_code_end = nullptr; u8* m_near_code_end = nullptr;
bool m_near_code_write_failed = false; bool m_near_code_write_failed = false;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near; HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near_0;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far; HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_near_1;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far_0;
HyoutaUtilities::RangeSizeSet<u8*> m_free_ranges_far_1;
}; };