mirror of
https://github.com/mupen64plus/mupen64plus-rsp-cxd4.git
synced 2024-05-13 09:49:42 -04:00
Add libretro NEON optimizations
credits: https://github.com/libretro/parallel-n64/tree/master/mupen64plus-rsp-cxd4
This commit is contained in:
parent
c904e8f70e
commit
cc6b8833e3
11
my_types.h
11
my_types.h
|
@ -32,6 +32,11 @@
|
|||
#ifndef _MY_TYPES_H_
|
||||
#define _MY_TYPES_H_
|
||||
|
||||
#if defined(USE_SSE2NEON) && defined(__ARM_NEON__)
|
||||
#include "sse2neon/SSE2NEON.h"
|
||||
#define ARCH_MIN_SSE2
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This is the only method we really need to care about for defining types.
|
||||
*
|
||||
|
@ -493,4 +498,10 @@ typedef struct {
|
|||
} MIPS_type_J;
|
||||
#endif
|
||||
|
||||
#if defined(__arm__) && (defined(__GNUC__) || defined(__clang__))
|
||||
#define COMPILER_FENCE() __asm__ __volatile__("":::"memory")
|
||||
#else
|
||||
#define COMPILER_FENCE()
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
1067
sse2neon/SSE2NEON.h
Normal file
1067
sse2neon/SSE2NEON.h
Normal file
File diff suppressed because it is too large
Load diff
16
su.c
16
su.c
|
@ -1825,7 +1825,7 @@ PROFILE_MODE void MWC2_load(u32 inst)
|
|||
const unsigned int vt = (inst >> 16) % (1 << 5);
|
||||
const unsigned int element = (inst >> 7) % (1 << 4);
|
||||
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
|
||||
offset = (s16)inst;
|
||||
offset <<= 5 + 4; /* safe on x86, skips 5-bit rd, 4-bit element */
|
||||
offset >>= 5 + 4;
|
||||
|
@ -1841,7 +1841,7 @@ PROFILE_MODE void MWC2_store(u32 inst)
|
|||
const unsigned int vt = (inst >> 16) % (1 << 5);
|
||||
const unsigned int element = (inst >> 7) % (1 << 4);
|
||||
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
|
||||
offset = (s16)inst;
|
||||
offset <<= 5 + 4; /* safe on x86, skips 5-bit rd, 4-bit element */
|
||||
offset >>= 5 + 4;
|
||||
|
@ -1911,6 +1911,12 @@ PROFILE_MODE void COP2(u32 inst)
|
|||
case 022:
|
||||
case 023:
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
#ifdef __ARM_NEON__
|
||||
target = (v16)vld1q_u16(&VR[vt][0 + op - 0x12]);
|
||||
target = (v16)vshlq_n_u32((uint32x4_t)target, 16);
|
||||
target = (v16)vorrq_u16((uint16x8_t)target,
|
||||
(uint16x8_t)vshrq_n_u32((uint32x4_t)target, 16));
|
||||
#else
|
||||
shuffle_temporary[0] = VR[vt][0 + op - 0x12];
|
||||
shuffle_temporary[2] = VR[vt][2 + op - 0x12];
|
||||
shuffle_temporary[4] = VR[vt][4 + op - 0x12];
|
||||
|
@ -1918,6 +1924,7 @@ PROFILE_MODE void COP2(u32 inst)
|
|||
target = *(v16 *)(&shuffle_temporary[0]);
|
||||
target = _mm_shufflehi_epi16(target, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
target = _mm_shufflelo_epi16(target, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
#endif
|
||||
*(v16 *)(VR[vd]) = COP2_C2[func](*(v16 *)VR[vs], target);
|
||||
#else
|
||||
for (i = 0; i < N; i++)
|
||||
|
@ -1931,11 +1938,16 @@ PROFILE_MODE void COP2(u32 inst)
|
|||
case 026:
|
||||
case 027:
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
#ifdef __ARM_NEON__
|
||||
target = (v16)vcombine_s16(vdup_n_s16(VR[vt][0 + op - 0x14]),
|
||||
vdup_n_s16(VR[vt][4 + op - 0x14]));
|
||||
#else
|
||||
target = _mm_setzero_si128();
|
||||
target = _mm_insert_epi16(target, VR[vt][0 + op - 0x14], 0);
|
||||
target = _mm_insert_epi16(target, VR[vt][4 + op - 0x14], 4);
|
||||
target = _mm_shufflehi_epi16(target, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
target = _mm_shufflelo_epi16(target, _MM_SHUFFLE(0, 0, 0, 0));
|
||||
#endif
|
||||
*(v16 *)(VR[vd]) = COP2_C2[func](*(v16 *)VR[vs], target);
|
||||
#else
|
||||
for (i = 0; i < N; i++)
|
||||
|
|
2
su.h
2
su.h
|
@ -182,7 +182,7 @@ extern void set_PC(unsigned int address);
|
|||
*
|
||||
* Some of these also will only work assuming 2's complement (e.g., Intel).
|
||||
*/
|
||||
#if defined(ARCH_MIN_SSE2)
|
||||
#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
|
||||
#define MASK_SA(sa) (sa)
|
||||
#define IW_RD(inst) ((u16)(inst) >> 11)
|
||||
#define SIGNED_IMM16(imm) (s16)(imm)
|
||||
|
|
5
vu/add.c
5
vu/add.c
|
@ -227,6 +227,7 @@ VECTOR_OPERATION VADD(v16 vs, v16 vt)
|
|||
#endif
|
||||
clr_ci(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -251,6 +252,7 @@ VECTOR_OPERATION VSUB(v16 vs, v16 vt)
|
|||
#endif
|
||||
clr_bi(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -275,6 +277,7 @@ VECTOR_OPERATION VABS(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_abs(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -299,6 +302,7 @@ VECTOR_OPERATION VADDC(v16 vs, v16 vt)
|
|||
#endif
|
||||
set_co(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -323,6 +327,7 @@ VECTOR_OPERATION VSUBC(v16 vs, v16 vt)
|
|||
#endif
|
||||
set_bo(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
|
|
@ -1139,6 +1139,7 @@ VECTOR_OPERATION VRCP(v16 vs, v16 vt)
|
|||
VR[result][source & 07] = (i16)DivOut;
|
||||
DPH = SP_DIV_PRECISION_SINGLE;
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -1166,6 +1167,7 @@ VECTOR_OPERATION VRCPL(v16 vs, v16 vt)
|
|||
VR[result][source & 07] = (i16)DivOut;
|
||||
DPH = SP_DIV_PRECISION_SINGLE;
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -1191,6 +1193,7 @@ VECTOR_OPERATION VRCPH(v16 vs, v16 vt)
|
|||
VR[result][source & 07] = DivOut >> 16;
|
||||
DPH = SP_DIV_PRECISION_DOUBLE;
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -1213,6 +1216,7 @@ VECTOR_OPERATION VMOV(v16 vs, v16 vt)
|
|||
#endif
|
||||
VR[result][source & 07] = VACC_L[element];
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -1239,6 +1243,7 @@ VECTOR_OPERATION VRSQ(v16 vs, v16 vt)
|
|||
VR[result][source & 07] = (i16)DivOut;
|
||||
DPH = SP_DIV_PRECISION_SINGLE;
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -1266,6 +1271,7 @@ VECTOR_OPERATION VRSQL(v16 vs, v16 vt)
|
|||
VR[result][source & 07] = (i16)DivOut;
|
||||
DPH = SP_DIV_PRECISION_SINGLE;
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -1291,6 +1297,7 @@ VECTOR_OPERATION VRSQH(v16 vs, v16 vt)
|
|||
VR[result][source & 07] = DivOut >> 16;
|
||||
DPH = SP_DIV_PRECISION_DOUBLE;
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VR[result];
|
||||
return (vs);
|
||||
#else
|
||||
|
|
|
@ -447,6 +447,7 @@ VECTOR_OPERATION VMACF(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_macf(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -471,6 +472,7 @@ VECTOR_OPERATION VMACU(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_macu(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
|
|
@ -360,6 +360,7 @@ VECTOR_OPERATION VLT(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_lt(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -384,6 +385,7 @@ VECTOR_OPERATION VEQ(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_eq(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -408,6 +410,7 @@ VECTOR_OPERATION VNE(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_ne(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -432,6 +435,7 @@ VECTOR_OPERATION VGE(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_ge(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -456,6 +460,7 @@ VECTOR_OPERATION VCL(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_cl(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -480,6 +485,7 @@ VECTOR_OPERATION VCH(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_ch(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -504,6 +510,7 @@ VECTOR_OPERATION VCR(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_cr(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
@ -528,6 +535,7 @@ VECTOR_OPERATION VMRG(v16 vs, v16 vt)
|
|||
#endif
|
||||
do_mrg(VD, VS, VT);
|
||||
#ifdef ARCH_MIN_SSE2
|
||||
COMPILER_FENCE();
|
||||
vs = *(v16 *)VD;
|
||||
return (vs);
|
||||
#else
|
||||
|
|
Loading…
Reference in a new issue