Add libretro NEON optimizations

credits: https://github.com/libretro/parallel-n64/tree/master/mupen64plus-rsp-cxd4
Francisco Zurita 2017-03-04 23:36:21 -05:00
parent c904e8f70e
commit cc6b8833e3
9 changed files with 1116 additions and 4 deletions

my_types.h (11 additions)

@@ -32,6 +32,11 @@
 #ifndef _MY_TYPES_H_
 #define _MY_TYPES_H_
+#if defined(USE_SSE2NEON) && defined(__ARM_NEON__)
+#include "sse2neon/SSE2NEON.h"
+#define ARCH_MIN_SSE2
+#endif
 /*
  * This is the only method we really need to care about for defining types.
  *
@@ -493,4 +498,10 @@ typedef struct {
 } MIPS_type_J;
 #endif
+
+#if defined(__arm__) && (defined(__GNUC__) || defined(__clang__))
+#define COMPILER_FENCE() __asm__ __volatile__("":::"memory")
+#else
+#define COMPILER_FENCE()
+#endif
 #endif
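The COMPILER_FENCE() added here is a compiler-only barrier: an empty asm statement with a "memory" clobber emits no instructions but stops GCC/Clang from reordering or caching memory accesses across it. The vector routines patched below fill a destination array through scalar helpers and then reload it with a type-punning vector load, which is presumably why the fence is needed on the ARM build. A minimal sketch of that pattern, with hypothetical names (the real code uses v16, VD, and helpers like clr_ci):

    #include <stdint.h>

    #define COMPILER_FENCE() __asm__ __volatile__("" ::: "memory")

    typedef int16_t v8i16 __attribute__((vector_size(16)));

    static int16_t VD[8];               /* scalar view of the destination */

    v8i16 vector_op(const int16_t *vs, const int16_t *vt)
    {
        int i;
        for (i = 0; i < 8; i++)         /* scalar element-wise work */
            VD[i] = (int16_t)(vs[i] + vt[i]);
        COMPILER_FENCE();               /* keep the stores above from being
                                           moved past the punned load below */
        return *(v8i16 *)VD;
    }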

sse2neon/SSE2NEON.h (new file, 1067 additions)

(File diff suppressed because it is too large.)
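sse2neon is a header-only shim that re-implements the SSE2 intrinsic API on top of NEON, which is what lets the existing ARCH_MIN_SSE2 code paths compile on ARM at all. A simplified sketch of the kind of definition such a header contains (illustrative only, not the actual contents of SSE2NEON.h):

    #include <arm_neon.h>

    typedef int64x2_t __m128i;          /* simplified vector wrapper */

    static inline __m128i _mm_add_epi16(__m128i a, __m128i b)
    {
        return vreinterpretq_s64_s16(
            vaddq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
    }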

su.c (14 additions, 2 deletions)

@@ -1825,7 +1825,7 @@ PROFILE_MODE void MWC2_load(u32 inst)
     const unsigned int vt = (inst >> 16) % (1 << 5);
     const unsigned int element = (inst >> 7) % (1 << 4);
-#ifdef ARCH_MIN_SSE2
+#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
     offset = (s16)inst;
     offset <<= 5 + 4; /* safe on x86, skips 5-bit rd, 4-bit element */
     offset >>= 5 + 4;
@@ -1841,7 +1841,7 @@ PROFILE_MODE void MWC2_store(u32 inst)
     const unsigned int vt = (inst >> 16) % (1 << 5);
     const unsigned int element = (inst >> 7) % (1 << 4);
-#ifdef ARCH_MIN_SSE2
+#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
     offset = (s16)inst;
     offset <<= 5 + 4; /* safe on x86, skips 5-bit rd, 4-bit element */
     offset >>= 5 + 4;
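Both MWC2 guards now exclude the translated build from the shift trick: offset <<= 5 + 4 can left-shift a negative s16, which is undefined behavior in ISO C, and the /* safe on x86 */ comment only vouches for what x86 compilers do with it. A portable way to sign-extend the 7-bit offset field (bits 6..0 of the instruction word, below the 5-bit rd and 4-bit element fields) is the usual XOR trick; a hedged sketch, not necessarily what the fallback branch actually does:

    #include <stdint.h>

    /* sign-extend bits 6..0 of inst without shifting a signed value */
    static int32_t mwc2_offset(uint32_t inst)
    {
        return (int32_t)((inst & 0x7Fu) ^ 0x40u) - 0x40;
    }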
@@ -1911,6 +1911,12 @@ PROFILE_MODE void COP2(u32 inst)
     case 022:
     case 023:
 #ifdef ARCH_MIN_SSE2
+#ifdef __ARM_NEON__
+        target = (v16)vld1q_u16(&VR[vt][0 + op - 0x12]);
+        target = (v16)vshlq_n_u32((uint32x4_t)target, 16);
+        target = (v16)vorrq_u16((uint16x8_t)target,
+            (uint16x8_t)vshrq_n_u32((uint32x4_t)target, 16));
+#else
         shuffle_temporary[0] = VR[vt][0 + op - 0x12];
         shuffle_temporary[2] = VR[vt][2 + op - 0x12];
         shuffle_temporary[4] = VR[vt][4 + op - 0x12];
@@ -1918,6 +1924,7 @@ PROFILE_MODE void COP2(u32 inst)
         target = *(v16 *)(&shuffle_temporary[0]);
         target = _mm_shufflehi_epi16(target, _MM_SHUFFLE(2, 2, 0, 0));
         target = _mm_shufflelo_epi16(target, _MM_SHUFFLE(2, 2, 0, 0));
+#endif
         *(v16 *)(VR[vd]) = COP2_C2[func](*(v16 *)VR[vs], target);
 #else
         for (i = 0; i < N; i++)
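The NEON path replaces the staging array and double shuffle with two shifts and an OR on 32-bit lanes: the left shift leaves each lane as [even|0], and OR-ing in the same value shifted back right makes it [even|even], duplicating the even 16-bit element of each pair exactly as the two _MM_SHUFFLE(2, 2, 0, 0) passes do. A standalone version of the idiom, using vreinterpret in place of the diff's casts:

    #include <arm_neon.h>

    /* each 32-bit lane [hi|lo] -> [lo|lo]: broadcast the even 16-bit
       element of every pair (little-endian lane order) */
    uint16x8_t dup_even_elements(uint16x8_t v)
    {
        uint32x4_t w = vshlq_n_u32(vreinterpretq_u32_u16(v), 16);
        w = vorrq_u32(w, vshrq_n_u32(w, 16));
        return vreinterpretq_u16_u32(w);
    }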
@@ -1931,11 +1938,16 @@ PROFILE_MODE void COP2(u32 inst)
     case 026:
     case 027:
 #ifdef ARCH_MIN_SSE2
+#ifdef __ARM_NEON__
+        target = (v16)vcombine_s16(vdup_n_s16(VR[vt][0 + op - 0x14]),
+            vdup_n_s16(VR[vt][4 + op - 0x14]));
+#else
         target = _mm_setzero_si128();
         target = _mm_insert_epi16(target, VR[vt][0 + op - 0x14], 0);
         target = _mm_insert_epi16(target, VR[vt][4 + op - 0x14], 4);
         target = _mm_shufflehi_epi16(target, _MM_SHUFFLE(0, 0, 0, 0));
         target = _mm_shufflelo_epi16(target, _MM_SHUFFLE(0, 0, 0, 0));
+#endif
         *(v16 *)(VR[vd]) = COP2_C2[func](*(v16 *)VR[vs], target);
 #else
         for (i = 0; i < N; i++)
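For the half-vector broadcasts the NEON version is a single expression: vdup_n_s16 splats each scalar across a 64-bit half and vcombine_s16 joins the halves, replacing SSE2's zero/insert/shuffle sequence. In isolation:

    #include <arm_neon.h>

    /* {x, x, x, x, y, y, y, y}, as built from VR[vt] above */
    int16x8_t splat_halves(int16_t x, int16_t y)
    {
        return vcombine_s16(vdup_n_s16(x), vdup_n_s16(y));
    }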

su.h (1 addition, 1 deletion)

@@ -182,7 +182,7 @@ extern void set_PC(unsigned int address);
  *
  * Some of these also will only work assuming 2's complement (e.g., Intel).
  */
-#if defined(ARCH_MIN_SSE2)
+#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
 #define MASK_SA(sa) (sa)
 #define IW_RD(inst) ((u16)(inst) >> 11)
 #define SIGNED_IMM16(imm) (s16)(imm)

vu/add.c (5 additions)

@@ -227,6 +227,7 @@ VECTOR_OPERATION VADD(v16 vs, v16 vt)
 #endif
     clr_ci(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -251,6 +252,7 @@ VECTOR_OPERATION VSUB(v16 vs, v16 vt)
 #endif
     clr_bi(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -275,6 +277,7 @@ VECTOR_OPERATION VABS(v16 vs, v16 vt)
 #endif
     do_abs(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -299,6 +302,7 @@ VECTOR_OPERATION VADDC(v16 vs, v16 vt)
 #endif
     set_co(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -323,6 +327,7 @@ VECTOR_OPERATION VSUBC(v16 vs, v16 vt)
 #endif
     set_bo(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else

vu/divide.c (7 additions)

@@ -1139,6 +1139,7 @@ VECTOR_OPERATION VRCP(v16 vs, v16 vt)
     VR[result][source & 07] = (i16)DivOut;
     DPH = SP_DIV_PRECISION_SINGLE;
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else
@@ -1166,6 +1167,7 @@ VECTOR_OPERATION VRCPL(v16 vs, v16 vt)
     VR[result][source & 07] = (i16)DivOut;
     DPH = SP_DIV_PRECISION_SINGLE;
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else
@@ -1191,6 +1193,7 @@ VECTOR_OPERATION VRCPH(v16 vs, v16 vt)
     VR[result][source & 07] = DivOut >> 16;
     DPH = SP_DIV_PRECISION_DOUBLE;
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else
@@ -1213,6 +1216,7 @@ VECTOR_OPERATION VMOV(v16 vs, v16 vt)
 #endif
     VR[result][source & 07] = VACC_L[element];
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else
@@ -1239,6 +1243,7 @@ VECTOR_OPERATION VRSQ(v16 vs, v16 vt)
     VR[result][source & 07] = (i16)DivOut;
     DPH = SP_DIV_PRECISION_SINGLE;
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else
@@ -1266,6 +1271,7 @@ VECTOR_OPERATION VRSQL(v16 vs, v16 vt)
     VR[result][source & 07] = (i16)DivOut;
     DPH = SP_DIV_PRECISION_SINGLE;
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else
@@ -1291,6 +1297,7 @@ VECTOR_OPERATION VRSQH(v16 vs, v16 vt)
     VR[result][source & 07] = DivOut >> 16;
     DPH = SP_DIV_PRECISION_DOUBLE;
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VR[result];
     return (vs);
 #else

vu/multiply.c (2 additions)

@@ -447,6 +447,7 @@ VECTOR_OPERATION VMACF(v16 vs, v16 vt)
 #endif
     do_macf(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -471,6 +472,7 @@ VECTOR_OPERATION VMACU(v16 vs, v16 vt)
 #endif
     do_macu(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else

vu/select.c (8 additions)

@@ -360,6 +360,7 @@ VECTOR_OPERATION VLT(v16 vs, v16 vt)
 #endif
     do_lt(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -384,6 +385,7 @@ VECTOR_OPERATION VEQ(v16 vs, v16 vt)
 #endif
     do_eq(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -408,6 +410,7 @@ VECTOR_OPERATION VNE(v16 vs, v16 vt)
 #endif
     do_ne(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -432,6 +435,7 @@ VECTOR_OPERATION VGE(v16 vs, v16 vt)
 #endif
     do_ge(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -456,6 +460,7 @@ VECTOR_OPERATION VCL(v16 vs, v16 vt)
 #endif
     do_cl(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -480,6 +485,7 @@ VECTOR_OPERATION VCH(v16 vs, v16 vt)
 #endif
     do_ch(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -504,6 +510,7 @@ VECTOR_OPERATION VCR(v16 vs, v16 vt)
 #endif
     do_cr(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else
@@ -528,6 +535,7 @@ VECTOR_OPERATION VMRG(v16 vs, v16 vt)
 #endif
     do_mrg(VD, VS, VT);
 #ifdef ARCH_MIN_SSE2
+    COMPILER_FENCE();
     vs = *(v16 *)VD;
     return (vs);
 #else

vu/vu.h (1 addition, 1 deletion)

@@ -15,7 +15,7 @@
 #ifndef _VU_H_
 #define _VU_H_
-#ifdef ARCH_MIN_SSE2
+#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
 #include <emmintrin.h>
 #endif