mupen64plus-rsp-cxd4/su.h
/******************************************************************************\
* Project: Basic MIPS R4000 Instruction Set for Scalar Unit Operations *
* Authors: Iconoclast *
* Release: 2018.03.17 *
* License: CC0 Public Domain Dedication *
* *
* To the extent possible under law, the author(s) have dedicated all copyright *
* and related and neighboring rights to this software to the public domain *
* worldwide. This software is distributed without any warranty. *
* *
* You should have received a copy of the CC0 Public Domain Dedication along *
* with this software. *
* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. *
\******************************************************************************/
#ifndef _SU_H_
#define _SU_H_
#include <limits.h>
#include <stdio.h>
#include "my_types.h"
#include "rsp.h"
#define SEMAPHORE_LOCK_CORRECTIONS
#define WAIT_FOR_CPU_HOST
#if (0)
#define SP_EXECUTE_LOG
#define VU_EMULATE_SCALAR_ACCUMULATOR_READ
#endif
/*
* Currently, the plugin system this module is written for doesn't notify us
* of how much RDRAM is installed in the system, so we'll use signal handlers
* to catch memory access faults during a trial search that determines it.
*/
extern unsigned long su_max_address;
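/*
* A minimal POSIX-style sketch of that trial-search idea (hypothetical code,
* not what this plugin ships; it assumes <signal.h> and <setjmp.h>, and the
* helper name probe_rdram_limit plus the 1-MiB probe step are made up here):
*
*     static sigjmp_buf probe_env;
*     static void probe_fault(int sig) { siglongjmp(probe_env, 1); }
*
*     static unsigned long probe_rdram_limit(const unsigned char * base)
*     {
*         volatile unsigned char sink;
*         unsigned long size, limit = 0;
*
*         signal(SIGSEGV, probe_fault);
*         for (size = 0x100000UL; size <= 0x800000UL; size += 0x100000UL)
*             if (sigsetjmp(probe_env, 1) == 0) {
*                 sink = base[size - 1];
*                 limit = size;
*             }
*         signal(SIGSEGV, SIG_DFL);
*         return (limit != 0) ? limit - 1 : 0;
*     }
*
* The returned value is the kind of upper bound su_max_address would hold.
*/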
/*
* Interact with memory using server-side byte order (MIPS big-endian) or
* client-side (VM host's) native byte order on a 32-bit boundary.
*
* Unfortunately, most op-codes are optimized to require USE_CLIENT_ENDIAN to
* be TRUE.
*/
#if (ENDIAN_M == 0)
#define USE_CLIENT_ENDIAN 0
#else
#define USE_CLIENT_ENDIAN 1
#endif
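/*
* With native-order 32-bit storage, addressing a single DMEM byte goes
* through the same byte-endian-swap helper used for SR_B() below.  A sketch,
* assuming BES() maps a big-endian byte offset to the host's storage order:
*
*     unsigned char octet = DMEM[BES(addr & 0xFFFu)];
*/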
/*
* Always keep this enabled for faster interpreter CPU.
*
* If you disable this, the branch delay slot algorithm will match the
* documentation found in the MIPS manuals (which is not entirely accurate).
*
* Enabled:
*     while (CPU_running) {
*         PC = static_delay_slot_adjustments();
*         switch (opcode) { ... continue; }
* Disabled:
*     while (CPU_running) {
*         switch (opcode) { ... break; }
*         PC = documented_branch_delay_slot();
*         continue;
*/
#if 1
#define EMULATE_STATIC_PC
#endif
#if (0 != 0)
#define PROFILE_MODE static NOINLINE
#else
#define PROFILE_MODE static INLINE
#endif
typedef enum {
    zero = 0,
    at = 1,
#ifdef TRUE_MIPS_AND_NOT_JUST_THE_RSP_SUBSET
    v0 = 2,
    v1 = 3,
    a0 = 4,
    a1 = 5,
    a2 = 6,
    a3 = 7,
    t0 = 8,
    t1 = 9,
    t2 = 10,
    t3 = 11,
    t4 = 12,
    t5 = 13,
    t6 = 14,
    t7 = 15,
    t8 = 24,
    t9 = 25,
    s0 = 16,
    s1 = 17,
    s2 = 18,
    s3 = 19,
    s4 = 20,
    s5 = 21,
    s6 = 22,
    s7 = 23,
    k0 = 26,
    k1 = 27,
    gp = 28,
#endif
    sp = 29,
    fp = 30, /* new, official MIPS name for it: "frame pointer" */
    ra = 31,
    NUMBER_OF_SCALAR_REGISTERS,
    S8 = fp /* older name for GPR $fp as of the R4000 ISA */
} GPR_specifier;
extern RSP_INFO RSP_INFO_NAME;
extern pu8 DRAM;
extern pu8 DMEM;
extern pu8 IMEM;
extern u8 conf[];
/*
* general-purpose scalar registers
*
* based on the MIPS instruction set architecture but without most of the
* original register names (for example, no kernel-reserved registers)
*/
extern u32 SR[];
#define FIT_IMEM(PC) ((PC) & 0xFFFu & 0xFFCu)
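/*
* FIT_IMEM() keeps the program counter word-aligned and inside the 4-KiB
* IMEM; e.g., FIT_IMEM(0x1006) == 0x004.  An instruction fetch can then be
* sketched (hypothetically, ignoring any byte-order fix-up) as:
*
*     u32 inst = *(pu32)(IMEM + FIT_IMEM(PC));
*/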
#ifdef EMULATE_STATIC_PC
#define JUMP goto set_branch_delay
#else
#define JUMP break
#endif
#ifdef EMULATE_STATIC_PC
#define BASE_OFF 0x000
#else
#define BASE_OFF 0x004
#endif
#ifndef EMULATE_STATIC_PC
int stage;
#endif
#ifdef WAIT_FOR_CPU_HOST
extern short MFC0_count[];
/* Keep one C0 MF status read count for each scalar register. */
#endif
/*
* The number of times to tolerate executing `MFC0 $at, $c4`.
* Replace $at with any register--the timeout limit is per each.
*
* Set to a higher value to avoid prematurely quitting the interpreter.
* Set to a lower value for speed...you could get away with 10 sometimes.
*/
extern int MF_SP_STATUS_TIMEOUT;
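/*
* A sketch of the idea (hypothetical code, not the exact interpreter):  on
* every `MFC0 $rt, $c4`, bump that register's counter, and once it exceeds
* the timeout, assume the ucode is spin-waiting on the CPU host and halt the
* SP through the status register declared further below:
*
*     MFC0_count[rt] += 1;
*     if (MFC0_count[rt] >= MF_SP_STATUS_TIMEOUT)
*         *CR[RCP_SP_STATUS_REG] |= SP_STATUS_HALT;
*/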
#define SLOT_OFF ((BASE_OFF) + 0x000)
#define LINK_OFF ((BASE_OFF) + 0x004)
extern void set_PC(unsigned int address);
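/*
* SLOT_OFF and LINK_OFF let branch handlers be written the same way for both
* PC models:  relative to the PC value a handler sees, SLOT_OFF reaches the
* delay-slot instruction and LINK_OFF the instruction after it.  A sketch of
* the link-and-jump steps (hypothetical; `target` stands for the already
* decoded jump address):
*
*     SR[ra] = FIT_IMEM(PC + LINK_OFF);
*     set_PC(target);
*     JUMP;
*/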
/*
* If the client CPU's shift amount is exactly 5 bits for a 32-bit source,
* then omit emulating (sa & 31) in the SLL/SRL/SRA interpreter steps.
* (Additionally, omit doing (GPR[rs] & 31) in SLLV/SRLV/SRAV.)
*
* As C pre-processor logic seems incapable of interpreting type storage,
* stuff like #if (1U << 31 == 1U << ~0U) will generally just fail.
*
* Some of these also will only work assuming 2's complement (e.g., Intel).
*/
#if defined(ARCH_MIN_SSE2) && !defined(SSE2NEON)
#define MASK_SA(sa) (sa)
#define IW_RD(inst) ((u16)(inst) >> 11)
#define SIGNED_IMM16(imm) (s16)(imm)
#else
#define MASK_SA(sa) ((sa) & 31)
#define IW_RD(inst) (u8)(((inst) >> 11) % (1 << 5))
#define SIGNED_IMM16(imm) (s16)(((imm) & 0x8000u) ? -(~(imm) + 1) : (imm))
#endif
/*
* If primary op-code is SPECIAL (000000), we could skip ANDing the rs shift.
* Shifts losing precision are undefined, so don't assume that (1 >> 1 == 0).
*/
#if (0xFFFFFFFFul >> 31 != 0x000000001ul) || defined(_DEBUG)
#define SPECIAL_DECODE_RS(inst) (((inst) & 0x03E00000UL) >> 21)
#else
#define SPECIAL_DECODE_RS(inst) ((inst) >> 21)
#endif
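/*
* How these decode helpers combine in the interpreter, sketched for three
* representative ops (hypothetical code; rs, rt and imm stand for the usual
* field values, imm being the low 16 bits of inst; the interpreter also
* re-zeroes SR[zero] after writes so $zero stays hard-wired):
*
*     SR[IW_RD(inst)] = SR[rt] << MASK_SA(inst >> 6);                   SLL
*     SR[IW_RD(inst)] = SR[rt] << MASK_SA(SR[SPECIAL_DECODE_RS(inst)]); SLLV
*     SR[rt] = SR[rs] + SIGNED_IMM16(imm);                              ADDIU
*     SR[zero] = 0x00000000;
*/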
/*
* Try to stick to (unsigned char) to conform to strict aliasing rules.
*
* Do not say `u8`. My custom type definitions are minimum-size types.
* Do not say `uint8_t`. Exact-width types are not portable/universal.
*/
#if (CHAR_BIT != 8)
#error Non-POSIX-compliant (char) storage width.
#endif
/*
* RSP general-purpose registers (GPRs) are always 32-bit scalars (SRs).
* SR_B(gpr, 0) is bits 31..24 of SR[gpr], and SR_B(gpr, 3) is bits 7..0.
*/
#define SR_B(scalar, i) *((unsigned char *)&(SR[scalar]) + BES(i))
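/*
* For example:
*
*     SR_B(rt, 0) = 0x12;    assigns bits 31..24 of SR[rt]
*     SR_B(rt, 3) = 0x78;    assigns bits  7..0  of SR[rt]
*/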
/*
* Universal byte-access macro for 8-element vectors of 16-bit halfwords.
* Use this macro if you are not sure whether the element is odd or even.
*
* A typedef union{} might be cleaner, but it's less readable for RSP
* vector registers. Only 16-bit element computations exist, so the correct
* allocation of the register file is int16_t v[32][8], not a_union v[32].
*
* Either method--dynamic union reads or special aliasing--is undefined
* behavior and will not truly be portable code anyway, so it hardly matters.
*/
#define VR_B(vt, element) *((unsigned char *)&(VR[vt][0]) + MES(element))
/*
* Optimized byte-access macros for the vector registers.
* Use these ONLY if you know the element is even (VR_A) or odd (VR_U).
*
* They are faster because addressing with LEA PTR [offset +/- 1] generates
* fewer CPU instructions than computing (offset ^ 1) does, in most cases.
*/
#define VR_A(vt, e) *((unsigned char *)&(VR[vt][0]) + e + MES(0))
#define VR_U(vt, e) *((unsigned char *)&(VR[vt][0]) + e - MES(0))
/*
* Use this ONLY if you know the element is even, not odd.
*
* This is only provided for purposes of consistency with VR_B() and friends.
* Saying `VR[vt][1] = x;` instead of `VR_S(vt, 2) = x;` works as well.
*/
#define VR_S(vt, element) *(pi16)((unsigned char *)&(VR[vt][0]) + element)
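/*
* For example (a hypothetical sketch of an LBV-style byte store into $v0):
*
*     VR_B(0, 9) = DMEM[BES(addr & 0xFFFu)];    byte lane 9, odd or even
*     VR_S(0, 2) = 0x7FFF;                      element 1 (even byte offset 2)
*/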
/*** Scalar, Coprocessor Operations (system control) ***/
#define SP_STATUS_HALT          (0x00000001ul <<  0)
#define SP_STATUS_BROKE         (0x00000001ul <<  1)
#define SP_STATUS_DMA_BUSY      (0x00000001ul <<  2)
#define SP_STATUS_DMA_FULL      (0x00000001ul <<  3)
#define SP_STATUS_IO_FULL       (0x00000001ul <<  4)
#define SP_STATUS_SSTEP         (0x00000001ul <<  5)
#define SP_STATUS_INTR_BREAK    (0x00000001ul <<  6)
#define SP_STATUS_SIG0          (0x00000001ul <<  7)
#define SP_STATUS_SIG1          (0x00000001ul <<  8)
#define SP_STATUS_SIG2          (0x00000001ul <<  9)
#define SP_STATUS_SIG3          (0x00000001ul << 10)
#define SP_STATUS_SIG4          (0x00000001ul << 11)
#define SP_STATUS_SIG5          (0x00000001ul << 12)
#define SP_STATUS_SIG6          (0x00000001ul << 13)
#define SP_STATUS_SIG7          (0x00000001ul << 14)
typedef enum {
    RCP_SP_MEM_ADDR_REG,
    RCP_SP_DRAM_ADDR_REG,
    RCP_SP_RD_LEN_REG,
    RCP_SP_WR_LEN_REG,
    RCP_SP_STATUS_REG,
    RCP_SP_DMA_FULL_REG,
    RCP_SP_DMA_BUSY_REG,
    RCP_SP_SEMAPHORE_REG,
    RCP_DPC_START_REG,
    RCP_DPC_END_REG,
    RCP_DPC_CURRENT_REG,
    RCP_DPC_STATUS_REG,
    RCP_DPC_CLOCK_REG,
    RCP_DPC_BUFBUSY_REG,
    RCP_DPC_PIPEBUSY_REG,
    RCP_DPC_TMEM_REG,
    NUMBER_OF_CP0_REGISTERS
} CPR_specifier;
extern pu32 CR[];
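/*
* CR[] holds pointers to the host-mapped RCP registers, indexed by the enum
* above.  A sketch (hypothetical) of a BREAK-style status update through it:
*
*     *CR[RCP_SP_STATUS_REG] |= SP_STATUS_BROKE | SP_STATUS_HALT;
*     intr_break = !!(*CR[RCP_SP_STATUS_REG] & SP_STATUS_INTR_BREAK);
*
* where a nonzero intr_break means the plugin should then raise an SP
* interrupt on the host (not shown).
*/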
extern void SP_DMA_READ(void);
extern void SP_DMA_WRITE(void);
extern u16 rwR_VCE(void);
extern void rwW_VCE(u16 VCE);
extern void MFC2(unsigned int rt, unsigned int vs, unsigned int e);
extern void MTC2(unsigned int rt, unsigned int vd, unsigned int e);
extern void CFC2(unsigned int rt, unsigned int rd);
extern void CTC2(unsigned int rt, unsigned int rd);
/*** Modern pseudo-operations (not real instructions, but nice shortcuts) ***/
extern void ULW(unsigned int rd, u32 addr);
extern void USW(unsigned int rs, u32 addr);
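/*
* For example (hypothetical), fetching and writing back a 32-bit word that
* is not 4-byte-aligned, with `address` standing for any DMEM offset:
*
*     ULW(at, address);     SR[at] = word at address, no alignment exception
*     USW(at, address);     store SR[at] back the same way
*/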
/*
* The scalar unit controls the primary R4000 operations implementation,
* which inherently includes interfacing with the vector unit under COP2.
*
* Although no scalar unit operations are computational vector operations,
* several of them will access machine states shared with the vector unit.
*
* We will need access to the vector unit's vector register file and its
* vector control register file used mainly for vector select instructions.
*/
#include "vu/select.h"
NOINLINE extern void res_S(void);
extern void SP_CP0_MF(unsigned int rt, unsigned int rd);
/*
* example syntax (basically the same for all LWC2/SWC2 ops):
*     LTWV    $v0[0], -64($at)
*     SBV     $v0[9], 0xFFE($0)
*/
typedef void(*mwc2_func)(
    unsigned int vt,
    unsigned int element,
    signed int offset,
    unsigned int base
);
extern mwc2_func LWC2[2 * 8*2];
extern mwc2_func SWC2[2 * 8*2];
extern void res_lsw(
    unsigned int vt,
    unsigned int element,
    signed int offset,
    unsigned int base
);
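/*
* Dispatch sketch (hypothetical decode):  the 5-bit sub-opcode sits in the
* rd slot, so IW_RD() selects the handler, while vt (bits 20..16), the
* element (bits 10..7), the signed 7-bit offset (bits 6..0) and the base
* register (bits 25..21) become its arguments:
*
*     LWC2[IW_RD(inst)](vt, element, offset, base);
*/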
/*** Scalar, Coprocessor Operations (vector unit, scalar cache transfers) ***/
extern void LBV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void LSV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void LLV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void LDV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SBV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SSV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SLV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SDV(unsigned vt, unsigned element, signed offset, unsigned base);
/*
* Group II vector loads and stores:
* PV and UV (As of RCP implementation, XV and ZV are reserved opcodes.)
*/
extern void LPV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void LUV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SPV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SUV(unsigned vt, unsigned element, signed offset, unsigned base);
/*
* Group III vector loads and stores:
* HV, FV, and AV (As of RCP implementation, AV opcodes are reserved.)
*/
extern void LHV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void LFV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SHV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SFV(unsigned vt, unsigned element, signed offset, unsigned base);
/*
* Group IV vector loads and stores:
* QV and RV
*/
extern void LQV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void LRV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SQV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SRV(unsigned vt, unsigned element, signed offset, unsigned base);
/*
* Group V vector loads and stores:
* TV and SWV (As of RCP implementation, LTWV opcode was undesired.)
*/
extern void LTV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void SWV(unsigned vt, unsigned element, signed offset, unsigned base);
extern void STV(unsigned vt, unsigned element, signed offset, unsigned base);
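/*
* For instance, `LBV $v3[9], 0($at)` would reach the handler above as
* (assuming the arguments are simply the decoded vt, element, offset field
* and base register index):
*
*     LBV(3, 9, 0, at);
*/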
NOINLINE extern void run_task(void);
#endif