Merge commit '73232513e7889c82f86fd77f81ac6a060fe7d828'

Gillou68310 2015-11-10 11:57:18 +01:00
parent be3799bbcf
commit 8796295a2c
14 changed files with 298 additions and 354 deletions


@ -10,11 +10,11 @@ In the engineering make-up of the Nintendo 64 (original codename: Project Reali
Here, the entire MIPS R4000 instruction set was modified for very fast, exception-free processing flow, and operation definitions for each instruction do not fall within the scope of this section. Presented instead are layouts of the new instructions added to the scalar unit (those under `LWC2` and `SWC2`, even though they do interface with the vector unit) and the vector unit (essentially, any instruction under `COP2` whose mnemonic starts with a 'V'). Information on how pre-existing MIPS R4000 instructions were modified, or which ones were removed, is left to the MIPS programmer to research.
V*_vd_, _vs_, _vt_[_element_] `/* exceptions: scalar divide reads */`
<table border="1">
<tr align="center"><td>COP2</td><td>element</td><td>vs1</td><td>vs2</td><td>vt</td><td>func</td></tr>
<tr align="right"><td>010010</td><td>1----</td><td>-----</td><td>-----</td><td>-----</td><td>??????</td></tr>
</table>
`C2` _vd_, _vs_, _vt_[_element_] `/* exceptions: scalar divide reads */`
| COP2 | element | vs1 | vs2 | vt | func |
| ------ |:-------:| ----- | ----- | ----- | ------ |
|`010010`| `1eeee` |`ttttt`|`sssss`|`ddddd`|`??????`|
The major types of VU computational instructions are _multiply,_ _add,_ _select,_ _logical,_ and _divide._
@ -25,14 +25,13 @@ Multiply instructions are the most frequent and classifiable as follows:
* If `(format & 0b100) == 0`, then the operation is single-precision (`VMUL*` and `VMAC*`).
* If `(format & 0b100) != 0`, then the operation is double-precision (`VMUD*` and `VMAD*`).
<table border="1">
<tr align="center"><td><i>op-code</i></td><td>Type</td></tr>
<tr><td>0 0 a x x x</td><td>multiply</td></tr>
<tr><td>0 1 x x x x</td><td>add</td></tr>
<tr><td>1 0 0 x x x</td><td>select</td></tr>
<tr><td>1 0 1 x x x</td><td>logical</td></tr>
<tr><td>1 1 0 x x x</td><td>divide</td></tr>
</table>
|_op-code_| Type |
| -------:| -------- |
| `00axxx`| multiply |
| `01xxxx`| add |
| `100xxx`| select |
| `101xxx`| logical |
| `110xxx`| divide |
* `00 (VMULF)` Vector Multiply Signed Fractions
* `01 (VMULU)` Vector Multiply Unsigned Fractions
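To make the classification above concrete, here is a minimal C sketch of how the 6-bit `func` field of a `C2` instruction word could be tested; the helper names (`cop2_func`, `is_multiply`, `is_double_precision`) are illustrative only and do not appear in this plugin's sources.

```c
#include <stdint.h>

/* Low six bits of a COP2 word hold the vector op-code (`func'). */
static unsigned cop2_func(uint32_t inst)
{
    return inst & 0x3Fu;
}

/* Op-code pattern 00axxx:  the multiply group (VMULx, VMUDx, VMACx, VMADx). */
static int is_multiply(uint32_t inst)
{
    return (cop2_func(inst) >> 4) == 0u;
}

/* Within the multiply group, (format & 0b100) selects double precision. */
static int is_double_precision(uint32_t inst)
{
    return (cop2_func(inst) & 0x4u) != 0u;  /* VMUDx and VMADx when set */
}
```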
@ -101,25 +100,24 @@ Multiply instructions are the most frequent and classifiable as follows:
### RSP Vector Load Transfers
The VR-DMEM transaction instruction cycles are still processed by the scalar unit, not the vector unit. In the modern implementations accepted by most vector unit architectures today, the transfer instructions are classifiable under five groups:
<ol>
<li>BV, SV, LV, DV</li>
<li>PV, UV, XV, ZV</li>
<li>HV, FV, AV</li>
<li>QV, RV</li>
<li>TV, WV</li>
</ol>
The VR-DMEM transaction instruction cycles are still processed by the scalar unit, not the vector unit. In the modern implementations accepted by most vector unit architectures today, the transfer instructions are classifiable under five groups:
Not all of those instructions were implemented as of the time of the Nintendo 64's RCP, however. Additionally, their ordering in the opcode matrix was a little skewed compared to what is seen below. At this time, it is better to use only three categories of instructions:
1. BV, SV, LV, DV
2. PV, UV, XV, ZV
3. HV, FV, AV
4. QV, RV
5. TV, WV
Not all of those instructions were implemented as of the time of the Nintendo 64's RCP, however. Additionally, their ordering in the opcode matrix was a little skewed compared to what is seen below. At this time, it is better to use only three categories of instructions:
* _normal_: Anything under Group I or Group IV is normal type. Only the element must be aligned; `addr & 1` may resolve true.
* _packed_: Anything under Group II or Group III. Useful for working with specially mapped data, such as pixels.
* _transposed_: `LTV`, *LTWV*, `STV`, and `SWV` can be found in heaps of 16 instructions, all dedicated to matrix transposition through eight diagonals of halfword elements.
LWC2 _vt_[_element_], _offset_(_base_)
<table border="1">
<tr align="center"><td>LWC2</td><td>base</td><td>vt</td><td>rd</td><td>element</td><td>offset</td></tr>
<tr align="right"><td>110010</td><td>-----</td><td>-----</td><td>?????</td><td>----</td><td>-------</td></tr>
</table>
`LWC2` _vt_[_element_], _offset_(_base_)
| LWC2 | base | vt | rd | element | offset |
| ------ | ----- | ----- | ----- |:-------:| -------- |
|`110010`|`sssss`|`ttttt`|`?????`| `eeee` | `Xxxxxxx`|
* `00 (LBV)` Load Byte to Vector Unit
* `01 (LSV)` Load Shortword to Vector Unit
@ -138,10 +136,11 @@ LWC2&#8;_vt_[_element_], _offset_(_base_)
* `16 reserved`
* `17 reserved`
<table border="1">
<tr align="center"><td>SWC2</td><td>base</td><td>vt</td><td>rd</td><td>element</td><td>offset</td></tr>
<tr align="right"><td>111010</td><td>-----</td><td>-----</td><td>?????</td><td>----</td><td>-------</td></tr>
</table>
`SWC2` _vt_[_element_], _offset_(_base_)
| SWC2 | base | vt | rd | element | offset |
| ------ | ----- | ----- | ----- |:-------:| -------- |
|`111010`|`sssss`|`ttttt`|`?????`| `eeee` | `Xxxxxxx`|
* `00 (SBV)` Store Byte from Vector Unit
* `01 (SSV)` Store Shortword from Vector Unit
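As a worked example of the `LWC2`/`SWC2` layouts tabled above, the sketch below pulls each field out of a raw instruction word. The struct and function names are hypothetical; su.c in this repository performs the same shifts inline rather than through a helper.

```c
#include <stdint.h>

typedef struct {
    unsigned base;      /* scalar register holding the DMEM base address */
    unsigned vt;        /* vector register loaded into or stored from */
    unsigned rd;        /* selects LBV/LSV/LLV/... within LWC2 or SWC2 */
    unsigned element;   /* starting byte element within VR[vt] */
    int      offset;    /* seven-bit signed offset (scaled by transfer size) */
} vec_mem_op;

static vec_mem_op decode_lwc2_swc2(uint32_t inst)
{
    vec_mem_op op;

    op.base    = (inst >> 21) & 31;
    op.vt      = (inst >> 16) & 31;
    op.rd      = (inst >> 11) & 31;
    op.element = (inst >>  7) & 15;
    op.offset  = (int)(inst & 0x7F);
    if (op.offset >= 64)
        op.offset -= 128;           /* sign-extend the 7-bit field */
    return op;
}
```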


@ -1,20 +1,20 @@
@ECHO OFF
TITLE MinGW Compiler Suite Invocation
set MinGW=C:/MinGW
REM set rsp=%USERPROFILE%/rsp
set MinGW=C:\MinGW
REM set rsp=%USERPROFILE%\rsp
set rsp=%CD%
set obj=%rsp%/obj
set obj=%rsp%\obj
set OBJ_LIST=^
%obj%/module.o ^
%obj%/su.o ^
%obj%/vu/vu.o ^
%obj%/vu/multiply.o ^
%obj%/vu/add.o ^
%obj%/vu/select.o ^
%obj%/vu/logical.o ^
%obj%/vu/divide.o
%obj%\module.o ^
%obj%\su.o ^
%obj%\vu\vu.o ^
%obj%\vu\multiply.o ^
%obj%\vu\add.o ^
%obj%\vu\select.o ^
%obj%\vu\logical.o ^
%obj%\vu\divide.o
set FLAGS_ANSI=-O3^
-DPLUGIN_API_VERSION=0x0101^
@ -30,38 +30,38 @@ set FLAGS_x86=-O3^
-mstackrealign^
-Wall^
-pedantic
set C_FLAGS=%FLAGS_X86%
set C_FLAGS=%FLAGS_x86%
if not exist obj (
mkdir obj
cd obj
mkdir vu
)
cd %MinGW%/bin
cd %MinGW%\bin
ECHO Compiling C source code...
cc -S %C_FLAGS% -o %obj%/module.asm %rsp%/module.c
cc -S %C_FLAGS% -o %obj%/su.asm %rsp%/su.c
cc -S %C_FLAGS% -o %obj%/vu/vu.asm %rsp%/vu/vu.c
cc -S %C_FLAGS% -o %obj%/vu/multiply.asm %rsp%/vu/multiply.c
cc -S %C_FLAGS% -o %obj%/vu/add.asm %rsp%/vu/add.c
cc -S %C_FLAGS% -o %obj%/vu/select.asm %rsp%/vu/select.c
cc -S %C_FLAGS% -o %obj%/vu/logical.asm %rsp%/vu/logical.c
cc -S %C_FLAGS% -o %obj%/vu/divide.asm %rsp%/vu/divide.c
cc -S %C_FLAGS% -o %obj%\module.asm %rsp%\module.c
cc -S %C_FLAGS% -o %obj%\su.asm %rsp%\su.c
cc -S %C_FLAGS% -o %obj%\vu\vu.asm %rsp%\vu\vu.c
cc -S %C_FLAGS% -o %obj%\vu\multiply.asm %rsp%\vu\multiply.c
cc -S %C_FLAGS% -o %obj%\vu\add.asm %rsp%\vu\add.c
cc -S %C_FLAGS% -o %obj%\vu\select.asm %rsp%\vu\select.c
cc -S %C_FLAGS% -o %obj%\vu\logical.asm %rsp%\vu\logical.c
cc -S %C_FLAGS% -o %obj%\vu\divide.asm %rsp%\vu\divide.c
ECHO.
ECHO Assembling compiled sources...
as --statistics -o %obj%/module.o %obj%/module.asm
as --statistics -o %obj%/su.o %obj%/su.asm
as --statistics -o %obj%/vu/vu.o %obj%/vu/vu.asm
as -o %obj%/vu/multiply.o %obj%/vu/multiply.asm
as -o %obj%/vu/add.o %obj%/vu/add.asm
as -o %obj%/vu/select.o %obj%/vu/select.asm
as -o %obj%/vu/logical.o %obj%/vu/logical.asm
as -o %obj%/vu/divide.o %obj%/vu/divide.asm
as --statistics -o %obj%\module.o %obj%\module.asm
as --statistics -o %obj%\su.o %obj%\su.asm
as --statistics -o %obj%\vu\vu.o %obj%\vu\vu.asm
as -o %obj%\vu\multiply.o %obj%\vu\multiply.asm
as -o %obj%\vu\add.o %obj%\vu\add.asm
as -o %obj%\vu\select.o %obj%\vu\select.asm
as -o %obj%\vu\logical.o %obj%\vu\logical.asm
as -o %obj%\vu\divide.o %obj%\vu\divide.asm
ECHO.
ECHO Linking assembled object files...
ld --shared -e _DllMain@12 -o %obj%/rspdebug.dll %OBJ_LIST% -lkernel32
ld --shared -e _DllMain@12 -o %obj%\rspdebug.dll %OBJ_LIST% %MinGW%\lib\libkernel32.a
strip -o %obj%/rsp.dll %obj%/rspdebug.dll
PAUSE

make_w64.cmd Normal file

@ -0,0 +1,67 @@
@ECHO OFF
TITLE MinGW Compiler Suite Invocation
set version=x86_64-5.1.0-win32-seh-rt_v4-rev0
set MinGW="C:\Program Files\mingw-w64\%version%\mingw64"
REM set rsp=%USERPROFILE%\rsp
set rsp=%CD%
set obj=%rsp%\obj
set OBJ_LIST=^
%obj%\module.o ^
%obj%\su.o ^
%obj%\vu\vu.o ^
%obj%\vu\multiply.o ^
%obj%\vu\add.o ^
%obj%\vu\select.o ^
%obj%\vu\logical.o ^
%obj%\vu\divide.o ^
%MinGW%\x86_64-w64-mingw32\lib\libkernel32.a
set FLAGS_ANSI=-Wall^
-DPLUGIN_API_VERSION=0x0101^
-march=native^
-mstackrealign^
-pedantic
set FLAGS_x86=-Wall^
-masm=intel^
-DPLUGIN_API_VERSION=0x0101^
-DARCH_MIN_SSE2^
-march=native^
-mstackrealign^
-pedantic
set C_FLAGS=%FLAGS_x86%
if not exist obj (
mkdir obj
cd obj
mkdir vu
)
cd %MinGW%\bin
ECHO Compiling C source code...
%MinGW%\bin\gcc.exe -S -Os %C_FLAGS% -o %obj%\module.asm %rsp%\module.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\su.asm %rsp%\su.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\vu.asm %rsp%\vu\vu.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\multiply.asm %rsp%\vu\multiply.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\add.asm %rsp%\vu\add.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\select.asm %rsp%\vu\select.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\logical.asm %rsp%\vu\logical.c
%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\divide.asm %rsp%\vu\divide.c
ECHO.
ECHO Assembling compiled sources...
%MinGW%\bin\as.exe -o %obj%\module.o %obj%\module.asm
%MinGW%\bin\as.exe -o %obj%\su.o %obj%\su.asm
%MinGW%\bin\as.exe -o %obj%\vu\vu.o %obj%\vu\vu.asm
%MinGW%\bin\as.exe -o %obj%\vu\multiply.o %obj%\vu\multiply.asm
%MinGW%\bin\as.exe -o %obj%\vu\add.o %obj%\vu\add.asm
%MinGW%\bin\as.exe -o %obj%\vu\select.o %obj%\vu\select.asm
%MinGW%\bin\as.exe -o %obj%\vu\logical.o %obj%\vu\logical.asm
%MinGW%\bin\as.exe -o %obj%\vu\divide.o %obj%\vu\divide.asm
ECHO.
ECHO Linking assembled object files...
%MinGW%\bin\ld.exe --shared -e DllMain -o %obj%\rspdebug.dll %OBJ_LIST%
%MinGW%\bin\strip.exe -o %obj%/rsp.dll %obj%/rspdebug.dll
PAUSE


@ -17,7 +17,6 @@
#define _MODULE_H_
#include <stdio.h>
#include "rsp.h"
#define CFG_FILE "rsp_conf.bin"

rsp.h

@ -18,7 +18,7 @@ extern "C" {
#define PLUGIN_TYPE_RSP 1
#define PLUGIN_TYPE_GFX 2
#define PLUGIN_TYPE_AUDIO 3
#define PLUGIN_TYPE_CONTROLLER 4z
#define PLUGIN_TYPE_CONTROLLER 4
#ifndef PLUGIN_API_VERSION
#define PLUGIN_API_VERSION 0x0102
@ -38,8 +38,12 @@ extern "C" {
* versus
* `GET_RCP_REG(MI_INTR_REG) |= MI_INTR_MASK_SP;'.
*/
#if !defined(RSP_INFO_NAME) && !defined(M64P_PLUGIN_API)
#ifndef RSP_INFO_NAME
#ifdef M64P_PLUGIN_API
#define RSP_INFO_NAME RSP_info
#else
#define RSP_INFO_NAME RCP_info_SP
#endif
#define GET_RSP_INFO(member) ((RSP_INFO_NAME).member)
#define GET_RCP_REG(member) (*(RSP_INFO_NAME).member)
#endif
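/*
 * Illustrative expansion only (this comment is not part of the header):
 * with M64P_PLUGIN_API defined, the example above,
 *     GET_RCP_REG(MI_INTR_REG) |= MI_INTR_MASK_SP;
 * expands to
 *     (*(RSP_info).MI_INTR_REG) |= MI_INTR_MASK_SP;
 * and, under the other plugin specification, to
 *     (*(RCP_info_SP).MI_INTR_REG) |= MI_INTR_MASK_SP;
 */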

su.c

@ -1535,6 +1535,26 @@ mwc2_func SWC2[2 * 8*2] = {
res_lsw,res_lsw,res_lsw,res_lsw,res_lsw,res_lsw,res_lsw,res_lsw,
};
static ALIGNED i16 shuffle_temporary[N];
static const unsigned char ei[1 << 4][N] = {
{ 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
{ 00, 01, 02, 03, 04, 05, 06, 07 },
{ 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
{ 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
{ 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
{ 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
{ 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
{ 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
{ 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0W */
{ 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1W */
{ 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2W */
{ 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3W */
{ 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4W */
{ 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5W */
{ 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6W */
{ 07, 07, 07, 07, 07, 07, 07, 07 }, /* 7W */
};
NOINLINE void run_task(void)
{
register unsigned int PC;
@ -1559,9 +1579,7 @@ NOINLINE void run_task(void)
#else
ALIGNED i16 source[N], target[N];
#endif
unsigned int op, base, element;
unsigned int rd, rs, rt;
unsigned int vd, vs, vt;
unsigned int op, element;
inst = *(pi32)(IMEM + FIT_IMEM(PC));
#ifdef EMULATE_STATIC_PC
@ -1573,19 +1591,21 @@ EX:
#endif
op = inst >> 26;
rs = inst >> 21; /* &= 31 */
rt = (inst >> 16) & 31;
rd = (u16)(inst) >> 11;
base = rs & 31;
#ifdef _DEBUG
SR[0] = 0x00000000; /* already handled on per-instruction basis */
#endif
switch (op)
{
s16 offset;
unsigned int rd, vd;
unsigned int rs, vs;
unsigned int rt, vt;
unsigned int base; /* a synonym of `rs' for memory load/store ops */
register u32 addr;
case 000: /* SPECIAL */
rd = (inst & 0x0000FFFF) >> 11;
rt = (inst >> 16) & 31;
switch (inst % 64)
{
case 000: /* SLL */
@ -1601,14 +1621,17 @@ EX:
SR[0] = 0x00000000;
CONTINUE;
case 004: /* SLLV */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = SR[rt] << MASK_SA(SR[rs]);
SR[0] = 0x00000000;
CONTINUE;
case 006: /* SRLV */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = (u32)(SR[rt]) >> MASK_SA(SR[rs]);
SR[0] = 0x00000000;
CONTINUE;
case 007: /* SRAV */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = (s32)(SR[rt]) >> MASK_SA(SR[rs]);
SR[0] = 0x00000000;
CONTINUE;
@ -1616,6 +1639,7 @@ EX:
SR[rd] = (PC + LINK_OFF) & 0x00000FFC;
SR[0] = 0x00000000;
case 010: /* JR */
rs = SPECIAL_DECODE_RS(inst);
set_PC(SR[rs]);
JUMP;
case 015: /* BREAK */
@ -1629,35 +1653,43 @@ EX:
CONTINUE;
case 040: /* ADD */
case 041: /* ADDU */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = SR[rs] + SR[rt];
SR[0] = 0x00000000; /* needed for Rareware ucodes */
CONTINUE;
case 042: /* SUB */
case 043: /* SUBU */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = SR[rs] - SR[rt];
SR[0] = 0x00000000;
CONTINUE;
case 044: /* AND */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = SR[rs] & SR[rt];
SR[0] = 0x00000000; /* needed for Rareware ucodes */
CONTINUE;
case 045: /* OR */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = SR[rs] | SR[rt];
SR[0] = 0x00000000;
CONTINUE;
case 046: /* XOR */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = SR[rs] ^ SR[rt];
SR[0] = 0x00000000;
CONTINUE;
case 047: /* NOR */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = ~(SR[rs] | SR[rt]);
SR[0] = 0x00000000;
CONTINUE;
case 052: /* SLT */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = ((s32)(SR[rs]) < (s32)(SR[rt]));
SR[0] = 0x00000000;
CONTINUE;
case 053: /* SLTU */
rs = SPECIAL_DECODE_RS(inst);
SR[rd] = ((u32)(SR[rs]) < (u32)(SR[rt]));
SR[0] = 0x00000000;
CONTINUE;
@ -1666,13 +1698,14 @@ EX:
CONTINUE;
}
case 001: /* REGIMM */
switch (rt)
rs = (inst >> 21) & 31;
switch (rt = (inst >> 16) & 31)
{
case 020: /* BLTZAL */
SR[31] = (PC + LINK_OFF) & 0x00000FFC;
/* fall through */
case 000: /* BLTZ */
if (!((s32)SR[base] < 0))
if (!((s32)SR[rs] < 0))
CONTINUE;
set_PC(PC + 4*inst + SLOT_OFF);
JUMP;
@ -1680,7 +1713,7 @@ EX:
SR[31] = (PC + LINK_OFF) & 0x00000FFC;
/* fall through */
case 001: /* BGEZ */
if (!((s32)SR[base] >= 0))
if (!((s32)SR[rs] >= 0))
CONTINUE;
set_PC(PC + 4*inst + SLOT_OFF);
JUMP;
@ -1694,56 +1727,77 @@ EX:
set_PC(4*inst);
JUMP;
case 004: /* BEQ */
if (!(SR[base] == SR[rt]))
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
if (!(SR[rs] == SR[rt]))
CONTINUE;
set_PC(PC + 4*inst + SLOT_OFF);
JUMP;
case 005: /* BNE */
if (!(SR[base] != SR[rt]))
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
if (!(SR[rs] != SR[rt]))
CONTINUE;
set_PC(PC + 4*inst + SLOT_OFF);
JUMP;
case 006: /* BLEZ */
if (!((s32)SR[base] <= 0x00000000))
rs = (inst >> 21) & 31;
if (!((s32)SR[rs] <= 0x00000000))
CONTINUE;
set_PC(PC + 4*inst + SLOT_OFF);
JUMP;
case 007: /* BGTZ */
if (!((s32)SR[base] > 0x00000000))
rs = (inst >> 21) & 31;
if (!((s32)SR[rs] > 0x00000000))
CONTINUE;
set_PC(PC + 4*inst + SLOT_OFF);
JUMP;
case 010: /* ADDI */
case 011: /* ADDIU */
SR[rt] = SR[base] + (s16)(inst);
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
SR[rt] = SR[rs] + (s16)(inst);
SR[0] = 0x00000000;
CONTINUE;
case 012: /* SLTI */
SR[rt] = ((s32)(SR[base]) < (s16)(inst));
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
SR[rt] = ((s32)(SR[rs]) < (s16)(inst));
SR[0] = 0x00000000;
CONTINUE;
case 013: /* SLTIU */
SR[rt] = ((u32)(SR[base]) < (u16)(inst));
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
SR[rt] = ((u32)(SR[rs]) < (u16)(inst));
SR[0] = 0x00000000;
CONTINUE;
case 014: /* ANDI */
SR[rt] = SR[base] & (inst & 0x0000FFFF);
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
SR[rt] = SR[rs] & (inst & 0x0000FFFF);
SR[0] = 0x00000000;
CONTINUE;
case 015: /* ORI */
SR[rt] = SR[base] | (inst & 0x0000FFFF);
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
SR[rt] = SR[rs] | (inst & 0x0000FFFF);
SR[0] = 0x00000000;
CONTINUE;
case 016: /* XORI */
SR[rt] = SR[base] ^ (inst & 0x0000FFFF);
rs = (inst >> 21) & 31;
rt = (inst >> 16) & 31;
SR[rt] = SR[rs] ^ (inst & 0x0000FFFF);
SR[0] = 0x00000000;
CONTINUE;
case 017: /* LUI */
rt = (inst >> 16) & 31;
SR[rt] = inst << 16;
SR[0] = 0x00000000;
CONTINUE;
case 020: /* COP0 */
switch (base)
rd = (inst & 0x0000FFFF) >> 11;
rt = (inst >> 16) & 31;
switch (rs = (inst >> 21) & 31)
{
case 000: /* MFC0 */
SP_CP0_MF(rt, rd & 0xF);
@ -1757,90 +1811,64 @@ EX:
}
case 022: /* COP2 */
op = inst & 0x0000003F;
vd = (inst & 0x000007FF) >> 6; /* inst.R.sa */
vs = rd;
vt = rt;
vd = (inst & 0x000007FF) >> 6; /* inst.R.sa */
vs = (inst & 0x0000FFFF) >> 11; /* inst.R.rd */
vt = (inst >> 16) & 31;
rs = (inst >> 21) & 31;
if (rs < 16)
switch (rs)
{
case 000:
MFC2(vt, vs, vd >>= 1);
CONTINUE;
case 002:
CFC2(vt, vs);
CONTINUE;
case 004:
MTC2(vt, vs, vd >>= 1);
CONTINUE;
case 006:
CTC2(vt, vs);
CONTINUE;
default:
res_S();
CONTINUE;
}
vector_op = COP2_C2[op];
element = rs & 0xFu;
for (i = 0; i < N; i++)
shuffle_temporary[i] = VR[vt][ei[element][i]];
#ifdef ARCH_MIN_SSE2
source = *(v16 *)VR[vs];
target = *(v16 *)shuffle_temporary;
*(v16 *)(VR[vd]) = vector_op(source, target);
#else
vector_copy(source, VR[vs]);
vector_copy(target, shuffle_temporary);
vector_op(source, target);
vector_copy(VR[vd], V_result);
#endif
switch (base)
{
case 000:
MFC2(vt, vs, vd >>= 1);
CONTINUE;
case 002:
CFC2(vt, vs);
CONTINUE;
case 004:
MTC2(vt, vs, vd >>= 1);
CONTINUE;
case 006:
CTC2(vt, vs);
CONTINUE;
case 020:
case 021:
EXECUTE_VU();
CONTINUE;
case 022:
EXECUTE_VU_0Q();
CONTINUE;
case 023:
EXECUTE_VU_1Q();
CONTINUE;
case 024:
EXECUTE_VU_0H();
CONTINUE;
case 025:
EXECUTE_VU_1H();
CONTINUE;
case 026:
EXECUTE_VU_2H();
CONTINUE;
case 027:
EXECUTE_VU_3H();
CONTINUE;
case 030:
EXECUTE_VU_0W();
CONTINUE;
case 031:
EXECUTE_VU_1W();
CONTINUE;
case 032:
EXECUTE_VU_2W();
CONTINUE;
case 033:
EXECUTE_VU_3W();
CONTINUE;
case 034:
EXECUTE_VU_4W();
CONTINUE;
case 035:
EXECUTE_VU_5W();
CONTINUE;
case 036:
EXECUTE_VU_6W();
CONTINUE;
case 037:
EXECUTE_VU_7W();
CONTINUE;
default:
res_S();
CONTINUE;
}
CONTINUE;
case 040: /* LB */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
SR[rt] = DMEM[BES(addr)];
SR[rt] = (s8)(SR[rt]);
SR[0] = 0x00000000;
CONTINUE;
case 041: /* LH */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
if (addr%0x004 == 0x003)
{
SR_B(rt, 2) = DMEM[addr - BES(0x000)];
@ -1857,7 +1885,10 @@ EX:
CONTINUE;
case 043: /* LW */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
if (addr%0x004 != 0x000)
ULW(rt, addr);
else
@ -1866,14 +1897,20 @@ EX:
CONTINUE;
case 044: /* LBU */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
SR[rt] = DMEM[BES(addr)];
SR[rt] = (u8)(SR[rt]);
SR[0] = 0x00000000;
CONTINUE;
case 045: /* LHU */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
if (addr%0x004 == 0x003)
{
SR_B(rt, 2) = DMEM[addr - BES(0x000)];
@ -1890,12 +1927,18 @@ EX:
CONTINUE;
case 050: /* SB */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
DMEM[BES(addr)] = (u8)(SR[rt]);
CONTINUE;
case 051: /* SH */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
if (addr%0x004 == 0x003)
{
DMEM[addr - BES(0x000)] = SR_B(rt, 2);
@ -1908,13 +1951,17 @@ EX:
CONTINUE;
case 053: /* SW */
offset = (s16)(inst);
base = (inst >> 21) & 31;
addr = (SR[base] + offset) & 0x00000FFF;
rt = (inst >> 16) & 31;
if (addr%0x004 != 0x000)
USW(rt, addr);
else
*(pi32)(DMEM + addr) = SR[rt];
CONTINUE;
case 062: /* LWC2 */
vt = (inst >> 16) & 31;
element = (inst & 0x000007FF) >> 7;
offset = (s16)(inst);
#ifdef ARCH_MIN_SSE2
@ -1923,9 +1970,13 @@ EX:
#else
offset = SE(offset, 6); /* sign-extended seven-bit offset */
#endif
LWC2[rd](rt, element, offset, base);
base = (inst >> 21) & 31;
rd = (inst & 0x0000FFFF) >> 11;
LWC2[rd](vt, element, offset, base);
CONTINUE;
case 072: /* SWC2 */
vt = (inst >> 16) & 31;
element = (inst & 0x000007FF) >> 7;
offset = (s16)(inst);
#ifdef ARCH_MIN_SSE2
@ -1934,7 +1985,10 @@ EX:
#else
offset = SE(offset, 6); /* sign-extended seven-bit offset */
#endif
SWC2[rd](rt, element, offset, base);
base = (inst >> 21) & 31;
rd = (inst & 0x0000FFFF) >> 11;
SWC2[rd](vt, element, offset, base);
CONTINUE;
default:
res_S();

su.h

@ -17,7 +17,6 @@
#define _SU_H_
#include <stdio.h>
#include "my_types.h"
#include "rsp.h"
@ -31,10 +30,6 @@
#define VU_EMULATE_SCALAR_ACCUMULATOR_READ
#endif
#define RSP_INFO_NAME RSP_info
#define GET_RSP_INFO(member) (RSP_INFO_NAME.member)
#define GET_RCP_REG(member) (*RSP_INFO_NAME.member)
/*
* Currently, the plugin system this module is written for doesn't notify us
* of how much RDRAM is installed to the system, so we have to presume 8 MiB.
@ -106,6 +101,13 @@ extern void set_PC(unsigned int address);
/* Let hardware architecture do the mask for us. */
#endif
/* If primary op-code is SPECIAL (000000), we could skip ANDing the rs shift. */
#if (~0U >> 1 == ~0U) || defined(_DEBUG)
#define SPECIAL_DECODE_RS(inst) (((inst) & 0x03E00000UL) >> 21)
#else
#define SPECIAL_DECODE_RS(inst) ((inst) >> 21)
#endif
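/*
 * Worked illustration (not part of the header):  a SPECIAL instruction has
 * op-code bits 31..26 equal to 000000, so no set bits survive above the rs
 * field.  Taking ADD $3, $1, $2 == 0x00221820:
 *     (0x00221820 >> 21)                  == 1  (rs)
 *     ((0x00221820 & 0x03E00000UL) >> 21) == 1  (rs)
 * Both forms agree; the unmasked variant simply saves an AND on the hot path.
 */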
#define SR_B(s, i) (*(pi8)(((pi8)(SR + s)) + BES(i)))
#define SR_S(s, i) (*(pi16)(((pi8)(SR + s)) + HES(i)))
@ -269,119 +271,4 @@ extern void STV(unsigned vt, unsigned element, signed offset, unsigned base);
NOINLINE extern void run_task(void);
/*
* Unfortunately, SSE machine code takes up so much space in the instruction
* cache when populated enough in something like an interpreter switch
* statement, that the compiler starts looking for ways to create branches
* and jumps where the C code specifies none. This complex set of macros
* is intended to minimize the compiler's obligation to choose doing this
* since SSE2 has no static shuffle operation with a variable mask operand.
*/
#ifdef ARCH_MIN_SSE2
#define EXECUTE_VU() { target = *(v16 *)VR[vt]; \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_0Q() { \
target = _mm_shufflehi_epi16(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(00, 00, 02, 02)), SHUFFLE(04, 04, 06, 06)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_1Q() { \
target = _mm_shufflehi_epi16(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(01, 01, 03, 03)), SHUFFLE(05, 05, 07, 07)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_0H() { \
target = _mm_shufflehi_epi16(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(00, 00, 00, 00)), SHUFFLE(04, 04, 04, 04)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_1H() { \
target = _mm_shufflehi_epi16(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(01, 01, 01, 01)), SHUFFLE(05, 05, 05, 05)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_2H() { \
target = _mm_shufflehi_epi16(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(02, 02, 02, 02)), SHUFFLE(06, 06, 06, 06)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_3H() { \
target = _mm_shufflehi_epi16(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(03, 03, 03, 03)), SHUFFLE(07, 07, 07, 07)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_0W() { \
target = _mm_shuffle_epi32(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(00, 00, 00, 00)), SHUFFLE(0/2, 0/2, 0/2, 0/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_1W() { \
target = _mm_shuffle_epi32(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(01, 01, 01, 01)), SHUFFLE(1/2, 1/2, 1/2, 1/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_2W() { \
target = _mm_shuffle_epi32(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(02, 02, 02, 02)), SHUFFLE(2/2, 2/2, 2/2, 2/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_3W() { \
target = _mm_shuffle_epi32(_mm_shufflelo_epi16(*(v16 *)VR[vt], \
SHUFFLE(03, 03, 03, 03)), SHUFFLE(3/2, 3/2, 3/2, 3/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_4W() { \
target = _mm_shuffle_epi32(_mm_shufflehi_epi16(*(v16 *)VR[vt], \
SHUFFLE(04, 04, 04, 04)), SHUFFLE(4/2, 4/2, 4/2, 4/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_5W() { \
target = _mm_shuffle_epi32(_mm_shufflehi_epi16(*(v16 *)VR[vt], \
SHUFFLE(05, 05, 05, 05)), SHUFFLE(5/2, 5/2, 5/2, 5/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_6W() { \
target = _mm_shuffle_epi32(_mm_shufflehi_epi16(*(v16 *)VR[vt], \
SHUFFLE(06, 06, 06, 06)), SHUFFLE(6/2, 6/2, 6/2, 6/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#define EXECUTE_VU_7W() { \
target = _mm_shuffle_epi32(_mm_shufflehi_epi16(*(v16 *)VR[vt], \
SHUFFLE(07, 07, 07, 07)), SHUFFLE(7/2, 7/2, 7/2, 7/2)); \
*(v16 *)(VR[vd]) = vector_op(source, target); }
#else
#define EXECUTE_VU() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x0); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_0Q() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x2); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_1Q() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x3); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_0H() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x4); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_1H() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x5); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_2H() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x6); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_3H() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x7); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_0W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x8); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_1W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0x9); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_2W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0xA); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_3W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0xB); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_4W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0xC); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_5W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0xD); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_6W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0xE); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#define EXECUTE_VU_7W() { \
vector_copy(target, VR[vt]); SHUFFLE_VECTOR(target, 0xF); \
vector_op(source, target); vector_copy(VR[vd], V_result); }
#endif
#endif


@ -15,8 +15,6 @@
#include "add.h"
#include "select.h"
#ifdef ARCH_MIN_SSE2
static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT)
{


@ -1070,10 +1070,17 @@ INLINE static void do_div(i32 data, int sqrt, int precision)
int fetch;
int shift;
#if (~0 >> 1 == -1)
data ^= (s32)(data + 32768) >> 31; /* DP only: (data < -32768) */
fetch = (s32)(data + 0) >> 31;
data ^= fetch;
data -= fetch; /* two's complement: -x == ~x - (~0) on wrap-around */
#else
if (precision == SP_DIV_PRECISION_SINGLE)
data = (data < 0) ? -data : +data;
if (precision == SP_DIV_PRECISION_DOUBLE && data < 0)
data = (data >= -32768) ? -data : ~data;
#endif
/*
* Note, from the code just above, that data cannot be negative.
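/*
 * Standalone sketch of the branchless absolute value used by do_div() above.
 * The function below is an illustration only, not code from this file; it
 * relies on arithmetic right shifts of negative values, the same property the
 * `#if (~0 >> 1 == -1)' guard tests for.  Note that INT32_MIN would wrap,
 * which is why do_div() pre-conditions the -32768 case on its first line.
 */
#include <stdint.h>

static int32_t branchless_abs(int32_t x)
{
    const int32_t mask = x >> 31;  /* 0 when x >= 0, all ones when x < 0 */

    return (x ^ mask) - mask;      /* flip the bits and add one, negatives only */
}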


@ -15,8 +15,6 @@
#include "multiply.h"
#include "select.h"
#ifdef ARCH_MIN_SSE2
#define _mm_cmple_epu16(dst, src) \
_mm_cmpeq_epi16(_mm_subs_epu16(dst, src), _mm_setzero_si128())
@ -93,8 +91,9 @@ static INLINE void SIGNED_CLAMP_AL(pi16 VD)
for (i = 0; i < N; i++)
cond[i] = (temp[i] != VACC_M[i]); /* result_clamped != result_raw ? */
for (i = 0; i < N; i++)
temp[i] ^= 0x8000; /* half-assed unsigned saturation mix in the clamp */
merge(VD, cond, temp, VACC_L);
temp[i] ^= 0x8000; /* clamps 0x0000:0xFFFF instead of -0x8000:+0x7FFF */
for (i = 0; i < N; i++)
VD[i] = (cond[i] ? temp[i] : VACC_L[i]);
return;
}


@ -27,7 +27,7 @@
* else
* dest = element_b;
*/
INLINE void merge(pi16 VD, pi16 cmp, pi16 pass, pi16 fail)
static void merge(pi16 VD, pi16 cmp, pi16 pass, pi16 fail)
{
register int i;
#if (0 != 0)

View file

@ -35,6 +35,4 @@ VECTOR_EXTERN
VECTOR_EXTERN
VMRG (v16 vs, v16 vt);
extern INLINE void merge(pi16 VD, pi16 cmp, pi16 pass, pi16 fail);
#endif

vu/vu.c

@ -15,11 +15,11 @@
#include "vu.h"
#include "add.h"
#include "divide.h"
#include "logical.h"
#include "multiply.h"
#include "add.h"
#include "select.h"
#include "logical.h"
#include "divide.h"
#if 0
#include "pack.h"
#endif
@ -231,63 +231,3 @@ void set_VCE(u8 VCE)
cf_vce[i] = (VCE >> i) & 1;
return; /* Little endian becomes big. */
}
#ifndef ARCH_MIN_SSE2
/*
* vector-scalar element decoding
* Obsolete. Consider using at least the SSE2 algorithms instead.
*/
static const int ei[1 << 4][N] = {
{ 00, 01, 02, 03, 04, 05, 06, 07 }, /* none (vector-only operand) */
{ 00, 01, 02, 03, 04, 05, 06, 07 },
{ 00, 00, 02, 02, 04, 04, 06, 06 }, /* 0Q */
{ 01, 01, 03, 03, 05, 05, 07, 07 }, /* 1Q */
{ 00, 00, 00, 00, 04, 04, 04, 04 }, /* 0H */
{ 01, 01, 01, 01, 05, 05, 05, 05 }, /* 1H */
{ 02, 02, 02, 02, 06, 06, 06, 06 }, /* 2H */
{ 03, 03, 03, 03, 07, 07, 07, 07 }, /* 3H */
{ 00, 00, 00, 00, 00, 00, 00, 00 }, /* 0 */
{ 01, 01, 01, 01, 01, 01, 01, 01 }, /* 1 */
{ 02, 02, 02, 02, 02, 02, 02, 02 }, /* 2 */
{ 03, 03, 03, 03, 03, 03, 03, 03 }, /* 3 */
{ 04, 04, 04, 04, 04, 04, 04, 04 }, /* 4 */
{ 05, 05, 05, 05, 05, 05, 05, 05 }, /* 5 */
{ 06, 06, 06, 06, 06, 06, 06, 06 }, /* 6 */
{ 07, 07, 07, 07, 07, 07, 07, 07 } /* 7 */
};
static const int log_mask[1 << 4] = { /* inverse logarithms, truncated to int */
1 - 1,
1 - 1,
2 - 1, 2 - 1,
4 - 1, 4 - 1, 4 - 1, 4 - 1,
8 - 1, 8 - 1, 8 - 1, 8 - 1, 8 - 1, 8 - 1, 8 - 1, 8 - 1,
};
INLINE VECTOR_OPERATION SHUFFLE_VECTOR(v16 vd, const unsigned int e)
{
i16 SV[8];
register unsigned int i, j;
#if (0 == 0)
j = log_mask[e];
for (i = 0; i < N; i++)
SV[i] = vd[(i & ~j) | (e & j)];
#else
if (e & 0x8)
for (i = 0; i < N; i++)
SV[i] = vd[(i & ~0x7) | (e & 0x7)];
else if (e & 0x4)
for (i = 0; i < N; i++)
SV[i] = vd[(i & ~0x3) | (e & 0x3)];
else if (e & 0x2)
for (i = 0; i < N; i++)
SV[i] = vd[(i & ~0x1) | (e & 0x1)];
else /* if ((e == 0b0000) || (e == 0b0001)) */
for (i = 0; i < N; i++)
SV[i] = vd[(i & ~0x0) | (e & 0x0)];
#endif
vector_copy(vd, SV);
return;
}
#endif


@ -228,14 +228,6 @@ VECTOR_EXTERN (*COP2_C2[8*7 + 8])(v16, v16);
#endif
/*
* Considering that almost all of the computational vector unit operations
* concern a shuffled, halfword-swapped target vector, a centralized method
* for shuffling said vector into a new vector temporary register,
* ST = VR[vt], should be pretty convenient.
*/
INLINE VECTOR_EXTERN SHUFFLE_VECTOR(v16 vd, const unsigned int e);
/*
* Many vector units have pairs of "vector condition flags" registers.
* In SGI's vector unit implementation, these are denoted as the