mirror of https://github.com/Inori/GPCS4.git
synced 2024-05-20 13:07:58 -04:00
1891 lines · 76 KiB · C++
#include <cstdio>
|
||
#include <cstdint>
|
||
#include <cstdlib>
|
||
#include <cstring>
|
||
#include <fstream>
|
||
|
||
#define SCE_GNM_SET_ALIGN(n) __attribute__((__aligned__(n)))
|
||
#define SCE_GNM_ASSERT(x) ((void)(x))
|
||
|
||
|
||
typedef unsigned int uint;
|
||
typedef unsigned char uchar;
|
||
typedef unsigned short ushort;
|
||
typedef unsigned long ulong;
|
||
|
||
typedef char int8;
|
||
typedef signed char sint8;
|
||
typedef unsigned char uint8;
|
||
typedef short int16;
|
||
typedef signed short sint16;
|
||
typedef unsigned short uint16;
|
||
typedef int int32;
|
||
typedef signed int sint32;
|
||
typedef unsigned int uint32;
|
||
|
||
#define LAST_IND(x,part_type) (sizeof(x)/sizeof(part_type) - 1)
|
||
#define HIGH_IND(x,part_type) LAST_IND(x,part_type)
|
||
#define LOW_IND(x,part_type) 0
|
||
|
||
|
||
// Partially defined types. They are used when the decompiler does not know
|
||
// anything about the type except its size.
|
||
#define _BYTE uint8
|
||
#define _WORD uint16
|
||
#define _DWORD uint32
|
||
#define _QWORD uint64
|
||
typedef int32_t _BOOL4;
|
||
#define BYTEn(x, n) (*((_BYTE*)&(x)+n))
|
||
#define WORDn(x, n) (*((_WORD*)&(x)+n))
|
||
#define DWORDn(x, n) (*((_DWORD*)&(x)+n))
|
||
|
||
#define LOBYTE(x) BYTEn(x,LOW_IND(x,_BYTE))
|
||
#define LOWORD(x) WORDn(x,LOW_IND(x,_WORD))
|
||
#define LODWORD(x) DWORDn(x,LOW_IND(x,_DWORD))
|
||
#define HIBYTE(x) BYTEn(x,HIGH_IND(x,_BYTE))
|
||
#define HIWORD(x) WORDn(x,HIGH_IND(x,_WORD))
|
||
#define HIDWORD(x) DWORDn(x,HIGH_IND(x,_DWORD))
|
||
|
||
|
||
static const unsigned s_vex_vv[] = {
|
||
//#include "vex_vv.h"
|
||
//#include "test_vv.h"
|
||
#include "test.h"
|
||
};
|
||
|
||
//static const unsigned s_pix_p[] = {
|
||
// #include "pix_p.h"
|
||
// #include "test_p.h"
|
||
//};
|
||
|
||
struct ShaderBinaryInfo
|
||
{
|
||
uint8_t m_signature[7]; // 'OrbShdr'
|
||
uint8_t m_version; // ShaderBinaryInfoVersion
|
||
|
||
unsigned int m_pssl_or_cg : 1; // 1 = PSSL / Cg, 0 = IL / shtb
|
||
unsigned int m_cached : 1; // 1 = when compile, debugging source was cached. May only make sense for PSSL=1
|
||
uint32_t m_type : 4; // See enum ShaderBinaryType
|
||
uint32_t m_source_type : 2; // See enum ShaderSourceType
|
||
unsigned int m_length : 24; // Binary code length (does not include this structure or any of its preceding associated tables)
|
||
|
||
uint8_t m_chunkUsageBaseOffsetInDW; // in DW, which starts at ((uint32_t*)&ShaderBinaryInfo) - m_chunkUsageBaseOffsetInDW; max is currently 7 dwords (128 T# + 32 V# + 20 CB V# + 16 UAV T#/V#)
|
||
uint8_t m_numInputUsageSlots; // Up to 16 user data reg slots + 128 extended user data dwords supported by CUE; up to 16 user data reg slots + 240 extended user data dwords supported by InputUsageSlot
|
||
uint8_t m_isSrt : 1; // 1 if this shader uses shader resource tables and has an SrtDef table embedded below the input usage table and any extended usage info
|
||
uint8_t m_isSrtUsedInfoValid : 1; // 1 if SrtDef::m_isUsed=0 indicates an element is definitely unused; 0 if SrtDef::m_isUsed=0 indicates only that the element is not known to be used (m_isUsed=1 always indicates a resource is known to be used)
|
||
uint8_t m_isExtendedUsageInfo : 1; // 1 if this shader has extended usage info for the InputUsage table embedded below the input usage table
|
||
uint8_t m_reserved2 : 5; // For future use
|
||
uint8_t m_reserved3; // For future use
|
||
|
||
uint32_t m_shaderHash0; // Association hash first 4 bytes
|
||
uint32_t m_shaderHash1; // Association hash second 4 bytes
|
||
uint32_t m_crc32; // crc32 of shader + this struct, just up till this field
|
||
};
|
||
|
||
typedef enum ShaderStage
|
||
{
|
||
kShaderStageCs = 0x00000000, ///< Compute shader stage.
|
||
kShaderStagePs = 0x00000001, ///< Pixel shader stage.
|
||
kShaderStageVs = 0x00000002, ///< Vertex shader stage.
|
||
kShaderStageGs = 0x00000003, ///< Geometry shader stage.
|
||
kShaderStageEs = 0x00000004, ///< Export shader stage.
|
||
kShaderStageHs = 0x00000005, ///< Hull shader stage.
|
||
kShaderStageLs = 0x00000006, ///< LDS shader stage.
|
||
|
||
kShaderStageCount ///< The number of shader stages.
|
||
} ShaderStage;
|
||
|
||
|
||
typedef enum ShaderConstantDwordSize
|
||
{
|
||
kDwordSizeResource = 8, ///< <c>T#</c> or padded <c>V#</c>.
|
||
kDwordSizeRwResource = 8, ///< <c>T#</c> or padded <c>V#</c>.
|
||
kDwordSizeSampler = 4, ///< <c>S#</c>.
|
||
kDwordSizeVertexBuffer = 4, ///< <c>V#</c>.
|
||
kDwordSizeConstantBuffer = 4, ///< <c>V#</c>.
|
||
kDwordSizeBoolConstant = 1, ///< 32 packed bits.
|
||
kDwordSizeFloatConstant = 1, ///< IEEE single-precision float.
|
||
kDwordSizeAppendConsumeCounterRange = 1, ///< Two 16-bit integers packed together.
|
||
kDwordSizeStreamoutBuffer = 4, ///< <c>V#</c>.
|
||
kDwordSizeExtendedUserData = 128, ///< Space for extra shader user data.
|
||
kDwordSizeDispatchDrawData = 32, ///< Space for 32 <c>DWORD</c>s of DispatchDraw data.
|
||
kDwordSizeGdsMemoryRange = 1, ///< Two 16-bit integers packed together.
|
||
} ShaderConstantDwordSize;
|
||
|
||
|
||
typedef enum FetchShaderInstancingMode
|
||
{
|
||
kFetchShaderUseVertexIndex = 0x0, ///< No instancing; use Vertex Index. Default.
|
||
kFetchShaderUseInstanceId = 0x1, ///< Use the instance ID to index the data.
|
||
kFetchShaderUseInstanceIdOverStepRate0 = 0x2, ///< Use the (instance ID / step rate 0) to index the data.
|
||
kFetchShaderUseInstanceIdOverStepRate1 = 0x3, ///< Use the (instance ID / step rate 1) to index the data.
|
||
} FetchShaderInstancingMode;
|
||
|
||
|
||
const int32_t kMaxResourceCount = 16;
|
||
const int32_t kMaxRwResourceCount = 16;
|
||
const int32_t kMaxSamplerCount = 16;
|
||
const int32_t kMaxVertexBufferCount = 16;
|
||
const int32_t kMaxConstantBufferCount = 20;
|
||
const int32_t kMaxStreamOutBufferCount = 4;
|
||
|
||
|
||
const int32_t kMaxUserDataCount = 16; ///< PSSL compiler limit is 16, count not tracked by the InputResourceOffsets table
|
||
const int32_t kMaxSrtUserDataCount = 16; ///< PSSL compiler limit is 16, count not tracked by the InputResourceOffsets table
|
||
|
||
const int32_t kMaxResourceBufferCount = 4; ///< Maximum number for supported splits for the resource buffer per LCUE instance
|
||
const int32_t kMaxPsInputUsageCount = 32; ///< Maximum number of interpolants a PS Stage can receive
|
||
|
||
const int32_t kDefaultFetchShaderPtrSgpr = 0; ///< Default SGPR in PSSL
|
||
const int32_t kDefaultVertexBufferTablePtrSgpr = 2; ///< Default SGPR in PSSL
|
||
const int32_t kDefaultGlobalInternalTablePtrSgpr = 0; ///< Default SGPR in PSSL, Note: it has lower priority than FetchPtr (sgpr would be s[4:5], after FetchPtr and VbPtr)
|
||
const int32_t kDefaultStreamOutTablePtrSgpr = 2; ///< Default SGPR in PSSL, only used by VS copy shader in GS active stage
|
||
const int32_t kDefaultVertexldsEsGsSizeSgpr = 0; ///< Default SGPR in PSSL, only used by VS copy shader in GS active stage
|
||
|
||
const int32_t kResourceInUserDataSgpr = 0x8000; ///< In User data resource Mask
|
||
const int32_t kResourceIsVSharp = 0x4000; ///< VSharp resource Mask Note: only used/available for immediate resources
|
||
const int32_t kResourceValueMask = 0x3FFF; ///< Resource memory offset is stored in the lower 14-bits
|
||
|
||
// On-chip GS constants
|
||
const uint32_t kOnChipGsInvalidSignature = 0xFFFFFFFFU;
|
||
|
||
// Tessellation distribution constants (Neo only)
|
||
const uint32_t kTesselationDistrbutionMask = 0x7FFFFFFFU; ///< Tessellation Distribution mask HS shader stage
|
||
const uint32_t kTesselationDistrbutionEnabledMask = ~kTesselationDistrbutionMask; ///< Tessellation Distribution enabled for HS shader stage
|
||
|
||
// Shader stage constants
|
||
const uint32_t kShaderStageAsynchronousCompute = kShaderStageCount; ///< Additional stage for CsVs dispatch Draw shader
|
||
const uint32_t kNumShaderStages = kShaderStageCount + 1; ///< Number of unique shader stages for resource binding
|
||
|
||
// 6KB is enough to store anything you can bind to a GPU shader stage, all counted in <c>DWORD</c>s
|
||
const int32_t kGpuStageBufferSizeInDwords = (6 * 1024) / sizeof(uint32_t); ///< Size of Single buffer Stage
|
||
const int32_t kComputeScratchBufferSizeInDwords = kGpuStageBufferSizeInDwords; ///< Size of the Compute Scratch buffer
|
||
const int32_t kGraphicsScratchBufferSizeInDwords = kNumShaderStages * kGpuStageBufferSizeInDwords; ///< Size of the Graphics Scratch buffer (encompasses all graphics shader stages)
|
||
//const int32_t kGlobalInternalTableSizeInDwords = sce::kShaderGlobalResourceCount * sizeof(sce::Buffer) / sizeof(uint32_t); ///< Size of a global resource table
|
||
|
||
// Internal constant buffers that are expected at fixed API-slots
|
||
const int32_t kConstantBufferInternalApiSlotForEmbeddedData = 15; ///< Immediate/Embedded constant buffer fixed API-slot (any GPU stage).
|
||
const int32_t kConstantBufferInternalApiSlotReserved0 = 16; ///< Slot 16 is reserved by compiler
|
||
const int32_t kConstantBufferInternalApiSlotReserved1 = 17; ///< Slot 17 is reserved by compiler
|
||
const int32_t kConstantBufferInternalApiSlotReserved2 = 18; ///< Slot 18 is reserved by compiler
|
||
const int32_t kConstantBufferInternalApiSlotForTessellation = 19; ///< Tessellation constant buffer (with strides for LDS data) fixed API-slot (HS,VS/ES GPU stages).
|
||
|
||
// Internal constants for ShaderBinaryInfo
|
||
const uint64_t kShaderBinaryInfoSignatureMask = 0x00ffffffffffffffLL;
|
||
const uint64_t kShaderBinaryInfoSignatureU64 = 0x007264685362724fLL;
|
||
|
||
// DispatchDraw constants
|
||
const uint32_t kNumDispatchDrawRingBuffersRolls = 16; ///< Number of times the dispatch ring buffers can be rolled in a LCUE instance, see setupDispatchDrawRingBuffers()
|
||
|
||
|
||
struct InputResourceOffsets
|
||
{
|
||
uint16_t requiredBufferSizeInDwords; ///< Specifies how much memory needs to be reserved to store all memory-based resources. These are things not set through PM4.
|
||
bool isSrtShader; ///< A flag that specifies whether the shader makes use of SRTs.
|
||
uint8_t shaderStage; ///< The shader stage (LS/HS/ES/GS/VS/PS) for the shader resources offsets.
|
||
|
||
// For each available shader-resource-ptr, store the starting SGPR s[0:254] where it'll be set (<c>0xFF</c> means not used). Pointers take 2 SGPRs (64b) and must be 2DW aligned
|
||
uint8_t fetchShaderPtrSgpr; ///< The SGPR containing the fetch shader pointer. If this exists, <c>s[0:1]</c> is always used.
|
||
uint8_t vertexBufferPtrSgpr; ///< The SGPR containing the vertex buffer table pointer. If this exists, <c>s[2:3]</c> is always used, but only in the vertex pipeline.
|
||
uint8_t streamOutPtrSgpr; ///< The SGPR containing the stream out buffer pointer. If this exists, <c>s[2:3]</c> is always used, but only in the Geometry pipeline.
|
||
uint8_t userExtendedData1PtrSgpr; ///< The SGPR containing the user extended data table pointer.
|
||
// uint8_t userInternalSrtDataPtrSgpr; ///< *Note: Not supported for now*.
|
||
uint8_t constBufferPtrSgpr; ///< The SGPR containing the constant buffer table pointer.
|
||
uint8_t resourcePtrSgpr; ///< The SGPR containing the resource buffer table pointer.
|
||
uint8_t rwResourcePtrSgpr; ///< The SGPR containing the read/write resource buffer table pointer.
|
||
uint8_t samplerPtrSgpr; ///< The SGPR containing the sampler buffer table pointer.
|
||
uint8_t globalInternalPtrSgpr; ///< The SGPR containing the global internal pointer, which is either stored in <c>s[0:1]</c> or <c>s[4:5]</c>.
|
||
uint8_t appendConsumeCounterSgpr; ///< The SGPR containing the 32bit value address and size used from GDS.
|
||
uint8_t gdsMemoryRangeSgpr; ///< The SGPR containing the GDS address range for storage.
|
||
uint8_t ldsEsGsSizeSgpr; ///< The SGPR containing the GWS resource base offset.
|
||
uint8_t userSrtDataSgpr; ///< The SGPR containing the start offset of the SRT Data Buffer.
|
||
uint8_t userSrtDataCount; ///< The number of <c>DWORD</c>s in use by the SRT Data Buffer. The size will be between 1-8.
|
||
|
||
uint8_t gdsKickRingBufferOffsetSgpr; ///< The SGPR containing the GDS kick ring buffer offset for DispatchDraw.
|
||
uint8_t vertexRingBufferOffsetSgpr; ///< The SGPR containing the Vertex kick ring buffer offset for DispatchDraw.
|
||
uint8_t dispatchDrawPtrSgpr; ///< The SGPR containing the DispatchDraw data pointer to DispatchDrawTriangleCullData structure.
|
||
uint8_t dispatchDrawInstancesSgpr; ///< The SGPR containing the number of instances for DispatchDraw.
|
||
|
||
// For each available shader-resource-flat-table (aka array), store the memory offset (from the start of the buffer) to the beginning of its flat-table (0xFFFF means it's not used).
|
||
// Note: arrays are 0 indexed but the user can skip/set any index inside the range, allowing gaps at any place. This accelerates setting the pointer to the beginning of flat-tables.
|
||
uint16_t constBufferArrayDwOffset; ///< The constant buffer table offset into the main buffer.
|
||
uint16_t vertexBufferArrayDwOffset; ///< The vertex buffer table offset into the main buffer.
|
||
uint16_t resourceArrayDwOffset; ///< The resource buffer table offset into the main buffer.
|
||
uint16_t rwResourceArrayDwOffset; ///< The read/write resource buffer table offset into the main buffer.
|
||
uint16_t samplerArrayDwOffset; ///< The sampler buffer table offset into the main buffer.
|
||
uint16_t streamOutArrayDwOffset; ///< The stream out buffer table offset into the main buffer. This is only for the Geometry pipeline.
|
||
|
||
// For each logical shader API slot, store either: an offset to a memory location, or a User Data (UD) SGPR where the resource should be set.
|
||
// Note: if (item[i]&kResourceInUserDataSgpr) it's set directly into s[0:15] using PM4 packets, otherwise it's copied into the scratch buffer using the offset.
|
||
uint16_t resourceDwOffset[kMaxResourceCount]; ///< The start offset of a resource in the resource buffer table or user data.
|
||
uint16_t rwResourceDwOffset[kMaxRwResourceCount]; ///< The start offset of a resource in the read/write resource buffer table or user data.
|
||
uint16_t samplerDwOffset[kMaxSamplerCount]; ///< The start offset of a sampler in the sampler buffer table or user data.
|
||
uint16_t constBufferDwOffset[kMaxConstantBufferCount]; ///< The start offset of a constant buffer in the constant buffer table or user data.
|
||
uint16_t vertexBufferDwOffset[kMaxVertexBufferCount]; ///< The start offset of a vertex array in the vertex buffer table or user data.
|
||
uint16_t streamOutDwOffset[kMaxStreamOutBufferCount];///< The start offset of a stream out buffer in the stream out buffer table or user data. This is only for the Geometry pipeline.
|
||
|
||
uint8_t resourceSlotCount; ///< The number of resource slots used by the shader.
|
||
uint8_t rwResourceSlotCount; ///< The number of rw resource slots used by the shader.
|
||
uint8_t samplerSlotCount; ///< The number of sampler slots used by the shader.
|
||
uint8_t constBufferSlotCount; ///< The number of constant buffer slots used by the shader.
|
||
uint8_t vertexBufferSlotCount; ///< The number of vertex buffer slots used by the shader.
|
||
|
||
uint8_t pad[1];
|
||
|
||
|
||
/** @brief Initializes several resource slots that the shader uses.
|
||
*/
|
||
void initSupportedResourceCounts()
|
||
{
|
||
resourceSlotCount = kMaxResourceCount;
|
||
rwResourceSlotCount = kMaxRwResourceCount;
|
||
samplerSlotCount = kMaxSamplerCount;
|
||
constBufferSlotCount = kMaxConstantBufferCount;
|
||
vertexBufferSlotCount = kMaxVertexBufferCount;
|
||
}
|
||
|
||
// kShaderInputUsageImmAluFloatConst // Immediate float const (scalar or vector). *Not Supported*
|
||
// kShaderInputUsageImmAluBool32Const // 32 immediate Booleans packed into one UINT. *Not Supported*
|
||
};
|
||
|
||
typedef enum ShaderInputUsageType
|
||
{
|
||
kShaderInputUsageImmResource = 0x00, ///< Immediate read-only buffer/texture descriptor.
|
||
kShaderInputUsageImmSampler = 0x01, ///< Immediate sampler descriptor.
|
||
kShaderInputUsageImmConstBuffer = 0x02, ///< Immediate constant buffer descriptor.
|
||
kShaderInputUsageImmVertexBuffer = 0x03, ///< Immediate vertex buffer descriptor.
|
||
kShaderInputUsageImmRwResource = 0x04, ///< Immediate read/write buffer/texture descriptor.
|
||
kShaderInputUsageImmAluFloatConst = 0x05, ///< Immediate float const (scalar or vector).
|
||
kShaderInputUsageImmAluBool32Const = 0x06, ///< 32 immediate Booleans packed into one UINT.
|
||
kShaderInputUsageImmGdsCounterRange = 0x07, ///< Immediate UINT with GDS address range for counters (used for append/consume buffers).
|
||
kShaderInputUsageImmGdsMemoryRange = 0x08, ///< Immediate UINT with GDS address range for storage.
|
||
kShaderInputUsageImmGwsBase = 0x09, ///< Immediate UINT with GWS resource base offset.
|
||
kShaderInputUsageImmShaderResourceTable = 0x0A, ///< Pointer to read/write resource indirection table.
|
||
kShaderInputUsageImmLdsEsGsSize = 0x0D, ///< Immediate LDS ESGS size used in on-chip GS
|
||
// Skipped several items here...
|
||
kShaderInputUsageSubPtrFetchShader = 0x12, ///< Immediate fetch shader subroutine pointer.
|
||
kShaderInputUsagePtrResourceTable = 0x13, ///< Flat resource table pointer.
|
||
kShaderInputUsagePtrInternalResourceTable = 0x14, ///< Flat internal resource table pointer.
|
||
kShaderInputUsagePtrSamplerTable = 0x15, ///< Flat sampler table pointer.
|
||
kShaderInputUsagePtrConstBufferTable = 0x16, ///< Flat const buffer table pointer.
|
||
kShaderInputUsagePtrVertexBufferTable = 0x17, ///< Flat vertex buffer table pointer.
|
||
kShaderInputUsagePtrSoBufferTable = 0x18, ///< Flat stream-out buffer table pointer.
|
||
kShaderInputUsagePtrRwResourceTable = 0x19, ///< Flat read/write resource table pointer.
|
||
kShaderInputUsagePtrInternalGlobalTable = 0x1A, ///< Internal driver table pointer.
|
||
kShaderInputUsagePtrExtendedUserData = 0x1B, ///< Extended user data pointer.
|
||
kShaderInputUsagePtrIndirectResourceTable = 0x1C, ///< Pointer to resource indirection table.
|
||
kShaderInputUsagePtrIndirectInternalResourceTable = 0x1D, ///< Pointer to internal resource indirection table.
|
||
kShaderInputUsagePtrIndirectRwResourceTable = 0x1E, ///< Pointer to read/write resource indirection table.
|
||
// Skipped several items here...
|
||
kShaderInputUsageImmGdsKickRingBufferOffset = 0x22, ///< Immediate UINT offset into GDS kick ring buffer for DispatchDraw. This must not be in extended user data.
|
||
kShaderInputUsageImmVertexRingBufferOffset = 0x23, ///< Immediate UINT offset into vertex ring buffer for DispatchDraw. This must not be in extended user data.
|
||
kShaderInputUsagePtrDispatchDraw = 0x24, ///< Pointer to DispatchDraw data. This must not be in extended user data.
|
||
kShaderInputUsageImmDispatchDrawInstances = 0x25, ///< Immediate UINT ((firstInstance<<16)|(numInstances-1)). This must not be in extended user data.
|
||
} ShaderInputUsageType;
|
||
|
||
class VsShader;
|
||
class PsShader;
|
||
class ShaderInfo
|
||
{
|
||
public:
|
||
|
||
union
|
||
{
|
||
const void *m_shaderStruct; ///< A pointer to the shader struct -- typeless.
|
||
const VsShader* m_vsShader;
|
||
const PsShader* m_psShader;
|
||
};
|
||
|
||
const uint32_t *m_gpuShaderCode; ///< A pointer to the GPU Shader Code which will need to be copied into GPU visible memory.
|
||
uint32_t m_gpuShaderCodeSize; ///< The size of the GPU Shader Code in bytes.
|
||
uint32_t m_reserved;
|
||
};
|
||
|
||
class PipelineStage
|
||
{
|
||
public:
|
||
/// Represents vertex shader information.
|
||
class VsInfo
|
||
{
|
||
public:
|
||
uint8_t m_vertexShaderVariant; ///< The <c>PsslVertexVariant</c> such as <c>kVertexVariantVertex</c>, <c>kVertexVariantExport</c>, <c>kVertexVariantLocal</c> etc.
|
||
uint8_t m_paddingTo32[3]; ///< Padding.
|
||
};
|
||
|
||
/// Represents domain shader information.
|
||
class DsInfo
|
||
{
|
||
public:
|
||
uint8_t m_domainShaderVariant; ///< The <c>PsslDomainVariant</c> such as <c>kDomainVariantVertex</c>, <c>kDomainVariantExport</c> etc.
|
||
uint8_t m_paddingTo32[3]; ///< Padding.
|
||
};
|
||
|
||
/// Represents geometry shader information.
|
||
class GsInfo
|
||
{
|
||
public:
|
||
uint8_t m_geometryShaderVariant; ///< The <c>PsslGeometryVariant</c> such as <c>kGeometryVariantOnBuffer</c>, <c>kGeometryVariantOnChip</c> etc.
|
||
uint8_t m_paddingTo32[3]; ///< Padding.
|
||
};
|
||
|
||
/// Represents geometry shader information.
|
||
class HsInfo
|
||
{
|
||
public:
|
||
uint8_t m_hullShaderVariant; ///< The <c>PsslHullVariant</c> such as <c>kHullVariantOnBuffer</c>, <c>kHullVariantOnChip</c> etc.
|
||
uint8_t m_paddingTo32[3]; ///< Padding.
|
||
};
|
||
|
||
/// Stores data as different class types, depending on the type of shader.
|
||
union
|
||
{
|
||
uint32_t m_u32; ///< An unsigned 32 bit integer.
|
||
VsInfo m_vsInfo; ///< The vertex shader information.
|
||
DsInfo m_dsInfo; ///< The domain shader information.
|
||
GsInfo m_gsInfo; ///< The geometry shader information.
|
||
HsInfo m_hsInfo; ///< The hull shader information.
|
||
};
|
||
};
|
||
|
||
class SystemAttributes
|
||
{
|
||
public:
|
||
/// Represents CS pipeline stage information.
|
||
class CsInfo
|
||
{
|
||
public:
|
||
uint16_t m_numThreads[3]; ///< The number of threads.
|
||
};
|
||
|
||
/// Represents GS pipeline stage information.
|
||
class GsInfo
|
||
{
|
||
public:
|
||
uint16_t m_instance; ///< The instance of the GS Shader.
|
||
uint16_t m_maxVertexCount; ///< The maximum number of vertices count.
|
||
uint8_t m_inputType; ///< The GS Input Type (<c>PsslGsIoType</c>) such as triangle, line, point, adjacent tri + line, or patch.
|
||
uint8_t m_outputType; ///< The GS Output Type (<c>PsslGsIoType</c>) such as triangles, lines, or points.
|
||
uint8_t m_patchSize; ///< The patch size in case of patch topology.
|
||
};
|
||
|
||
/// Represents DS pipeline stage information.
|
||
class DsInfo
|
||
{
|
||
public:
|
||
uint8_t m_domainPatchType; ///< The <c>PsslHsDsPatchType</c>: triangle, quad, or isoline.
|
||
uint8_t m_inputControlPoints; ///< The number of points in the input patch.
|
||
};
|
||
|
||
/// Represents HS pipeline stage information.
|
||
class HsInfo
|
||
{
|
||
public:
|
||
uint8_t m_domainPatchType; ///< The <c>PsslHsDsPatchType</c>: triangle, quad, or isoline.
|
||
uint8_t m_inputControlPoints; ///< The number of points in the input patch.
|
||
uint8_t m_outputTopologyType; ///< The <c>PsslHsTopologyType</c>: point, line, cwtri, or ccwtri.
|
||
uint8_t m_partitioningType; ///< The <c>PsslHsPartitioningType</c>: integer, powof2, odd_fractional, or even_fractional.
|
||
|
||
uint8_t m_outputControlPoints; ///< The number of points in the output patch.
|
||
uint8_t m_patchSize; ///< The size of patch.
|
||
uint8_t m_paddingTo32[2]; ///< Padding.
|
||
|
||
float m_maxTessFactor; ///< The maximum tessellation factor.
|
||
};
|
||
|
||
/// Stores data as different class types, depending on the type of shader.
|
||
union
|
||
{
|
||
uint32_t m_u32[3]; ///< 12 bytes.
|
||
CsInfo m_csInfo; ///< The compute shader information.
|
||
GsInfo m_gsInfo; ///< The geometry shader information.
|
||
DsInfo m_dsInfo; ///< The domain shader information.
|
||
HsInfo m_hsInfo; ///< The hull shader information.
|
||
};
|
||
};
|
||
|
||
class Header
|
||
{
|
||
public:
|
||
uint8_t m_formatVersionMajor; ///< The version of shader binary format: major numbering.
|
||
uint8_t m_formatVersionMinor; ///< The version of shader binary format: minor numbering.
|
||
uint16_t m_compilerRevision; ///< The compiler type specific version of shader compiler: this is the svn revision for m_compilerType==kCompilerTypeOrbisPsslc or kCompilerTypeOrbisEsslc or for kCompilerTypeUnspecified (pre-SDK 2.500 versions of these compilers)
|
||
|
||
uint32_t m_associationHash0; ///< The shader association hash 1.
|
||
uint32_t m_associationHash1; ///< The shader association hash 2.
|
||
|
||
uint8_t m_shaderType; ///< The <c>PsslShaderType</c>: VS, PS, GS, CS, GS, HS, or DS.
|
||
uint8_t m_codeType; ///< The <c>PsslCodeType</c>: IL, ISA, or SCU.
|
||
uint8_t m_usesShaderResourceTable; ///< The shader uses resource table.
|
||
uint8_t m_compilerType : 4; ///< The <c>PsslCompilerType</c>; 0
|
||
uint8_t m_paddingTo32 : 4; // 0; reserved for future use
|
||
|
||
uint32_t m_codeSize; ///< The size of code section.
|
||
|
||
PipelineStage m_shaderTypeInfo; ///< The shader pipeline stage info.
|
||
SystemAttributes m_shaderSystemAttributeInfo; ///< The shader system attribute info.
|
||
};
|
||
|
||
class ShaderFileHeader
|
||
{
|
||
public:
|
||
uint32_t m_fileHeader; ///< File identifier. Should be equal to kShaderFileHeaderId
|
||
uint16_t m_majorVersion; ///< Major version of the shader binary.
|
||
uint16_t m_minorVersion; ///< Minor version of the shader binary.
|
||
uint8_t m_type; ///< Type of shader. Comes from ShaderType.
|
||
uint8_t m_shaderHeaderSizeInDW; ///< <c>\<Type\>Shader.computeSize()/4</c>. For example, see CsShader::computeSize().
|
||
uint8_t m_shaderAuxData; ///< A flag that indicates whether shader auxiliary data is present after end of the shader data ( <c>sizeof(ShaderFileHeader) +</c>
|
||
///< <c>m_shaderHeaderSizeInDW * 4 + ShaderCommonData::m_shaderSize +</c>
|
||
///< <c>ShaderCommonData::m_embeddedConstantBufferSizeInDQW * 16)</c>. Set to 1 to indicate it is
|
||
uint8_t m_targetGpuModes; ///< Union of all TargetGpuMode values for which this shader binary is valid.
|
||
uint32_t m_reserved1; ///< Must be 0.
|
||
};
|
||
|
||
class ShaderCommonData
|
||
{
|
||
public:
|
||
// Memory Layout:
|
||
// - Shader setup data (starting with ShaderCommonData)
|
||
// - n InputUsage (4 bytes each)
|
||
// - immediateConstants
|
||
uint32_t m_shaderSize : 23; ///< The size of the shader binary code block in bytes.
|
||
uint32_t m_shaderIsUsingSrt : 1; ///< A bitflag that indicates if the shader is using a Shader Resource Table.
|
||
uint32_t m_numInputUsageSlots : 8; ///< The number of InputUsageSlot entries following the main shader structure.
|
||
uint16_t m_embeddedConstantBufferSizeInDQW; ///< The size of the embedded constant buffer in 16-byte <c>DWORD</c>s.
|
||
uint16_t m_scratchSizeInDWPerThread; ///< The scratch size required by each thread in 4-byte <c>DWORD</c>s.
|
||
|
||
/** @brief Calculates and returns the size of the shader code including its embedded CB size in bytes */
|
||
uint32_t computeShaderCodeSizeInBytes() const { return m_shaderSize + m_embeddedConstantBufferSizeInDQW * 16; }
|
||
};
|
||
|
||
#ifdef __cplusplus
|
||
class InputUsageSlot
|
||
#else // __cplusplus
|
||
typedef struct InputUsageSlot
|
||
#endif // __cplusplus
|
||
{
|
||
#ifdef __cplusplus
|
||
public:
|
||
#endif // __cplusplus
|
||
uint8_t m_usageType; ///< From ShaderInputUsageType.
|
||
uint8_t m_apiSlot; ///< API slot or chunk ID.
|
||
uint8_t m_startRegister; ///< User data slot.
|
||
|
||
union
|
||
{
|
||
struct
|
||
{
|
||
uint8_t m_registerCount : 1; ///< If 0, count is 4DW; if 1, count is 8DW. Other sizes are defined by the usage type.
|
||
uint8_t m_resourceType : 1; ///< If 0, resource type <c>V#</c>; if 1, resource type <c>T#</c>, in case of a kShaderInputUsageImmResource.
|
||
uint8_t m_reserved : 2; ///< Unused; must be set to zero.
|
||
uint8_t m_chunkMask : 4; ///< Internal usage data.
|
||
};
|
||
uint8_t m_srtSizeInDWordMinusOne; ///< Size of the SRT data; used for kShaderInputUsageImmShaderResourceTable.
|
||
};
|
||
#ifdef __cplusplus
|
||
} SCE_GNM_SET_ALIGN(4);
|
||
#else // __cplusplus
|
||
} InputUsageSlot;
|
||
#endif // __cplusplus
|
||
|
||
|
||
#ifdef __cplusplus
|
||
class VertexInputSemantic
|
||
#else // __cplusplus
|
||
typedef struct VertexInputSemantic
|
||
#endif // __cplusplus
|
||
{
|
||
#ifdef __cplusplus
|
||
public:
|
||
#endif // __cplusplus
|
||
uint8_t m_semantic;
|
||
uint8_t m_vgpr;
|
||
uint8_t m_sizeInElements;
|
||
uint8_t m_reserved; ///< Unused; must be set to zero.
|
||
#ifdef __cplusplus
|
||
};
|
||
#else // __cplusplus
|
||
} VertexInputSemantic;
|
||
#endif // __cplusplus
|
||
|
||
#ifdef __cplusplus
|
||
class VertexExportSemantic
|
||
#else // __cplusplus
|
||
typedef struct VertexExportSemantic
|
||
#endif // __cplusplus
|
||
{
|
||
#ifdef __cplusplus
|
||
public:
|
||
#endif // __cplusplus
|
||
uint8_t m_semantic; ///< Description to be specified.
|
||
uint8_t m_outIndex : 5; ///< Description to be specified.
|
||
uint8_t m_reserved : 1;
|
||
uint8_t m_exportF16 : 2; ///< if (m_exportF16 == 0) this shader exports a 32-bit value to this parameter; if (m_exportF16 & 1) this shader exports a 16-bit float value to the low 16-bits of each channel; if (m_exportF16 & 2) this shader exports a 16-bit float value to the high 16-bits of each channel
|
||
#ifdef __cplusplus
|
||
};
|
||
#else // __cplusplus
|
||
} VertexExportSemantic;
|
||
#endif // __cplusplus
|
||
|
||
|
||
#ifdef __cplusplus
|
||
class VsStageRegisters
|
||
#else // __cplusplus
|
||
typedef struct VsStageRegisters
|
||
#endif // __cplusplus
|
||
{
|
||
#ifdef __cplusplus
|
||
public:
|
||
#endif // __cplusplus
|
||
uint32_t m_spiShaderPgmLoVs; ///< The pointer to shader program (bits 39:8).
|
||
uint32_t m_spiShaderPgmHiVs; ///< The pointer to shader program (bits 47:40). This must be set to zero.
|
||
|
||
uint32_t m_spiShaderPgmRsrc1Vs;
|
||
uint32_t m_spiShaderPgmRsrc2Vs;
|
||
|
||
uint32_t m_spiVsOutConfig;
|
||
uint32_t m_spiShaderPosFormat;
|
||
uint32_t m_paClVsOutCntl;
|
||
|
||
#ifdef __cplusplus
|
||
|
||
|
||
/** @brief Patches the GPU address of the shader code.
|
||
|
||
@param[in] gpuAddress This address to patch. This must be aligned to a 256-byte boundary.
|
||
*/
|
||
void patchShaderGpuAddress(void *gpuAddress)
|
||
{
|
||
m_spiShaderPgmLoVs = static_cast<uint32_t>(uintptr_t(gpuAddress) >> 8);
|
||
m_spiShaderPgmHiVs = static_cast<uint32_t>(uintptr_t(gpuAddress) >> 40);
|
||
}
|
||
|
||
bool isSharingContext(const VsStageRegisters shader) const
|
||
{
|
||
return !((m_spiVsOutConfig - shader.m_spiVsOutConfig)
|
||
| (m_spiShaderPosFormat - shader.m_spiShaderPosFormat)
|
||
| (m_paClVsOutCntl - shader.m_paClVsOutCntl));
|
||
}
|
||
|
||
#endif // __cplusplus
|
||
|
||
#ifdef __cplusplus
|
||
};
|
||
#else // __cplusplus
|
||
} VsStageRegisters;
|
||
#endif // __cplusplus
|
||
|
||
|
||
|
||
#ifdef __cplusplus
|
||
class PsStageRegisters
|
||
#else // __cplusplus
|
||
typedef struct PsStageRegisters
|
||
#endif // __cplusplus
|
||
{
|
||
#ifdef __cplusplus
|
||
public:
|
||
#endif // __cplusplus
|
||
uint32_t m_spiShaderPgmLoPs; ///< A pointer to shader program (bits 39:8).
|
||
uint32_t m_spiShaderPgmHiPs; ///< A pointer to shader program (bits 47:40). This must be set to zero.
|
||
|
||
uint32_t m_spiShaderPgmRsrc1Ps;
|
||
uint32_t m_spiShaderPgmRsrc2Ps;
|
||
|
||
uint32_t m_spiShaderZFormat;
|
||
uint32_t m_spiShaderColFormat;
|
||
|
||
uint32_t m_spiPsInputEna;
|
||
uint32_t m_spiPsInputAddr;
|
||
|
||
uint32_t m_spiPsInControl;
|
||
uint32_t m_spiBarycCntl;
|
||
|
||
uint32_t m_dbShaderControl;
|
||
uint32_t m_cbShaderMask;
|
||
|
||
#ifdef __cplusplus
|
||
void patchShaderGpuAddress(void *gpuAddress)
|
||
{
|
||
m_spiShaderPgmLoPs = static_cast<uint32_t>(uintptr_t(gpuAddress) >> 8);
|
||
m_spiShaderPgmHiPs = static_cast<uint32_t>(uintptr_t(gpuAddress) >> 40);
|
||
}
|
||
#endif // __cplusplus
|
||
#ifdef __cplusplus
|
||
};
|
||
#else // __cplusplus
|
||
} PsStageRegisters;
|
||
#endif // __cplusplus
|
||
|
||
#ifdef __cplusplus
class FetchShaderBuildState
#else // __cplusplus
typedef struct FetchShaderBuildState
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
	// Filled up by: Generate[xx]FSBuildState functions
	// (see generateVsFetchShaderBuildState in this file).
	uint16_t m_fetchShaderBufferSize;       ///< Size in bytes of the fetch-shader buffer to allocate (computed by the build-state generators).
	uint16_t m_fetchShaderFlags;            ///< Fetch-shader flags; zeroed by generateVsFetchShaderBuildState.
	uint8_t m_firstFreeSgpr;                ///< Index of the first SGPR not consumed by user/system data (derived from SPI_SHADER_PGM_RSRC2).
	uint8_t m_vsharpSlotCount;              ///< Number of V# slots available to the fetch shader.
	uint16_t m_numElementsInInstancingData; ///< Element count of m_fetchShaderInstancingData (0 when no instancing data is supplied).
	uint32_t m_shaderModifier;              // to be passed to set[x]sShader functions.
	uint32_t m_reserved0;                   ///< Unused; must be set to zero.
	const FetchShaderInstancingMode *m_fetchShaderInstancingData; ///< Optional per-element instancing modes; may be NULL.

	// From the SC (shader compiler):
	uint8_t m_numInputSemantics;            ///< Number of entries in m_inputSemantics.
	uint8_t m_vertexBaseUsgpr;              ///< User SGPR that receives the vertex base offset (0 = none).
	uint8_t m_instanceBaseUsgpr;            ///< User SGPR that receives the instance base offset (0 = none).
	uint8_t m_reserved1;                    ///< Unused; must be set to zero.
	uint32_t m_numInputUsageSlots;          ///< Number of entries in m_inputUsageSlots.
	const VertexInputSemantic *m_inputSemantics; ///< Vertex input semantic table from the shader binary.
	const InputUsageSlot *m_inputUsageSlots;     ///< Input usage slot table from the shader binary.

	// Vertex Buffer Semantic Remap Table: <optional>
	uint32_t m_numElementsInRemapTable;     ///< Number of entries in m_semanticsRemapTable (0 = no remapping).
	uint32_t m_reserved2;                   ///< Unused; must be set to zero.
	const uint32_t *m_semanticsRemapTable;  // vertex buffer index -> vertex buffer semantic (e.g. vbSemantic = m_semanticsRemapTable[vbIndex])
#ifdef __cplusplus
};
#else // __cplusplus
} FetchShaderBuildState;
#endif // __cplusplus
|
||
|
||
|
||
// Describes one pixel-shader input parameter and how it is matched against the
// VS export semantic table and interpolated. The two anonymous structs overlay
// the same 16 bits; the second view re-interprets the top 4 bits for NEO mode
// 16-bit-float interpolation. Bitfield layout is ABI-critical; do not reorder.
#ifdef __cplusplus
class PixelInputSemantic
#else // __cplusplus
typedef struct PixelInputSemantic
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
	union
	{
		struct
		{
			uint16_t m_semantic : 8;     ///< The semantic, matched against the semantic value in the VertexExportSemantic table in the VS shader.
			uint16_t m_defaultValue : 2; ///< The default value supplied to the shader, if m_semantic is not matched in the VS shader. 0={0,0,0,0}, 1={0,0,0,1.0}, 2={1.0,1.0,1.0,0}, 3={1.0,1.0,1.0,1.0}
			uint16_t m_isFlatShaded : 1; ///< if (m_interpF16 == 0) A bitflag that specifies whether the value interpolation is constant in the shader. It is ignored if <c><i>m_isCustom</i></c> is set; otherwise, it indicates that a shader reads only { P0 } and that some handling of infinite values in the calculation of P1-P0 and P2-P0 can be disabled.
			uint16_t m_isLinear : 1;     ///< A bitflag that specifies whether the value interpolation is linear in the shader. It is unused by the Gnm runtime.
			uint16_t m_isCustom : 1;     ///< if (m_interpF16 == 0) A bitflag that specifies whether the value interpolation is custom in the shader. It determines whether hardware subtraction should be disabled, supplying { P0, P1, P2 } to the shader instead of { P0, P1-P0, P2-P0 }.
			uint16_t m_reserved : 3;     ///< Unused; set to zero.
		};
		// NEO mode only: alternate view of the same 16 bits.
		struct
		{
			uint16_t : 12;                 ///< Overlaps m_semantic/m_defaultValue/m_isFlatShaded/m_isLinear above.
			uint16_t m_defaultValueHi : 2; ///< if (m_interpF16 != 0) indicates the default value supplied to the shader for the upper 16-bits if m_semantic is not matched in the VS shader, and m_defaultValue indicates the default value for the lower 16-bits.
			uint16_t m_interpF16 : 2;      ///< if (m_interpF16 == 0) this is a 32-bit float or custom value; if (m_interpF16 & 1) the low 16-bits of this parameter expect 16-bit float interpolation and/or default value; if (m_interpF16 & 2) the high 16-bits of this parameter expect 16-bit float interpolation and/or default value
		};
	};
#ifdef __cplusplus
};
#else // __cplusplus
} PixelInputSemantic;
#endif // __cplusplus
|
||
|
||
class VsShader
|
||
{
|
||
public:
|
||
ShaderCommonData m_common; ///< The common data for all shader stages.
|
||
|
||
VsStageRegisters m_vsStageRegisters; ///< The data to be loaded into the VS shader stage registers. Please see DrawCommandBuffer::setVsShader() for more information.
|
||
// not used if domain shader => vertex shader
|
||
|
||
uint8_t m_numInputSemantics; ///< The number of entries in the input semantic table.
|
||
uint8_t m_numExportSemantics; ///< The number of entries in the export semantic table.
|
||
uint8_t m_gsModeOrNumInputSemanticsCs; ///< Stores a union of VsShaderGsMode values for a VsShader or GsShader::getCopyShader(), which are translated into a GsMode constant. For CsVsShader::getVertexShader() with CsVsShader::getComputeShader()->m_version==0, the number of input semantic table entries to use for the CsVsShader::getComputeShader() fetch shader is stored.
|
||
uint8_t m_fetchControl; ///< The user registers that receive vertex and instance offsets for use in the fetch shader.
|
||
|
||
|
||
/** @brief Patches the GPU address of the shader code.
|
||
|
||
@param[in] gpuAddress This address to patch. This must be aligned to a 256-byte boundary.
|
||
*/
|
||
void patchShaderGpuAddress(void *gpuAddress)
|
||
{
|
||
m_vsStageRegisters.patchShaderGpuAddress(gpuAddress);
|
||
}
|
||
|
||
void *getBaseAddress() const
|
||
{
|
||
return (void *)((((uintptr_t)m_vsStageRegisters.m_spiShaderPgmHiVs) << 40) | (((uintptr_t)m_vsStageRegisters.m_spiShaderPgmLoVs) << 8));
|
||
}
|
||
|
||
/** @brief Gets a pointer to this shader's input usage slot table that immediately follows this shader's structure in memory.
|
||
|
||
@return A pointer to this shader's input usage slot table.
|
||
*/
|
||
const InputUsageSlot *getInputUsageSlotTable() const { return (const InputUsageSlot *)(this + 1); }
|
||
|
||
/** @brief Gets a pointer to this shader's input semantic table that immediately follows the input usage table in memory.
|
||
|
||
@return A pointer to this shader's input semantic table.
|
||
*/
|
||
const VertexInputSemantic *getInputSemanticTable() const { return (const VertexInputSemantic *)(getInputUsageSlotTable() + m_common.m_numInputUsageSlots); }
|
||
|
||
/** @brief Gets a pointer to this shader's export semantic table that immediately follows the input semantic table in memory.
|
||
|
||
@return A pointer to this shader's export semantic table.
|
||
*/
|
||
const VertexExportSemantic *getExportSemanticTable() const { return (const VertexExportSemantic *)(getInputSemanticTable() + m_numInputSemantics); }
|
||
|
||
/** @brief Computes the total size (in bytes) of the shader binary including this structure, the input usage table, and the input and export semantic tables.
|
||
|
||
@return The total size in bytes of this shader binary and its associated tables.
|
||
*/
|
||
uint32_t computeSize() const
|
||
{
|
||
const uint32_t size = sizeof(VsShader) +
|
||
sizeof(InputUsageSlot) * m_common.m_numInputUsageSlots +
|
||
sizeof(VertexInputSemantic) * m_numInputSemantics +
|
||
sizeof(VertexExportSemantic) * m_numExportSemantics;
|
||
|
||
return (size + 3) & ~3U;
|
||
}
|
||
/** @brief Gets the user register that contains the vertex offset.
|
||
|
||
@return The index of the register containing the vertex offset. A value of 0 indicates no register contains the vertex offset.
|
||
*/
|
||
uint8_t getVertexOffsetUserRegister() const
|
||
{
|
||
return m_fetchControl & 0xf;
|
||
}
|
||
/** @brief Gets the user register that contains the instance offset.
|
||
|
||
@return The index of the register containing the instance offset. A value of 0 indicates no register contains the instance offset.
|
||
*/
|
||
uint8_t getInstanceOffsetUserRegister() const
|
||
{
|
||
return (m_fetchControl >> 4) & 0xf;
|
||
}
|
||
};
|
||
|
||
|
||
class PsShader
|
||
{
|
||
public:
|
||
ShaderCommonData m_common; ///< The common data for all shader stages.
|
||
|
||
PsStageRegisters m_psStageRegisters; ///< The data to be loaded into the PS shader stage registers. Please see Gnm::DrawCommandBuffer::setPsShader() for more details.
|
||
|
||
uint8_t m_numInputSemantics; ///< The number of entries in the input semantic table.
|
||
uint8_t m_reserved[3]; ///< Unused
|
||
|
||
/** @brief Patches the GPU address of the shader code.
|
||
|
||
@param[in] gpuAddress The address to patch. This must be aligned to a 256-byte boundary.
|
||
*/
|
||
void patchShaderGpuAddress(void *gpuAddress)
|
||
{
|
||
m_psStageRegisters.patchShaderGpuAddress(gpuAddress);
|
||
}
|
||
|
||
/** @brief Retrieves the GPU address of the shader code.
|
||
|
||
@return The address of the shader code.
|
||
*/
|
||
void *getBaseAddress() const
|
||
{
|
||
return (void *)((((uintptr_t)m_psStageRegisters.m_spiShaderPgmHiPs) << 40) | (((uintptr_t)m_psStageRegisters.m_spiShaderPgmLoPs) << 8));
|
||
}
|
||
|
||
/** @brief Gets a pointer to this shader's input usage slot table that immediately follows this shader's structure in memory.
|
||
|
||
@return A pointer to this shader's input usage slot table.
|
||
*/
|
||
const InputUsageSlot *getInputUsageSlotTable() const { return (const InputUsageSlot *)(this + 1); }
|
||
|
||
/** @brief Gets a pointer to this shader's input semantic table that immediately follows the input usage table in memory.
|
||
|
||
@return A pointer to this shader's input semantic table.
|
||
*/
|
||
const PixelInputSemantic *getPixelInputSemanticTable() const { return (const PixelInputSemantic *)(getInputUsageSlotTable() + m_common.m_numInputUsageSlots); }
|
||
|
||
/** @brief Computes the total size (in bytes) of the shader binary including this structure, the input usage table and the input semantic table.
|
||
|
||
@return The total size in bytes of this shader binary and its associated tables.
|
||
*/
|
||
uint32_t computeSize() const
|
||
{
|
||
const uint32_t size = sizeof(PsShader) +
|
||
sizeof(InputUsageSlot) * m_common.m_numInputUsageSlots +
|
||
sizeof(PixelInputSemantic) * m_numInputSemantics;
|
||
|
||
return (size + 3) & ~3U;
|
||
}
|
||
};
|
||
|
||
/// Fills @p shaderInfo from a raw shader file image.
///
/// The image layout walked here is:
///   [Header][ShaderFileHeader][ShaderCommonData][uint32 code offset]...
/// where the first dword after ShaderCommonData holds the byte offset of the
/// GPU code relative to the ShaderCommonData block.
///
/// @param[out] shaderInfo Receives the shader struct pointer, code pointer
///             and code size.
/// @param[in]  data       Start of the shader file image.
void parseShader(ShaderInfo *shaderInfo, const void *data)
{
	const Header *fileHeader             = reinterpret_cast<const Header *>(data);
	const ShaderFileHeader *shaderHeader = reinterpret_cast<const ShaderFileHeader *>(fileHeader + 1);
	const ShaderCommonData *common       = reinterpret_cast<const ShaderCommonData *>(shaderHeader + 1);
	const uint32_t *codeOffsetPtr        = reinterpret_cast<const uint32_t *>(common + 1);

	// Convert the byte offset to a dword offset before indexing.
	const uint32_t codeOffsetInDW = codeOffsetPtr[0] >> 2;

	shaderInfo->m_shaderStruct      = (void *)common;
	shaderInfo->m_gpuShaderCode     = (uint32_t *)common + codeOffsetInDW;
	shaderInfo->m_gpuShaderCodeSize = common->computeShaderCodeSizeInBytes();
}
|
||
|
||
/// Scans forward from @p code for the "OrbShdr" signature that marks the
/// embedded ShaderBinaryInfo footer.
///
/// @param[in] code         Start of the shader binary to scan.
/// @param[in] maxScanBytes Size in bytes of the readable window at @p code.
///                         Defaults to 0xFFFFFFFF for backward compatibility
///                         with the previous effectively-unbounded scan;
///                         callers should pass the real buffer size so the
///                         scan cannot read out of bounds.
/// @return Pointer to the ShaderBinaryInfo inside @p code, or nullptr if the
///         signature is not found within the window.
ShaderBinaryInfo *findShaderBinInfo(uint8_t *code, size_t maxScanBytes = 0xFFFFFFFF)
{
	static const char kSignature[]  = "OrbShdr";
	const size_t      kSignatureLen = 7;

	// Bound the loop so the full 7-byte compare stays inside the window
	// (the old code compared at every i < 0xFFFFFFFF, reading past the end).
	for (size_t i = 0; i + kSignatureLen <= maxScanBytes; i++)
	{
		if (std::memcmp(&code[i], kSignature, kSignatureLen) == 0)
		{
			return (ShaderBinaryInfo *)&code[i];
		}
	}
	return nullptr;
}
|
||
|
||
/// Builds an InputResourceOffsets table from a shader's embedded
/// ShaderBinaryInfo footer: walks the input usage slot table that precedes
/// the footer's chunk-usage masks and records, per usage type, which SGPR
/// (or extended-user-data dword offset) each resource binding occupies.
///
/// @param[out] outTable Table to populate; fully overwritten.
/// @param[in]  sb       ShaderBinaryInfo footer located inside the shader
///                      binary (see findShaderBinInfo).
void generateInputResourceOffsetTable(InputResourceOffsets* outTable, ShaderBinaryInfo* sb)
{
	SCE_GNM_ASSERT(outTable != NULL);
	//SCE_GNM_ASSERT(shaderStage <= kShaderStageCount);

	// Get resource info to populate ShaderResourceOffsets
	//ShaderBinaryInfo const *shaderBinaryInfo = (ShaderBinaryInfo const*)((uintptr_t)shaderCode + shaderCodeSizeInBytes - sizeof(ShaderBinaryInfo));
	ShaderBinaryInfo const *shaderBinaryInfo = sb;
	//SCE_GNM_ASSERT((*(reinterpret_cast<uint64_t const*>(shaderBinaryInfo->m_signature)) & kShaderBinaryInfoSignatureMask) == kShaderBinaryInfoSignatureU64);

	// Get usage masks and input usage slots. Both tables sit immediately
	// BEFORE the footer: [input usage slots][chunk usage masks][footer].
	uint32_t const* usageMasks = reinterpret_cast<unsigned int const*>((unsigned char const*)shaderBinaryInfo - shaderBinaryInfo->m_chunkUsageBaseOffsetInDW * 4);
	int32_t inputUsageSlotsCount = shaderBinaryInfo->m_numInputUsageSlots;
	InputUsageSlot const* inputUsageSlots = (InputUsageSlot const*)usageMasks - inputUsageSlotsCount;

	// Cache shader input information into the ShaderResource Offsets table.
	// 0xFF-fill marks every slot "unused" before the passes below.
	__builtin_memset(outTable, 0xFF, sizeof(InputResourceOffsets));
	outTable->initSupportedResourceCounts();
	//outTable->shaderStage = isDispatchDraw && shaderStage == kShaderStageCs ? kShaderStageAsynchronousCompute : shaderStage;
	//outTable->isSrtShader = isSrtUsed;
	int32_t lastUserDataResourceSizeInDwords = 0;
	uint16_t requiredMemorySizeInDwords = 0;

	// Here we handle all immediate resources s[1:16] plus s[16:48] (extended user data)
	// resources that go into the extended user data also have "immediate" usage type, although they are stored in a table (not loaded by the SPI)
	for (int32_t i = 0; i < inputUsageSlotsCount; ++i)
	{
		uint8_t apiSlot = inputUsageSlots[i].m_apiSlot;
		uint8_t startRegister = inputUsageSlots[i].m_startRegister;
		bool isVSharp = (inputUsageSlots[i].m_resourceType == 0);
		uint16_t vsharpFlag = (isVSharp) ? kResourceIsVSharp : 0;

		// Registers at/above kMaxUserDataCount live in the extended user
		// data table; track the highest offset seen so far.
		uint16_t extendedRegisterOffsetInDwords = (startRegister >= kMaxUserDataCount) ?
			(startRegister - kMaxUserDataCount) : 0;
		requiredMemorySizeInDwords = (requiredMemorySizeInDwords > extendedRegisterOffsetInDwords) ?
			requiredMemorySizeInDwords : extendedRegisterOffsetInDwords;

		// Handle immediate resources, including some pointer types
		switch (inputUsageSlots[i].m_usageType)
		{
		case kShaderInputUsageImmGdsCounterRange:
			outTable->appendConsumeCounterSgpr = startRegister;
			break;

		case kShaderInputUsageImmGdsMemoryRange:
			outTable->gdsMemoryRangeSgpr = startRegister;
			break;

		case kShaderInputUsageImmLdsEsGsSize:
			outTable->ldsEsGsSizeSgpr = startRegister;
			break;

		case kShaderInputUsageSubPtrFetchShader:
			SCE_GNM_ASSERT(apiSlot == 0);
			outTable->fetchShaderPtrSgpr = startRegister;
			break;

		case kShaderInputUsagePtrInternalGlobalTable:
			SCE_GNM_ASSERT(apiSlot == 0);
			outTable->globalInternalPtrSgpr = startRegister;
			break;

		case kShaderInputUsagePtrExtendedUserData:
			SCE_GNM_ASSERT(apiSlot == 1);
			outTable->userExtendedData1PtrSgpr = startRegister;
			break;

		case kShaderInputUsageImmGdsKickRingBufferOffset:
			SCE_GNM_ASSERT(apiSlot == 0);
			SCE_GNM_ASSERT(startRegister < kMaxUserDataCount);
			outTable->gdsKickRingBufferOffsetSgpr = startRegister;
			break;

		case kShaderInputUsageImmVertexRingBufferOffset:
			SCE_GNM_ASSERT(apiSlot == 0);
			SCE_GNM_ASSERT(startRegister < kMaxUserDataCount);
			outTable->vertexRingBufferOffsetSgpr = startRegister;
			break;

		case kShaderInputUsagePtrDispatchDraw:
			SCE_GNM_ASSERT(apiSlot == 0);
			outTable->dispatchDrawPtrSgpr = startRegister;
			break;

		case kShaderInputUsageImmDispatchDrawInstances:
			SCE_GNM_ASSERT(apiSlot == 0);
			SCE_GNM_ASSERT(startRegister < kMaxUserDataCount);
			outTable->dispatchDrawInstancesSgpr = startRegister;
			break;

		// below resources can either be inside UserData or the EUD
		case kShaderInputUsageImmResource:
			SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->resourceSlotCount);
			outTable->resourceDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
				(kResourceInUserDataSgpr | vsharpFlag | startRegister) : (vsharpFlag | extendedRegisterOffsetInDwords);
			lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeResource;
			break;

		case kShaderInputUsageImmRwResource:
			SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->rwResourceSlotCount);
			outTable->rwResourceDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
				(kResourceInUserDataSgpr | vsharpFlag | startRegister) : (vsharpFlag | extendedRegisterOffsetInDwords);
			lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeRwResource;
			break;

		case kShaderInputUsageImmSampler:
			SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->samplerSlotCount);
			outTable->samplerDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
				(kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords;
			lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeSampler;
			break;

		case kShaderInputUsageImmConstBuffer:
			SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->constBufferSlotCount);
			outTable->constBufferDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
				(kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords;
			lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeConstantBuffer;
			break;

		case kShaderInputUsageImmVertexBuffer:
			SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->vertexBufferSlotCount);
			outTable->vertexBufferDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
				(kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords;
			lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeVertexBuffer;
			break;

		// SRTs will always reside inside the Imm UserData (dwords 0-15), as opposed to the
		// above resources which can exist in the EUD
		case kShaderInputUsageImmShaderResourceTable:
			SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < kMaxUserDataCount);
			outTable->userSrtDataSgpr = inputUsageSlots[i].m_startRegister;
			outTable->userSrtDataCount = inputUsageSlots[i].m_srtSizeInDWordMinusOne + 1;
			break;

		// case kShaderInputUsagePtrSoBufferTable: // Only present in the VS copy-shader that doesn't have a footer
		//	outTable->streamOutPtrSgpr = startRegister;
		//	break;
		}
	}
	// Only the disabled resource-table pass below consumes this; keep the
	// bookkeeping above but silence the unused-variable warning for now.
	(void)lastUserDataResourceSizeInDwords;
	/*
	// Make sure we can fit a T# (if required) in the last userOffset
	requiredMemorySizeInDwords += lastUserDataResourceSizeInDwords;

	// Now handle only pointers to resource-tables. Items handled below cannot be found more than once
#if SCE_GNM_LCUE_USE_VERTEX_BUFFER_TABLE_MASK_IF_AVAILABLE
	// Note: in order to maintain binary compatibility, we can only put a new chunk mask for kShaderInputUsagePtrVertexBufferTable at the end of all other chunk masks
	bool bUseVertexBufferTableChunkMask = false;
#endif

	for (int32_t i = 0; i < inputUsageSlotsCount; ++i)
	{
		uint8_t maskChunks = inputUsageSlots[i].m_chunkMask;

		const uint64_t kNibbleToCount = 0x4332322132212110ull;
		uint8_t chunksCount = (kNibbleToCount >> ((maskChunks & 0xF) * 4)) & 0xF;
		SCE_GNM_ASSERT(usageMasks + chunksCount <= (uint32_t const*)shaderBinaryInfo);

		// Lets fill the resource indices first
		uint32_t usedApiSlots[kSlotCountResource]; // Use the size of the biggest resource table
		uint32_t usedApiSlotCount;

		// This thing will break if there's more than 1 table for any resource type
		uint8_t startRegister = inputUsageSlots[i].m_startRegister;

		switch (inputUsageSlots[i].m_usageType)
		{
		case kShaderInputUsagePtrResourceTable:
		{
			SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
			outTable->resourcePtrSgpr = startRegister;
			outTable->resourceArrayDwOffset = requiredMemorySizeInDwords;

			if (!(maskChunks & 0xF))
				break;

			SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);
			uint32_t maskArray[4] = { 0, 0, 0, 0 }; // Max 128 slots are supported in the kShaderInputUsagePtrResourceTable
			if (maskChunks & 1) maskArray[0] = *usageMasks++; // get slots 0-31 which are set in Chunk 0
			if (maskChunks & 2) maskArray[1] = *usageMasks++; // get slots 32-63 which are set in Chunk 1
			if (maskChunks & 4) maskArray[2] = *usageMasks++; // get slots 64-95 which are set in Chunk 2
			if (maskChunks & 8) maskArray[3] = *usageMasks++; // get slots 96-127 which are set in Chunk 3
			SCE_GNM_ASSERT(usageMasks <= (uint32_t const*)shaderBinaryInfo);

			usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->resourceSlotCount, maskArray, kSlotCountResource);
			SCE_GNM_ASSERT(usedApiSlotCount > 0);

			uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
			for (uint8_t j = 0; j < usedApiSlotCount; j++)
			{
				uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
				outTable->resourceDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeResource;
			}
			requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeResource;
		}
		break;

		case kShaderInputUsagePtrRwResourceTable:
		{
			SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
			outTable->rwResourcePtrSgpr = startRegister;
			outTable->rwResourceArrayDwOffset = requiredMemorySizeInDwords;

			if (!(maskChunks & 1))
				break;
			SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);

			usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->rwResourceSlotCount, *usageMasks++, kSlotCountRwResource);
			SCE_GNM_ASSERT(usedApiSlotCount > 0);

			uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
			for (uint8_t j = 0; j < usedApiSlotCount; j++)
			{
				uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
				outTable->rwResourceDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeRwResource;
			}
			requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeRwResource;
		}
		break;

		case kShaderInputUsagePtrConstBufferTable:
		{
			SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
			outTable->constBufferPtrSgpr = startRegister;
			outTable->constBufferArrayDwOffset = requiredMemorySizeInDwords;

			if (!(maskChunks & 1))
				break;
			SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);

			usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->constBufferSlotCount, *usageMasks++, kSlotCountConstantBuffer);
			SCE_GNM_ASSERT(usedApiSlotCount > 0);

			uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
			for (uint8_t j = 0; j < usedApiSlotCount; j++)
			{
				uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
				outTable->constBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeConstantBuffer;
			}
			requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeConstantBuffer;
		}
		break;

		case kShaderInputUsagePtrSamplerTable:
		{
			SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
			outTable->samplerPtrSgpr = startRegister;
			outTable->samplerArrayDwOffset = requiredMemorySizeInDwords;

			if (!(maskChunks & 1))
				break;
			SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);

			usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->samplerSlotCount, *usageMasks++, kSlotCountSampler);
			SCE_GNM_ASSERT(usedApiSlotCount > 0);

			uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
			for (uint8_t j = 0; j < usedApiSlotCount; j++)
			{
				uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
				outTable->samplerDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeSampler;
			}
			requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeSampler;
		}
		break;

		case kShaderInputUsagePtrVertexBufferTable:
		{
			SCE_GNM_ASSERT(shaderStage == kShaderStageLs || shaderStage == kShaderStageEs || shaderStage == kShaderStageVs || shaderStage == kShaderStageCs);
			SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
			outTable->vertexBufferPtrSgpr = startRegister;
			outTable->vertexBufferArrayDwOffset = requiredMemorySizeInDwords;

#if SCE_GNM_LCUE_USE_VERTEX_BUFFER_TABLE_MASK_IF_AVAILABLE
			if (maskChunks & 1)
			{
				// Skip updating for the vertex buffer table below, since we are using the chunk mask at the end
				// we'll update it after everything else.
				bUseVertexBufferTableChunkMask = true;
				continue;
			}
#endif
			SCE_GNM_ASSERT(gnmxShaderStruct != NULL);
			const VertexInputSemantic* semanticTable = NULL;
			usedApiSlotCount = 0;
			if (shaderStage == kShaderStageVs && isDispatchDraw)
			{
				VsShader const* pVsShader = ((CsVsShader*)gnmxShaderStruct)->getVertexShader();
				usedApiSlotCount = pVsShader->m_numInputSemantics;
				semanticTable = pVsShader->getInputSemanticTable();
			}
			else if (shaderStage == kShaderStageVs)
			{
				usedApiSlotCount = ((VsShader*)gnmxShaderStruct)->m_numInputSemantics;
				semanticTable = ((VsShader*)gnmxShaderStruct)->getInputSemanticTable();
			}
			else if (shaderStage == kShaderStageLs)
			{
				usedApiSlotCount = ((LsShader*)gnmxShaderStruct)->m_numInputSemantics;
				semanticTable = ((LsShader*)gnmxShaderStruct)->getInputSemanticTable();
			}
			else if (shaderStage == kShaderStageEs)
			{
				usedApiSlotCount = ((EsShader*)gnmxShaderStruct)->m_numInputSemantics;
				semanticTable = ((EsShader*)gnmxShaderStruct)->getInputSemanticTable();
			}
			if (shaderStage == kShaderStageCs && isDispatchDraw)
			{
				VsShader const* pVsShader = ((CsVsShader*)gnmxShaderStruct)->getVertexShader();
				CsShader const* pCsShader = ((CsVsShader*)gnmxShaderStruct)->getComputeShader();
				if (pCsShader->m_version >= kDdCsShaderVersion_IndependentCsFetchShader)
				{
					usedApiSlotCount = pCsShader->m_numInputSemantics;
					semanticTable = pCsShader->getInputSemanticTable();
				}
				else
				{
					usedApiSlotCount = pVsShader->m_gsModeOrNumInputSemanticsCs;
					semanticTable = pVsShader->getInputSemanticTable();
				}
			}
			else if (shaderStage == kShaderStageCs)
			{
				CsShader const* pCsShader = ((CsShader*)gnmxShaderStruct);
				if (pCsShader->m_version >= kDdCsShaderVersion_IndependentCsFetchShader)
				{
					usedApiSlotCount = pCsShader->m_numInputSemantics;
					semanticTable = pCsShader->getInputSemanticTable();
				}
				else
				{
					usedApiSlotCount = 0;
					semanticTable = NULL;
				}
			}
			// Check if the shader uses any API-slot over the maximum count configured for the InputResourceOffset table
			SCE_GNM_ASSERT(usedApiSlotCount > 0 && usedApiSlotCount <= outTable->vertexBufferSlotCount);

			uint32_t usedApiSlots[kSlotCountVertexBuffer];

			// First use what the shader generated
			for (uint8_t i = 0; i < usedApiSlotCount; i++)
			{
				uint8_t semanticIndex = semanticTable[i].m_semantic;
				SCE_GNM_ASSERT(semanticIndex >= 0 && semanticIndex < outTable->vertexBufferSlotCount);
				usedApiSlots[i] = semanticIndex;
			}

			uint32_t firstUsedApiSlot = usedApiSlots[0];
			uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];

			// If a semanticRemapTable has been provided, override the shaders defined usage slots to conform with the remapped layout
			if (semanticRemapTable && numElementsInSemanticRemapTable != 0)
			{
				// Override values defined in the shader binary header above
				SCE_GNM_ASSERT(usedApiSlotCount <= numElementsInSemanticRemapTable);
				usedApiSlotCount = remapVertexBufferOffsetsWithSemanticTable(&usedApiSlots[0], firstUsedApiSlot, lastUsedApiSlot, outTable->vertexBufferSlotCount,
					semanticRemapTable, numElementsInSemanticRemapTable);
			}

			// Generate the final dword offsets for the vertex buffer table
			lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
			for (uint8_t j = 0; j < usedApiSlotCount; j++)
			{
				uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
				outTable->vertexBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeVertexBuffer;
			}
			requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeVertexBuffer;
		}
		break;
		}
	}
	*/

	// Note: this must be called after all other tables are processed above, as the vertex buffer table chunk mask (*usageMasks)
	// is always stored at the end of the chunk mask table
	/*
	if (bUseVertexBufferTableChunkMask)
	{
		uint32_t usedApiSlots[kSlotCountVertexBuffer];
		uint32_t usedApiSlotCount;

		SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);

		usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->vertexBufferSlotCount, *usageMasks++, kSlotCountVertexBuffer);
		SCE_GNM_ASSERT(usedApiSlotCount > 0);

		uint32_t firstUsedApiSlot = usedApiSlots[0];
		uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];

		// If a semanticRemapTable has been provided, override the shaders defined usage slots to conform with the remapped layout
		if (semanticRemapTable && numElementsInSemanticRemapTable != 0)
		{
			// Override values defined in the shader binary header above
			SCE_GNM_ASSERT((uint32_t)usedApiSlotCount <= numElementsInSemanticRemapTable);
			usedApiSlotCount = remapVertexBufferOffsetsWithSemanticTable(&usedApiSlots[0], firstUsedApiSlot, lastUsedApiSlot, outTable->vertexBufferSlotCount,
				semanticRemapTable, numElementsInSemanticRemapTable);
		}

		// Generate the final dword offsets for the vertex buffer table
		lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
		for (uint8_t j = 0; j < usedApiSlotCount; j++)
		{
			int32_t currentApiSlot = usedApiSlots[j];
			outTable->vertexBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeVertexBuffer;
		}
		requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeVertexBuffer;
	}
	*/

	// Final amount of memory the shader will use from the scratch and resource buffer
	outTable->requiredBufferSizeInDwords = requiredMemorySizeInDwords;

	// Checking for non handled input data
	for (int32_t i = 0; i < inputUsageSlotsCount; ++i)
	{
		switch (inputUsageSlots[i].m_usageType)
		{
		case kShaderInputUsageImmResource:
		case kShaderInputUsageImmRwResource:
		case kShaderInputUsageImmSampler:
		case kShaderInputUsageImmConstBuffer:
		case kShaderInputUsageImmVertexBuffer:
		case kShaderInputUsageImmShaderResourceTable:
		case kShaderInputUsageSubPtrFetchShader:
		case kShaderInputUsagePtrExtendedUserData:
		case kShaderInputUsagePtrResourceTable:
		case kShaderInputUsagePtrRwResourceTable:
		case kShaderInputUsagePtrConstBufferTable:
		case kShaderInputUsagePtrVertexBufferTable:
		case kShaderInputUsagePtrSamplerTable:
		case kShaderInputUsagePtrInternalGlobalTable:
		case kShaderInputUsageImmGdsCounterRange:
		case kShaderInputUsageImmGdsMemoryRange:
		case kShaderInputUsageImmLdsEsGsSize:
		case kShaderInputUsageImmGdsKickRingBufferOffset:
		case kShaderInputUsageImmVertexRingBufferOffset:
		case kShaderInputUsagePtrDispatchDraw:
		case kShaderInputUsageImmDispatchDrawInstances:
		// case kShaderInputUsagePtrSoBufferTable: // Only present in the VS copy-shader that doesn't have a footer
			break;
		default:
			// Was a bare comma-expression no-op (a stripped error macro):
			// "Input Usage Slot type %d is not supported by LCUE
			// sce::generateInputResourceOffsetTable()". Make the failure
			// explicit instead of silently ignoring the slot.
			SCE_GNM_ASSERT(false && "Input Usage Slot type is not supported by LCUE sce::generateInputResourceOffsetTable()");
			break;
		}
	}
}
|
||
|
||
/// Fills @p fsbs with the sizing and register information needed to build a
/// VS fetch shader, derived from the SPI_SHADER_PGM_RSRC1/2_VS registers.
///
/// NOTE(review): this is decompiler (IDA) output — the register comments on
/// the locals refer to x86 registers from the original binary, and the bit
/// arithmetic is kept byte-for-byte. Field meanings below are inferred and
/// should be verified against the GNM documentation.
///
/// @param[out] fsbs                        Build state to populate.
/// @param[in]  pregs                       VS stage registers of the shader.
/// @param[in]  numInputs                   Number of vertex inputs.
/// @param[in]  instancingData              Optional instancing modes; may be NULL.
/// @param[in]  numElementsInInstancingData Element count of instancingData.
/// @param[in]  vertexBaseUserSgpr          User SGPR for the vertex base (0 = none).
/// @param[in]  instanceBaseUsgpr           User SGPR for the instance base (0 = none).
/// @return The value stored into fsbs->m_numElementsInInstancingData
///         (numElementsInInstancingData when instancingData != NULL, else 0).
int generateVsFetchShaderBuildState(FetchShaderBuildState *fsbs, const VsStageRegisters *pregs, uint32_t numInputs,
	const FetchShaderInstancingMode *instancingData, const uint32_t numElementsInInstancingData,
	uint8_t vertexBaseUserSgpr, uint8_t instanceBaseUsgpr)
{
	uint32_t spiShaderPgmRsrc2Vs; // ebp
	uint32_t spiShaderPgmRsrc1Vs; // er14
	uint32_t firstFreeSgpr; // ebp
	_BOOL4 v10; // ebx
	unsigned int vsharpSlotCount; // er15
	int v12; // er14
	int v13; // esi
	__int16 result; // ax
	bool noInstance; // zf
	signed int v16; // ecx

	spiShaderPgmRsrc1Vs = pregs->m_spiShaderPgmRsrc1Vs;
	spiShaderPgmRsrc2Vs = pregs->m_spiShaderPgmRsrc2Vs;

	// Sums the system-SGPR enable bits of RSRC2, then rounds the total down
	// to a multiple of 4 — presumably the first SGPR free for fetch-shader
	// use. TODO confirm bit assignments against SPI_SHADER_PGM_RSRC2_VS.
	firstFreeSgpr = ((((unsigned __int8)(spiShaderPgmRsrc2Vs >> 7) | (unsigned __int8)(spiShaderPgmRsrc2Vs >> 12)) & 1)
		+ ((spiShaderPgmRsrc2Vs >> 24) & 1)
		+ ((spiShaderPgmRsrc2Vs >> 12) & 1)
		+ ((spiShaderPgmRsrc2Vs >> 11) & 1)
		+ ((spiShaderPgmRsrc2Vs >> 10) & 1)
		+ ((spiShaderPgmRsrc2Vs >> 9) & 1)
		+ ((unsigned __int8)spiShaderPgmRsrc2Vs >> 8)
		+ ((spiShaderPgmRsrc2Vs >> 7) & 1)
		+ (spiShaderPgmRsrc2Vs & 1)
		+ ((spiShaderPgmRsrc2Vs >> 1) & 0x1F)
		+ 3) & 0xFFFFFFFC;

	// ((rsrc1 >> 3) & 0x78) + 8 looks like the allocated SGPR count
	// (SGPRS field scaled to registers); the difference from firstFreeSgpr,
	// in groups of 4, gives the V# slots; v10 bumps a zero result to 2.
	v10 = (((spiShaderPgmRsrc1Vs >> 3) & 0x78) + 8 - firstFreeSgpr) >> 2 == 0;
	vsharpSlotCount = ((((spiShaderPgmRsrc1Vs >> 3) & 0x78) + 8 - firstFreeSgpr) >> 2) + 2 * v10;
	v12 = spiShaderPgmRsrc1Vs & 0x3000000;
	v13 = (((_BYTE)v10 + (unsigned __int8)(pregs->m_spiShaderPgmRsrc1Vs >> 6)) & 0xF) << 6;
	// Fetch-shader buffer size in bytes; dword-based formula from the
	// decompiled binary, kept verbatim.
	fsbs->m_fetchShaderBufferSize = 4
		* ((numInputs + vsharpSlotCount - 1) / vsharpSlotCount
		+ (instanceBaseUsgpr != 0)
		+ numInputs
		- ((vertexBaseUserSgpr < 1u)
		- 1))
		+ 8 * numInputs
		+ 12;
	fsbs->m_fetchShaderFlags = 0;
	fsbs->m_firstFreeSgpr = firstFreeSgpr;
	fsbs->m_vsharpSlotCount = vsharpSlotCount;
	fsbs->m_fetchShaderInstancingData = instancingData;
	result = 0;
	noInstance = instancingData == 0LL;
	if (instancingData)
		result = numElementsInInstancingData;
	// Shader modifier: force the 0x3000000 bits of RSRC1 when instancing
	// data is present, otherwise pass through the shader's own bits.
	v16 = 0x3000000;
	if (noInstance)
		v16 = v12;
	fsbs->m_numElementsInInstancingData = result;
	fsbs->m_vertexBaseUsgpr = vertexBaseUserSgpr;
	fsbs->m_instanceBaseUsgpr = instanceBaseUsgpr;
	fsbs->m_reserved1 = 0;
	fsbs->m_reserved2 = 0;
	fsbs->m_reserved0 = 0;
	fsbs->m_shaderModifier = v13 | v16;
	return result;
}
|
||
|
||
|
||
// Emits the GCN fetch-shader machine code for a vertex shader into 'fs'.
// Decompiled from libSceGnm: vNN names, aliased copies of state (fsbs_bak,
// vsharpSlotCount_, ...) and the exact statement order are the decompiler's
// rendering of the optimized binary and are preserved verbatim.
//
// Structure visible in the code:
//  1. scan the input-usage slots for the vertex-buffer-table (V# array)
//     start SGPR;
//  2. optionally emit one dword for the vertex-base and one for the
//     instance-base user SGPR (opcode bases 0x4A000000 / 0x4A060600 —
//     GCN scalar-ALU encodings, presumably s_add-style; confirm vs ISA);
//  3. for each batch of up to vsharpSlotCount semantics: emit one
//     0xC0800100-based dword per semantic (scalar V# loads — looks like the
//     SMRD s_buffer_load_dwordx4 encoding, confirm), then 0xBF8C007F
//     (s_waitcnt-style), then two dwords per semantic (0xE0002000-based
//     word + operand word — looks like MTBUF buffer_load_format, confirm);
//  4. epilogue: 0xBF8C0000 (wait for all counters), 0xBE802000
//     (s_setpc-style return to the caller shader — confirm), and a final
//     dword holding the semantic count.
//
// Returns 0 unconditionally.
int generateFetchShader(uint32_t *fs, FetchShaderBuildState *fsbs)
{
	FetchShaderBuildState *fsbs_; // r11
	__int64 *v3; // rsi
	char vbtStartRegister; // r14
	uint16_t v5; // ax
	const InputUsageSlot *inputUsageSlot; // rcx
	unsigned __int64 i; // rdx
	signed __int64 idx; // r9
	unsigned __int64 inputSemaCount; // rdi
	uint8_t vsharpSlotCount; // bl
	_DWORD *pEleNumInRemapTable; // r15
	__int64 v12; // rdx
	unsigned int k; // er13
	int shiftVbtStartRegister; // er14
	unsigned __int64 m; // r10
	unsigned int n; // er8
	const uint32_t *semanticsRemapTable; // rbx
	int v18; // er12
	int semanticIdx; // edx
	__int64 j; // rax
	unsigned int firstFreeSgprAndVsharp; // eax
	char v22; // cl
	int v23; // edx
	__int64 v24; // rcx
	__int64 v25; // rax
	unsigned int v26; // esi
	__int64 v27; // rcx
	const FetchShaderInstancingMode *fetchShaderInstancingData; // r12
	const VertexInputSemantic *inputSemantics; // rdx
	__int64 t; // r14
	int v31; // er11
	signed __int64 pSizeInElements; // r8
	unsigned int v33; // edx
	unsigned int v34; // er13
	unsigned int v35; // er15
	int vgpr; // edi
	int sizeInElmt; // esi
	unsigned int v38; // ebx
	unsigned int v39; // ecx
	__int64 v40; // rbx
	__int64 v41; // rdi
	__int64 v42; // rcx
	__int64 v43; // rax
	__int64 v44; // r9
	unsigned __int64 numElementsInInstancingData; // [rsp+8h] [rbp-98h]
	__int64 v47; // [rsp+10h] [rbp-90h]
	signed __int64 v48; // [rsp+18h] [rbp-88h]
	__int64 v49; // [rsp+20h] [rbp-80h]
	FetchShaderBuildState *fsbs_bak; // [rsp+28h] [rbp-78h]
	unsigned __int64 numInputSemantics; // [rsp+30h] [rbp-70h]
	signed __int64 pEleNumInRemapTable_; // [rsp+38h] [rbp-68h]
	int v53; // [rsp+44h] [rbp-5Ch]
	int v54; // [rsp+48h] [rbp-58h]
	int shiftVbtStartRegister_; // [rsp+4Ch] [rbp-54h]
	uint32_t *fs_dst; // [rsp+50h] [rbp-50h]
	uint8_t vsharpSlotCount_; // [rsp+5Fh] [rbp-41h]
	__int64 v58; // [rsp+60h] [rbp-40h]
	int v59; // [rsp+68h] [rbp-38h]
	int v60; // [rsp+6Ch] [rbp-34h]
	__int64 v61; // [rsp+70h] [rbp-30h]

	fsbs_ = fsbs;
	//v3 = (__int64 *)loc_96B;
	fs_dst = fs;
	vbtStartRegister = -1; // -1 = "no vertex buffer table slot found"
	//v61 = *loc_96B;
	v5 = fsbs_->m_fetchShaderFlags;
	// v58/v59/v60 form a small byte table indexed via
	// (unsigned __int8 *)&v58 + 4*v35 below; the instancing mode selects
	// which byte (0, 3, flag-dependent, 2) feeds the operand word.
	v58 = 0x300000000LL;
	v59 = (v5 & 1) + 1;
	v60 = 2;

	// Step 1: locate the V# table start register among the usage slots.
	if (fsbs_->m_numInputUsageSlots)
	{
		inputUsageSlot = fsbs_->m_inputUsageSlots;
		vbtStartRegister = -1;
		i = 0LL;
		do
		{
			if (inputUsageSlot[i].m_usageType == kShaderInputUsagePtrVertexBufferTable)
				vbtStartRegister = inputUsageSlot[i].m_startRegister;
			++i;
		} while (i < fsbs_->m_numInputUsageSlots);
	}

	idx = 0LL; // output dword index into fs

	// Step 2: optional base-register moves.
	if (fsbs_->m_vertexBaseUsgpr)
	{
		fs[idx] = fsbs_->m_vertexBaseUsgpr | 0x4A000000;
		++idx;
	}

	if (fsbs_->m_instanceBaseUsgpr)
	{
		fs[idx] = fsbs_->m_instanceBaseUsgpr | 0x4A060600;
		++idx;
	}

	inputSemaCount = fsbs_->m_numInputSemantics;
	if (fsbs_->m_numInputSemantics)
	{
		vsharpSlotCount = fsbs_->m_vsharpSlotCount;
		pEleNumInRemapTable = &fsbs_->m_numElementsInRemapTable;
		v12 = 0LL;
		k = 0;
		// The inner loops clobber these registers, so the originals are
		// parked in stack copies and restored after each batch.
		fsbs_bak = fsbs_;
		numInputSemantics = fsbs_->m_numInputSemantics;
		shiftVbtStartRegister = (vbtStartRegister & 0x7E) << 8;
		pEleNumInRemapTable_ = (signed __int64)&fsbs_->m_numElementsInRemapTable;
		shiftVbtStartRegister_ = shiftVbtStartRegister;
		vsharpSlotCount_ = fsbs_->m_vsharpSlotCount;
		do
		{
			// Step 3a: scalar-load one V# per semantic in this batch
			// (at most vsharpSlotCount, i.e. firstFreeSgprAndVsharp >> 8).
			if (vsharpSlotCount)
			{
				m = (unsigned int)v12;
				n = 0;
				while (1)
				{
					v12 = (unsigned int)m;
					if (m >= inputSemaCount)
						break;
					semanticsRemapTable = fsbs_->m_semanticsRemapTable;
					v18 = m;
					semanticIdx = fsbs_->m_inputSemantics[m].m_semantic;
					// Optional remap: replace the semantic index by its
					// position in the remap table (linear search).
					if (semanticsRemapTable && *pEleNumInRemapTable)
					{
						j = 0LL;
						do
						{
							if (semanticsRemapTable[j] == semanticIdx)
								break;
							++j;
						} while ((unsigned int)j < *pEleNumInRemapTable);
						LOBYTE(semanticIdx) = j;
					}
					// 16-bit read spans m_firstFreeSgpr (low byte) and
					// m_vsharpSlotCount (high byte) — relies on the struct
					// laying these out adjacently.
					firstFreeSgprAndVsharp = *(unsigned __int16 *)&fsbs_->m_firstFreeSgpr;
					++m;
					// destination SGPR = firstFreeSgpr + 4*n (4 dwords per V#)
					v22 = firstFreeSgprAndVsharp + 4 * n++;
					v23 = ((v22 & 0x7F) << 15) | shiftVbtStartRegister | 4 * (semanticIdx & 0x3F);
					v24 = (unsigned int)idx;
					++idx;
					fs_dst[v24] = v23 | 0xC0800100;
					if (n >= firstFreeSgprAndVsharp >> 8)
					{
						v12 = (unsigned int)(v18 + 1);
						break;
					}
				}
				vsharpSlotCount = vsharpSlotCount_;
			}
			// Step 3b: wait for the scalar loads, then emit the per-semantic
			// vector fetches for the batch [k, v12).
			v25 = (unsigned int)idx;
			++idx;
			fs_dst[v25] = 0xBF8C007F;
			if ((unsigned int)v12 > k)
			{
				v26 = *(_DWORD *)&fsbs_->m_firstFreeSgpr;
				v27 = v12;
				fetchShaderInstancingData = fsbs_->m_fetchShaderInstancingData;
				inputSemantics = fsbs_->m_inputSemantics;
				t = 0LL;
				v49 = v27;
				v53 = 2 * k;
				v54 = 2 * v27;
				v47 = k;
				v31 = k - v27; // negative batch length; loop ends when v31 + t == 0
				pSizeInElements = (signed __int64)&inputSemantics[k].m_sizeInElements;
				v33 = idx;
				numElementsInInstancingData = v26 >> 16;
				v48 = (signed __int64)&fetchShaderInstancingData[k];
				v34 = (v26 & 0xFFFFFFFC) << 14; // SGPR base of the V# just loaded, pre-shifted
				do
				{
					// Instancing mode for this element (0 when out of range
					// or no instancing data).
					v35 = 0;
					if (fetchShaderInstancingData && v47 + t < numElementsInInstancingData)
						v35 = *(_DWORD *)(v48 + 4 * t);
					// Byte reads around m_sizeInElements: the byte just
					// before it is the destination VGPR (m_vgpr — relies on
					// VertexInputSemantic field layout).
					vgpr = *(unsigned __int8 *)(pSizeInElements + 4 * t - 1);
					sizeInElmt = *(unsigned __int8 *)(pSizeInElements + 4 * t);
					v38 = v34;
					++t;
					v34 += 0x10000; // next V# = +4 SGPRs in the srsrc field
					// Index source byte comes from the v58/v59/v60 table,
					// selected by the instancing mode.
					v39 = (v38 & 0x1F0000) + (*((unsigned __int8 *)&v58 + 4 * v35) | (vgpr << 8)) + 0x80000000;
					v40 = v33;
					v41 = v33 + 1;
					v33 += 2;
					fs_dst[v40] = ((sizeInElmt << 18) + 0x1FC0000) & 0x1FC0000 | 0xE0002000;
					fs_dst[v41] = v39;
				} while (v31 + (_DWORD)t);
				// Restore the batch-invariant state and advance idx by the
				// two dwords emitted per semantic.
				v12 = v49;
				fsbs_ = fsbs_bak;
				shiftVbtStartRegister = shiftVbtStartRegister_;
				inputSemaCount = numInputSemantics;
				pEleNumInRemapTable = (_DWORD *)pEleNumInRemapTable_;
				vsharpSlotCount = vsharpSlotCount_;
				LODWORD(idx) = v54 + idx - v53;
				k = v49;
			}
		} while (k < (unsigned int)inputSemaCount);
		//v3 = (__int64 *)loc_C1E;
	}
	else
	{
		pEleNumInRemapTable = &fsbs_->m_numElementsInRemapTable;
	}
	// Step 4: epilogue — final wait, return-to-caller word, and the count of
	// fetched semantics (remap-table size wins when a remap is present).
	v42 = (unsigned int)idx;
	v43 = (unsigned int)(idx + 1);
	v44 = (unsigned int)(idx + 2);
	fs_dst[v42] = 0xBF8C0000;
	fs_dst[v43] = 0xBE802000;
	if (*pEleNumInRemapTable)
		LODWORD(inputSemaCount) = *pEleNumInRemapTable;
	fs_dst[v44] = inputSemaCount;
	return 0;
}
|
||
|
||
// Returns the GPU hardware mode this tool generates code for.
// Hard-wired to 1 — presumably Gnm's NEO mode (base mode would be 0);
// TODO confirm against the real GpuMode enum. Used by
// generatePsShaderUsageTable() to gate the 16-bit-interpolant path.
int getGpuMode()
{
	const int kFixedGpuMode = 1;
	return kFixedGpuMode;
}
|
||
|
||
// Builds the pixel-shader input usage table: one packed uint32 per PS input
// semantic, produced by matching each PS input against the VS export
// semantics. Decompiled from libSceGnm; the bit packing below looks like the
// SPI_PS_INPUT_CNTL_* register layout (offset in low 5 bits, default-value
// flag at bit 5, flat-shade/f16 bits above) — confirm against GCN register
// docs before relying on specific fields.
//
// vNN names and the repeated partial stores to inputTable[idx] are the
// decompiler's rendering of the optimized binary and are kept verbatim.
void generatePsShaderUsageTable(uint32_t *inputTable, const VertexExportSemantic *vsTable, uint32_t vsTableNumItems, const PixelInputSemantic *psTable, uint32_t psTableNumItem)
{
	const PixelInputSemantic *psTab; // r12
	uint32_t numVsExportSemantics; // ebx
	__int64 idx; // r14
	int v8; // ecx
	char v9; // r8
	__int64 v10; // rax
	signed __int64 v11; // rax
	unsigned __int64 v12; // rax
	unsigned __int64 v13; // rcx
	unsigned __int64 v14; // rdx
	unsigned __int64 v15; // rax
	__int64 matchVsIdx; // r15
	char hasMatchedVsSema; // r13
	const PixelInputSemantic *psTab_; // r8
	unsigned int v19; // edx
	unsigned int v20; // ecx
	uint32_t numPsInputSemantics; // [rsp+4h] [rbp-3Ch]
	const VertexExportSemantic *vsTab; // [rsp+10h] [rbp-30h]

	psTab = psTable;
	numVsExportSemantics = vsTableNumItems;
	numPsInputSemantics = psTableNumItem;
	if (psTableNumItem)
	{
		idx = 0LL;
		vsTab = vsTable;
		do
		{
			// Linear search: find the VS export whose semantic matches this
			// PS input's semantic.
			LODWORD(matchVsIdx) = 0;
			hasMatchedVsSema = 0;
			if (numVsExportSemantics)
			{
				matchVsIdx = 0LL;
				while (vsTable[matchVsIdx].m_semantic != psTab[idx].m_semantic)
				{
					if ((unsigned int)++matchVsIdx >= numVsExportSemantics)
					{
						hasMatchedVsSema = 0;
						goto LABEL_14;
					}
				}
				hasMatchedVsSema = 1;
			}

		LABEL_14:
			// 16-bit interpolation path: only taken in GPU mode 1.
			if (getGpuMode() == 1 && psTab[idx].m_interpF16)
			{
				if (hasMatchedVsSema)
				{
					vsTable = vsTab;
					// Byte 1 of the matched VS export: low 5 bits are the
					// export offset, top bits are flags (relies on the
					// VertexExportSemantic bitfield layout).
					v9 = *((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) & 0x1F;
					v8 = (unsigned __int16)(psTab[idx].m_interpF16) & ~(unsigned __int8)(*((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) >> 6);
				}
				else
				{
					// No matching export: use the default-value slot (0x20).
					vsTable = vsTab;
					v8 = 0;
					v9 = 0x20;
				}
				// Accumulate the control word; the intermediate stores to
				// inputTable[idx] mirror the original codegen.
				inputTable[idx] = 0x80000;
				v10 = (*(_WORD *)&psTab[idx] << 10) & 0x1000000;
				inputTable[idx] = v10 + 0x80000;
				v11 = ((*(_WORD *)&psTab[idx] << 10) & 0x2000000) + v10 + 0x80000;
				inputTable[idx] = v11;
				// Branch on the top two bits of the packed PS semantic word.
				if (*(_WORD *)&psTab[idx] < 0xC000u)
				{
					v15 = ((unsigned __int8)v9 | (unsigned __int8)(32 * (v8 != 0))) & 0x3F | (unsigned __int64)v11;
					inputTable[idx] = v15;
					v14 = v15 | (((*(_WORD *)&psTab[idx] >> (4 * v8 & 4 ^ 0xC)) & 3LL) << 8);
				}
				else
				{
					v12 = ((unsigned __int8)v9 | (unsigned __int8)(32 * v8)) & 0x3F | (unsigned __int64)v11;
					inputTable[idx] = v12;
					v13 = v12 | ((unsigned __int64)(*((_BYTE *)&psTab[idx] + 1) & 3) << 8) | (v8 << 19) & 0x100000;
					inputTable[idx] = v13;
					v14 = v13 & 0xFFFFFFFFFF9FFFFFLL | (*(_WORD *)&psTab[idx] << 9) & 0x600000;
				}
				inputTable[idx] = v14;
			}
			else
			{
				// 32-bit interpolation path.
				inputTable[idx] = 0;
				if (hasMatchedVsSema)
				{
					vsTable = vsTab;
					psTab_ = &psTab[idx];
					v19 = *((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) & 0x1F | ((unsigned int)*(_WORD *)&psTab[idx] >> 7) & 0x20;
					inputTable[idx] = v19;
					v20 = v19 | (((((unsigned int)*(_WORD *)&psTab[idx] >> 10) | ((unsigned int)*(_WORD *)&psTab[idx] >> 12)) & 1) << 10);
				}
				else
				{
					// No matching export: default-value slot only.
					vsTable = vsTab;
					psTab_ = &psTab[idx];
					v20 = 32;
				}
				inputTable[idx] = v20;
				inputTable[idx] = v20 & 0xFFFFFCFF | ((*((_BYTE *)psTab_ + 1) & 3) << 8);
			}
			++idx;
		} while ((_DWORD)idx != numPsInputSemantics);
	}
}
|
||
|
||
// V# buffer resource descriptor ("vsharp") as the GCN hardware consumes it.
// Field order and widths define the in-memory layout — do not reorder.
// 128 bits
struct VSharpBuffer
{
	uint64_t base : 44; // base byte address (only 40 bits supported)
	uint64_t mtype_L1s : 2; // mtype for scalar L1
	uint64_t mtype_L2 : 2; // mtype for L2
	uint64_t stride : 14; // bytes: 0..16383
	uint64_t cache_swizzle : 1; // buffer access. optionally swizzle TC L1 cache banks
	uint64_t swizzle_en : 1; // swizzle AOS according to stride, index_stride, and element_size, else linear (stride * index + offset)

	uint32_t num_records; // in units of 'stride'

	// Destination channel select:
	// 0=0, 1=1, 4=R, 5=G, 6=B, 7=A
	uint32_t dst_sel_x : 3;
	uint32_t dst_sel_y : 3;
	uint32_t dst_sel_z : 3;
	uint32_t dst_sel_w : 3;

	uint32_t nfmt : 3; // numeric data type (float, int, etc.)
	uint32_t dfmt : 4; // # of fields, size of each field. Note: dfmt=0 (invalid) is a special case that will disable buffer access via vector memory ops.
	uint32_t element_size : 2; // 2, 4, 8, or 16 bytes. Used for swizzled buffer addressing
	uint32_t index_stride : 2; // 8, 16, 32, or 64. Used for swizzled buffer addressing
	uint32_t addtid_en : 1; // add thread id to the index for addr calc
	uint32_t reserved0 : 1;
	uint32_t hash_en : 1; // 1 = buffer addresses are hashed for better cache perf
	uint32_t reserved1 : 1;
	uint32_t mtype : 3; // mtype for L1
	uint32_t type : 2; // value == 0 for buf. Overlaps upper 2 bits of 4-bit TYPE field in 128-bit T# resource
};

// Alias used by the rest of this tool.
typedef VSharpBuffer GnmBuffer;
|
||
|
||
// T# Texture Descriptor Buffer
// Field order and widths define the in-memory layout — do not reorder.
// 256 bits
struct TSharpBuffer
{
	uint64_t baseaddr256 : 38; // base 256-byte aligned address bits [39:8] (top 6 bits are not used)
	uint64_t mtype_L2 : 2; // mtype for L2
	uint64_t min_lod : 12; // fixed point 4.8 minimum LOD (0.0..15.0)
	uint64_t dfmt : 6; // texture data format; num components, num bits
	uint64_t nfmt : 4; // texture numeric format; value conversion
	uint64_t mtype_lsbs : 2; // mtype for L1 (LSBs)

	uint64_t width : 14; // texture width (0..16383)
	uint64_t height : 14; // texture height (0..16383)

	// Specifies the scale factor applied to the perf_z, perf_mip,
	// aniso_bias, aniso_threshold, lod_bias_sec settings
	// specified in the associated S#:
	// 0=0/16, 1=2/16, 2=5/16, 3=7/16, 4=9/16, 5=11/16, 6=14/16, 7=16/16
	// The result after scaling is rounded down to the nearest
	// representable value for the given S# field.
	// (Note that perf_mod=0 effectively disables these S#
	// settings, while perf_mod=7 essentially eliminates the
	// dependency between the T# and S#.)
	uint64_t perf_mod : 3;

	uint64_t interlaced : 1; // texture is interlaced

	// Destination channel select:
	// 0=0, 1=1, 4=R, 5=G, 6=B, 7=A
	uint64_t dst_sel_x : 3;
	uint64_t dst_sel_y : 3;
	uint64_t dst_sel_z : 3;
	uint64_t dst_sel_w : 3;

	uint64_t base_level : 4;
	uint64_t last_level : 4;
	uint64_t tiling_idx : 5;
	uint64_t pow2pad : 1;
	uint64_t mtype_msb : 1;
	uint64_t reserved0 : 1;
	uint64_t type : 4;

	uint64_t depth : 13;
	uint64_t pitch : 14; // texture pitch in texels (0..16383); defaults to width
	uint64_t reserved1 : 5;
	uint64_t base_array : 13; // first array index (0..16383)
	uint64_t last_array : 13; // last array index (0..16383)
	uint64_t reserved2 : 6;

	uint64_t min_lod_warn : 12;
	uint64_t counter_bank_id : 8;
	uint64_t LOD_hdw_cnt_en : 1;
	uint64_t reserved3 : 43;
};

// Alias used by the rest of this tool.
typedef TSharpBuffer GnmTexture;
|
||
|
||
// S# Sampler Descriptor Buffer
// Field order and widths define the in-memory layout — do not reorder.
// 128 bits
struct SSharpBuffer
{
	uint64_t clamp_x : 3; // address clamp/wrap mode per axis
	uint64_t clamp_y : 3;
	uint64_t clamp_z : 3;
	uint64_t max_aniso_ratio : 3;
	uint64_t depth_compare_func : 3;
	uint64_t force_unorm_coords : 1;
	uint64_t aniso_threshold : 3;
	uint64_t mc_coord_trunc : 1;
	uint64_t force_degamma : 1;
	uint64_t aniso_bias : 6;
	uint64_t trunc_coord : 1;
	uint64_t disable_cube_wrap : 1;
	uint64_t filter_mode : 2;
	uint64_t reserved0 : 1;
	uint64_t min_lod : 12; // fixed-point LOD clamp range (presumably 4.8 like the T# min_lod — confirm)
	uint64_t max_lod : 12;
	uint64_t perf_mip : 4;
	uint64_t perf_z : 4;

	uint64_t lod_bias : 14;
	uint64_t lod_bias_sec : 6;
	uint64_t xy_mag_filter : 2; // magnification / minification filters
	uint64_t xy_min_filter : 2;
	uint64_t z_filter : 2;
	uint64_t mip_filter : 2;
	uint64_t reserved1 : 4;
	uint64_t border_color_ptr : 12;
	uint64_t reserved2 : 18;
	uint64_t border_color_type : 2;
};
|
||
|
||
|
||
|
||
int main(void)
|
||
{
|
||
|
||
ShaderInfo shaderInfo;
|
||
parseShader(&shaderInfo, s_vex_vv);
|
||
|
||
void *shaderBinary = malloc(shaderInfo.m_gpuShaderCodeSize);
|
||
void *shaderHeader = malloc(shaderInfo.m_vsShader->computeSize());
|
||
|
||
memcpy(shaderBinary, shaderInfo.m_gpuShaderCode, shaderInfo.m_gpuShaderCodeSize);
|
||
memcpy(shaderHeader, shaderInfo.m_vsShader, shaderInfo.m_vsShader->computeSize());
|
||
|
||
ShaderBinaryInfo* binInfo = findShaderBinInfo((uint8_t*)shaderBinary);
|
||
|
||
InputResourceOffsets table;
|
||
generateInputResourceOffsetTable(&table, binInfo);
|
||
|
||
size_t codeLength = binInfo->m_length + sizeof(ShaderBinaryInfo);
|
||
|
||
VsShader* m_shader = static_cast<VsShader*>(shaderHeader);
|
||
m_shader->patchShaderGpuAddress(shaderBinary);
|
||
|
||
VsShader* vertexShader = m_shader;
|
||
FetchShaderBuildState fb = { 0 };
|
||
generateVsFetchShaderBuildState(&fb, (const VsStageRegisters*)&vertexShader->m_vsStageRegisters, vertexShader->m_numInputSemantics, nullptr, 0, fb.m_vertexBaseUsgpr, fb.m_instanceBaseUsgpr);
|
||
|
||
const InputUsageSlot *inputUsageSlots = vertexShader->getInputUsageSlotTable();
|
||
fb.m_numInputSemantics = vertexShader->m_numInputSemantics;
|
||
fb.m_inputSemantics = vertexShader->getInputSemanticTable();
|
||
fb.m_numInputUsageSlots = vertexShader->m_common.m_numInputUsageSlots;
|
||
fb.m_inputUsageSlots = inputUsageSlots;
|
||
fb.m_numElementsInRemapTable = 0;
|
||
fb.m_semanticsRemapTable = 0;
|
||
|
||
uint32_t *fs = (uint32_t *)malloc(fb.m_fetchShaderBufferSize);
|
||
generateFetchShader(fs, &fb);
|
||
|
||
std::ofstream fout("fetch_shader.bin");
|
||
fout.write((char*)fs, fb.m_fetchShaderBufferSize);
|
||
fout.close();
|
||
|
||
const InputUsageSlot* slot = m_shader->getInputUsageSlotTable();
|
||
const VertexInputSemantic* inputSema = m_shader->getInputSemanticTable();
|
||
const VertexExportSemantic* expSema = m_shader->getExportSemanticTable();
|
||
|
||
//
|
||
//ShaderInfo psShaderInfo;
|
||
//parseShader(&psShaderInfo, s_pix_p);
|
||
|
||
//void *psshaderBinary = malloc(psShaderInfo.m_gpuShaderCodeSize);
|
||
//void *psshaderHeader = malloc(psShaderInfo.m_psShader->computeSize());
|
||
|
||
//memcpy(psshaderBinary, psShaderInfo.m_gpuS haderCode, psShaderInfo.m_gpuShaderCodeSize);
|
||
//memcpy(psshaderHeader, psShaderInfo.m_psShader, psShaderInfo.m_psShader->computeSize());
|
||
|
||
//ShaderBinaryInfo* psbinInfo = findShaderBinInfo((uint8_t*)psshaderBinary);
|
||
|
||
//InputResourceOffsets pstable;
|
||
//generateInputResourceOffsetTable(&pstable, psbinInfo);
|
||
|
||
//size_t pscodeLength = psbinInfo->m_length + sizeof(ShaderBinaryInfo);
|
||
|
||
//PsShader* m_psshader = static_cast<PsShader*>(psshaderHeader);
|
||
//m_psshader->patchShaderGpuAddress(psshaderBinary);
|
||
|
||
//uint32_t psInputs[32] = {0};
|
||
//generatePsShaderUsageTable(psInputs,
|
||
// vertexShader->getExportSemanticTable(), vertexShader->m_numExportSemantics,
|
||
// m_psshader->getPixelInputSemanticTable(), m_psshader->m_numInputSemantics);
|
||
//
|
||
return binInfo->m_isSrt;
|
||
} |