// Web-page residue captured with the file (not part of the original source):
// 2019-10-08 15:31:29 +08:00 | 1891 lines | 76 KiB | Raw Permalink Blame History

#include <cstdio>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <fstream>
// Alignment attribute wrapper (GCC/Clang attribute syntax).
#define SCE_GNM_SET_ALIGN(n) __attribute__((__aligned__(n)))
// Assertion stub: the expression is evaluated and its result discarded; it never traps.
#define SCE_GNM_ASSERT(x) ((void)(x))

// Short aliases for the built-in integer types.
typedef unsigned int uint;
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;
typedef char int8;
typedef signed char sint8;
typedef unsigned char uint8;
typedef short int16;
typedef signed short sint16;
typedef unsigned short uint16;
typedef int int32;
typedef signed int sint32;
typedef unsigned int uint32;

// Index of the last / highest / lowest part_type-sized element inside x.
// The "low" part is element 0, i.e. these helpers assume a little-endian layout.
#define LAST_IND(x,part_type) (sizeof(x)/sizeof(part_type) - 1)
#define HIGH_IND(x,part_type) LAST_IND(x,part_type)
#define LOW_IND(x,part_type) 0

// Partially defined types. They are used when the decompiler does not know
// anything about the type except its size.
#define _BYTE uint8
#define _WORD uint16
#define _DWORD uint32
// No `uint64` typedef is declared above, so map _QWORD onto the <cstdint> type.
#define _QWORD uint64_t
typedef int32_t _BOOL4;

// Access the n-th byte/word/dword of an lvalue x.
// `n` is parenthesized so that compound index expressions expand correctly.
#define BYTEn(x, n) (*((_BYTE*)&(x)+(n)))
#define WORDn(x, n) (*((_WORD*)&(x)+(n)))
#define DWORDn(x, n) (*((_DWORD*)&(x)+(n)))

// Lowest / highest part of an lvalue x.
#define LOBYTE(x) BYTEn(x,LOW_IND(x,_BYTE))
#define LOWORD(x) WORDn(x,LOW_IND(x,_WORD))
#define LODWORD(x) DWORDn(x,LOW_IND(x,_DWORD))
#define HIBYTE(x) BYTEn(x,HIGH_IND(x,_BYTE))
#define HIWORD(x) WORDn(x,HIGH_IND(x,_WORD))
// Shader binary words, pulled in from a header that contains only the
// initializer list. Swap the active #include to select another binary.
static const unsigned s_vex_vv[] = {
//#include "vex_vv.h"
//#include "test_vv.h"
#include "test.h"
};
// Disabled second table, kept for reference:
//static const unsigned s_pix_p[] = {
// #include "pix_p.h"
// #include "test_p.h"
//};
/// Descriptor record associated with a shader binary. It carries the
/// 'OrbShdr' signature, packed type/length bit-fields, usage-table locators,
/// the association hash and a CRC.
struct ShaderBinaryInfo
{
    uint8_t m_signature[7]; // 'OrbShdr'
    uint8_t m_version; // ShaderBinaryInfoVersion

    // The following five bit-fields add up to exactly 32 bits.
    unsigned int m_pssl_or_cg : 1; // 1 = PSSL / Cg, 0 = IL / shtb
    unsigned int m_cached : 1; // 1 = when compile, debugging source was cached. May only make sense for PSSL=1
    uint32_t m_type : 4; // See enum ShaderBinaryType
    uint32_t m_source_type : 2; // See enum ShaderSourceType
    unsigned int m_length : 24; // Binary code length (does not include this structure or any of its preceding associated tables)

    uint8_t m_chunkUsageBaseOffsetInDW; // in DW, which starts at ((uint32_t*)&ShaderBinaryInfo) - m_chunkUsageBaseOffsetInDW; max is currently 7 dwords (128 T# + 32 V# + 20 CB V# + 16 UAV T#/V#)
    uint8_t m_numInputUsageSlots; // Up to 16 user data reg slots + 128 extended user data dwords supported by CUE; up to 16 user data reg slots + 240 extended user data dwords supported by InputUsageSlot

    // The following four bit-fields add up to exactly 8 bits.
    uint8_t m_isSrt : 1; // 1 if this shader uses shader resource tables and has an SrtDef table embedded below the input usage table and any extended usage info
    uint8_t m_isSrtUsedInfoValid : 1; // 1 if SrtDef::m_isUsed=0 indicates an element is definitely unused; 0 if SrtDef::m_isUsed=0 indicates only that the element is not known to be used (m_isUsed=1 always indicates a resource is known to be used)
    uint8_t m_isExtendedUsageInfo : 1; // 1 if this shader has extended usage info for the InputUsage table embedded below the input usage table
    uint8_t m_reserved2 : 5; // For future use
    uint8_t m_reserved3; // For future use

    uint32_t m_shaderHash0; // Association hash first 4 bytes
    uint32_t m_shaderHash1; // Association hash second 4 bytes
    uint32_t m_crc32; // crc32 of shader + this struct, just up till this field
};
/// Enumerates the shader stages. kShaderStageCount follows the last stage and
/// therefore equals the number of stages (7).
typedef enum ShaderStage
{
    kShaderStageCs = 0x00000000, ///< Compute shader stage.
    kShaderStagePs = 0x00000001, ///< Pixel shader stage.
    kShaderStageVs = 0x00000002, ///< Vertex shader stage.
    kShaderStageGs = 0x00000003, ///< Geometry shader stage.
    kShaderStageEs = 0x00000004, ///< Export shader stage.
    kShaderStageHs = 0x00000005, ///< Hull shader stage.
    kShaderStageLs = 0x00000006, ///< LDS shader stage.
    kShaderStageCount ///< The number of shader stages.
} ShaderStage;
/// Size, in DWORDs, of each kind of shader constant / descriptor.
typedef enum ShaderConstantDwordSize
{
    kDwordSizeResource = 8, ///< <c>T#</c> or padded <c>V#</c>.
    kDwordSizeRwResource = 8, ///< <c>T#</c> or padded <c>V#</c>.
    kDwordSizeSampler = 4, ///< <c>S#</c>.
    kDwordSizeVertexBuffer = 4, ///< <c>V#</c>.
    kDwordSizeConstantBuffer = 4, ///< <c>V#</c>.
    kDwordSizeBoolConstant = 1, ///< 32 packed bits.
    kDwordSizeFloatConstant = 1, ///< IEEE single-precision float.
    kDwordSizeAppendConsumeCounterRange = 1, ///< Two 16-bit integers packed together.
    kDwordSizeStreamoutBuffer = 4, ///< <c>V#</c>.
    kDwordSizeExtendedUserData = 128, ///< Space for extra shader user data.
    kDwordSizeDispatchDrawData = 32, ///< Space for 32 <c>DWORD</c>s of DispatchDraw data.
    kDwordSizeGdsMemoryRange = 1, ///< Two 16-bit integers packed together.
} ShaderConstantDwordSize;
/// Selects which index a fetch shader uses to address its vertex data.
typedef enum FetchShaderInstancingMode
{
    kFetchShaderUseVertexIndex = 0x0, ///< No instancing; use Vertex Index. Default.
    kFetchShaderUseInstanceId = 0x1, ///< Use the instance ID to index the data.
    kFetchShaderUseInstanceIdOverStepRate0 = 0x2, ///< Use the (instance ID / step rate 0) to index the data.
    kFetchShaderUseInstanceIdOverStepRate1 = 0x3, ///< Use the (instance ID / step rate 1) to index the data.
} FetchShaderInstancingMode;
// ---- Maximum number of slots per resource kind ----
constexpr int32_t kMaxResourceCount        = 16;
constexpr int32_t kMaxRwResourceCount      = 16;
constexpr int32_t kMaxSamplerCount         = 16;
constexpr int32_t kMaxVertexBufferCount    = 16;
constexpr int32_t kMaxConstantBufferCount  = 20;
constexpr int32_t kMaxStreamOutBufferCount = 4;
/// PSSL compiler limit is 16; this count is not tracked by the InputResourceOffsets table.
constexpr int32_t kMaxUserDataCount        = 16;
/// PSSL compiler limit is 16; this count is not tracked by the InputResourceOffsets table.
constexpr int32_t kMaxSrtUserDataCount     = 16;
/// Maximum number of supported splits of the resource buffer per LCUE instance.
constexpr int32_t kMaxResourceBufferCount  = 4;
/// Maximum number of interpolants a PS stage can receive.
constexpr int32_t kMaxPsInputUsageCount    = 32;

// ---- Default SGPR assignments in PSSL ----
constexpr int32_t kDefaultFetchShaderPtrSgpr         = 0;
constexpr int32_t kDefaultVertexBufferTablePtrSgpr   = 2;
/// Lower priority than FetchPtr (the SGPR would be s[4:5], after FetchPtr and VbPtr).
constexpr int32_t kDefaultGlobalInternalTablePtrSgpr = 0;
/// Only used by the VS copy shader in the GS active stage.
constexpr int32_t kDefaultStreamOutTablePtrSgpr      = 2;
/// Only used by the VS copy shader in the GS active stage.
constexpr int32_t kDefaultVertexldsEsGsSizeSgpr      = 0;

// ---- Encoding of a 16-bit resource offset entry ----
/// Bit 15: the resource lives in a user-data SGPR.
constexpr int32_t kResourceInUserDataSgpr = 1 << 15;
/// Bit 14: the resource is a V# (only used/available for immediate resources).
constexpr int32_t kResourceIsVSharp       = 1 << 14;
/// Bits 13..0: the resource memory offset.
constexpr int32_t kResourceValueMask      = (1 << 14) - 1;
// ---- On-chip GS ----
constexpr uint32_t kOnChipGsInvalidSignature = 0xFFFFFFFFu;

// ---- Tessellation distribution (Neo only) ----
/// Tessellation Distribution mask for the HS shader stage (low 31 bits).
constexpr uint32_t kTesselationDistrbutionMask = 0x7FFFFFFFu;
/// Tessellation Distribution enabled flag for the HS shader stage (complement of the mask above).
constexpr uint32_t kTesselationDistrbutionEnabledMask = ~kTesselationDistrbutionMask;

// ---- Shader stages ----
/// Additional stage for the CsVs dispatch-draw shader; takes the index right after the regular stages.
constexpr uint32_t kShaderStageAsynchronousCompute = kShaderStageCount;
/// Number of unique shader stages for resource binding.
constexpr uint32_t kNumShaderStages = kShaderStageCount + 1;

// ---- Scratch buffer sizes, all counted in DWORDs ----
// 6KB is enough to store anything you can bind to a GPU shader stage.
/// Size of a single stage buffer.
constexpr int32_t kGpuStageBufferSizeInDwords = (6 * 1024) / sizeof(uint32_t);
/// Size of the compute scratch buffer.
constexpr int32_t kComputeScratchBufferSizeInDwords = kGpuStageBufferSizeInDwords;
/// Size of the graphics scratch buffer (encompasses all graphics shader stages).
constexpr int32_t kGraphicsScratchBufferSizeInDwords = kNumShaderStages * kGpuStageBufferSizeInDwords;
//const int32_t kGlobalInternalTableSizeInDwords = sce::kShaderGlobalResourceCount * sizeof(sce::Buffer) / sizeof(uint32_t); ///< Size of a global resource table

// ---- Internal constant buffers that are expected at fixed API slots ----
/// Immediate/embedded constant buffer fixed API slot (any GPU stage).
constexpr int32_t kConstantBufferInternalApiSlotForEmbeddedData = 15;
/// Slot 16 is reserved by the compiler.
constexpr int32_t kConstantBufferInternalApiSlotReserved0 = 16;
/// Slot 17 is reserved by the compiler.
constexpr int32_t kConstantBufferInternalApiSlotReserved1 = 17;
/// Slot 18 is reserved by the compiler.
constexpr int32_t kConstantBufferInternalApiSlotReserved2 = 18;
/// Tessellation constant buffer (with strides for LDS data) fixed API slot (HS, VS/ES GPU stages).
constexpr int32_t kConstantBufferInternalApiSlotForTessellation = 19;

// ---- ShaderBinaryInfo signature ----
/// Selects the low 7 bytes of a 64-bit word.
constexpr uint64_t kShaderBinaryInfoSignatureMask = 0x00FFFFFFFFFFFFFFULL;
/// The ASCII bytes "OrbShdr" read as a little-endian 64-bit word.
constexpr uint64_t kShaderBinaryInfoSignatureU64 = 0x007264685362724FULL;

// ---- DispatchDraw ----
/// Number of times the dispatch ring buffers can be rolled in a LCUE instance, see setupDispatchDrawRingBuffers().
constexpr uint32_t kNumDispatchDrawRingBuffersRolls = 16;
/// Table describing where each shader resource must be written: either a
/// user-data SGPR or a DWORD offset into a memory buffer.
struct InputResourceOffsets
{
    uint16_t requiredBufferSizeInDwords; ///< Specifies how much memory needs to be reserved to store all memory-based resources. These are things not set through PM4.
    bool isSrtShader; ///< A flag that specifies whether the shader makes use of SRTs.
    uint8_t shaderStage; ///< The shader stage (LS/HS/ES/GS/VS/PS) for the shader resources offsets.

    // For each available shader-resource-ptr, store the starting SGPR s[0:254] where it'll be set (<c>0xFF</c> means not used). Pointers take 2 SGPRs (64b) and must be 2DW aligned
    uint8_t fetchShaderPtrSgpr; ///< The SGPR containing the fetch shader pointer. If this exists, <c>s[0:1]</c> is always used.
    uint8_t vertexBufferPtrSgpr; ///< The SGPR containing the vertex buffer table pointer. If this exists, <c>s[2:3]</c> is always used, but only in the vertex pipeline.
    uint8_t streamOutPtrSgpr; ///< The SGPR containing the stream out buffer pointer. If this exists, <c>s[2:3]</c> is always used, but only in the Geometry pipeline.
    uint8_t userExtendedData1PtrSgpr; ///< The SGPR containing the user extended data table pointer.
    // uint8_t userInternalSrtDataPtrSgpr; ///< *Note: Not supported for now*.
    uint8_t constBufferPtrSgpr; ///< The SGPR containing the constant buffer table pointer.
    uint8_t resourcePtrSgpr; ///< The SGPR containing the resource buffer table pointer.
    uint8_t rwResourcePtrSgpr; ///< The SGPR containing the read/write resource buffer table pointer.
    uint8_t samplerPtrSgpr; ///< The SGPR containing the sampler buffer table pointer.
    uint8_t globalInternalPtrSgpr; ///< The SGPR containing the global internal pointer, which is either stored in <c>s[0:1]</c> or <c>s[4:5]</c>.
    uint8_t appendConsumeCounterSgpr; ///< The SGPR containing the 32bit value address and size used from GDS.
    uint8_t gdsMemoryRangeSgpr; ///< The SGPR containing the GDS address range for storage.
    uint8_t ldsEsGsSizeSgpr; ///< The SGPR containing the GWS resource base offset. NOTE(review): the member name suggests the LDS ES-GS size instead -- confirm.
    uint8_t userSrtDataSgpr; ///< The SGPR containing the start offset of the SRT Data Buffer.
    uint8_t userSrtDataCount; ///< The number of <c>DWORD</c>s in use by the SRT Data Buffer. The size will be between 1-8.
    uint8_t gdsKickRingBufferOffsetSgpr; ///< The SGPR containing the GDS kick ring buffer offset for DispatchDraw.
    uint8_t vertexRingBufferOffsetSgpr; ///< The SGPR containing the Vertex kick ring buffer offset for DispatchDraw.
    uint8_t dispatchDrawPtrSgpr; ///< The SGPR containing the DispatchDraw data pointer to DispatchDrawTriangleCullData structure.
    uint8_t dispatchDrawInstancesSgpr; ///< The SGPR containing the number of instances for DispatchDraw.

    // For each available shader-resource-flat-table (aka array), store the memory offset (from the start of the buffer) to the beginning of its flat-table (0xFFFF means it's not used).
    // Note: arrays are 0 indexed but the user can skip/set any index inside the range, allowing gaps at any place. This accelerates setting the pointer to the beginning of flat-tables.
    uint16_t constBufferArrayDwOffset; ///< The constant buffer table offset into the main buffer.
    uint16_t vertexBufferArrayDwOffset; ///< The vertex buffer table offset into the main buffer.
    uint16_t resourceArrayDwOffset; ///< The resource buffer table offset into the main buffer.
    uint16_t rwResourceArrayDwOffset; ///< The read/write resource buffer table offset into the main buffer.
    uint16_t samplerArrayDwOffset; ///< The sampler buffer table offset into the main buffer.
    uint16_t streamOutArrayDwOffset; ///< The stream out buffer table offset into the main buffer. This is only for the Geometry pipeline.

    // For each logical shader API slot, store either: an offset to a memory location, or a User Data (UD) SGPR where the resource should be set.
    // Note: if (item[i]&kResourceInUserDataSgpr) it's set directly into s[0:15] using PM4 packets, otherwise it's copied into the scratch buffer using the offset.
    uint16_t resourceDwOffset[kMaxResourceCount]; ///< The start offset of a resource in the resource buffer table or user data.
    uint16_t rwResourceDwOffset[kMaxRwResourceCount]; ///< The start offset of a resource in the read/write resource buffer table or user data.
    uint16_t samplerDwOffset[kMaxSamplerCount]; ///< The start offset of a sampler in the sampler buffer table or user data.
    uint16_t constBufferDwOffset[kMaxConstantBufferCount]; ///< The start offset of a constant buffer in the constant buffer table or user data.
    uint16_t vertexBufferDwOffset[kMaxVertexBufferCount]; ///< The start offset of a vertex array in the vertex buffer table or user data.
    uint16_t streamOutDwOffset[kMaxStreamOutBufferCount];///< The start offset of a stream out buffer in the stream out buffer table or user data. This is only for the Geometry pipeline.

    uint8_t resourceSlotCount; ///< The number of resource slots used by the shader.
    uint8_t rwResourceSlotCount; ///< The number of rw resource slots used by the shader.
    uint8_t samplerSlotCount; ///< The number of sampler slots used by the shader.
    uint8_t constBufferSlotCount; ///< The number of constant buffer slots used by the shader.
    uint8_t vertexBufferSlotCount; ///< The number of vertex buffer slots used by the shader.
    uint8_t pad[1];

    /** @brief Initializes several resource slots that the shader uses.

        Every slot count is set to the corresponding kMax* limit.
     */
    void initSupportedResourceCounts()
    {
        resourceSlotCount = kMaxResourceCount;
        rwResourceSlotCount = kMaxRwResourceCount;
        samplerSlotCount = kMaxSamplerCount;
        constBufferSlotCount = kMaxConstantBufferCount;
        vertexBufferSlotCount = kMaxVertexBufferCount;
    }
};
// kShaderInputUsageImmAluFloatConst // Immediate float const (scalar or vector). *Not Supported*
// kShaderInputUsageImmAluBool32Const // 32 immediate Booleans packed into one UINT. *Not Supported*
/// Kinds of shader input recorded in an input usage slot. The numbering has
/// gaps; only the values listed here are defined in this file.
typedef enum ShaderInputUsageType
{
    kShaderInputUsageImmResource = 0x00, ///< Immediate read-only buffer/texture descriptor.
    kShaderInputUsageImmSampler = 0x01, ///< Immediate sampler descriptor.
    kShaderInputUsageImmConstBuffer = 0x02, ///< Immediate constant buffer descriptor.
    kShaderInputUsageImmVertexBuffer = 0x03, ///< Immediate vertex buffer descriptor.
    kShaderInputUsageImmRwResource = 0x04, ///< Immediate read/write buffer/texture descriptor.
    kShaderInputUsageImmAluFloatConst = 0x05, ///< Immediate float const (scalar or vector).
    kShaderInputUsageImmAluBool32Const = 0x06, ///< 32 immediate Booleans packed into one UINT.
    kShaderInputUsageImmGdsCounterRange = 0x07, ///< Immediate UINT with GDS address range for counters (used for append/consume buffers).
    kShaderInputUsageImmGdsMemoryRange = 0x08, ///< Immediate UINT with GDS address range for storage.
    kShaderInputUsageImmGwsBase = 0x09, ///< Immediate UINT with GWS resource base offset.
    kShaderInputUsageImmShaderResourceTable = 0x0A, ///< Pointer to read/write resource indirection table.
    kShaderInputUsageImmLdsEsGsSize = 0x0D, ///< Immediate LDS ESGS size used in on-chip GS
    // Skipped several items here...
    kShaderInputUsageSubPtrFetchShader = 0x12, ///< Immediate fetch shader subroutine pointer.
    kShaderInputUsagePtrResourceTable = 0x13, ///< Flat resource table pointer.
    kShaderInputUsagePtrInternalResourceTable = 0x14, ///< Flat internal resource table pointer.
    kShaderInputUsagePtrSamplerTable = 0x15, ///< Flat sampler table pointer.
    kShaderInputUsagePtrConstBufferTable = 0x16, ///< Flat const buffer table pointer.
    kShaderInputUsagePtrVertexBufferTable = 0x17, ///< Flat vertex buffer table pointer.
    kShaderInputUsagePtrSoBufferTable = 0x18, ///< Flat stream-out buffer table pointer.
    kShaderInputUsagePtrRwResourceTable = 0x19, ///< Flat read/write resource table pointer.
    kShaderInputUsagePtrInternalGlobalTable = 0x1A, ///< Internal driver table pointer.
    kShaderInputUsagePtrExtendedUserData = 0x1B, ///< Extended user data pointer.
    kShaderInputUsagePtrIndirectResourceTable = 0x1C, ///< Pointer to resource indirection table.
    kShaderInputUsagePtrIndirectInternalResourceTable = 0x1D, ///< Pointer to internal resource indirection table.
    kShaderInputUsagePtrIndirectRwResourceTable = 0x1E, ///< Pointer to read/write resource indirection table.
    // Skipped several items here...
    kShaderInputUsageImmGdsKickRingBufferOffset = 0x22, ///< Immediate UINT offset into GDS kick ring buffer for DispatchDraw. This must not be in extended user data.
    kShaderInputUsageImmVertexRingBufferOffset = 0x23, ///< Immediate UINT offset into vertex ring buffer for DispatchDraw. This must not be in extended user data.
    kShaderInputUsagePtrDispatchDraw = 0x24, ///< Pointer to DispatchDraw data. This must not be in extended user data.
    kShaderInputUsageImmDispatchDrawInstances = 0x25, ///< Immediate UINT ((firstInstance<<16)|(numInstances-1)). This must not be in extended user data.
} ShaderInputUsageType;
class VsShader;
class PsShader;

/// Locates a parsed shader: a pointer to its header structure plus a pointer
/// to, and the size of, its GPU code.
class ShaderInfo
{
public:
    // NOTE(review): the three pointers are reconstructed as one anonymous union
    // (a typeless view plus typed views of the same pointer); the `union`
    // keyword line is missing from this copy of the file -- confirm upstream.
    union
    {
        const void *m_shaderStruct; ///< A pointer to the shader struct -- typeless.
        const VsShader* m_vsShader; ///< The same pointer viewed as a vertex shader.
        const PsShader* m_psShader; ///< The same pointer viewed as a pixel shader.
    };
    const uint32_t *m_gpuShaderCode; ///< A pointer to the GPU Shader Code which will need to be copied into GPU visible memory.
    uint32_t m_gpuShaderCodeSize; ///< The size of the GPU Shader Code in bytes.
    uint32_t m_reserved;
};
/// Per-shader-type variant information, stored in a single 32-bit slot.
class PipelineStage
{
public:
    /// Represents vertex shader information.
    class VsInfo
    {
    public:
        uint8_t m_vertexShaderVariant; ///< The <c>PsslVertexVariant</c> such as <c>kVertexVariantVertex</c>, <c>kVertexVariantExport</c>, <c>kVertexVariantLocal</c> etc.
        uint8_t m_paddingTo32[3]; ///< Padding.
    };

    /// Represents domain shader information.
    class DsInfo
    {
    public:
        uint8_t m_domainShaderVariant; ///< The <c>PsslDomainVariant</c> such as <c>kDomainVariantVertex</c>, <c>kDomainVariantExport</c> etc.
        uint8_t m_paddingTo32[3]; ///< Padding.
    };

    /// Represents geometry shader information.
    class GsInfo
    {
    public:
        uint8_t m_geometryShaderVariant; ///< The <c>PsslGeometryVariant</c> such as <c>kGeometryVariantOnBuffer</c>, <c>kGeometryVariantOnChip</c> etc.
        uint8_t m_paddingTo32[3]; ///< Padding.
    };

    /// Represents hull shader information.
    class HsInfo
    {
    public:
        uint8_t m_hullShaderVariant; ///< The <c>PsslHullVariant</c> such as <c>kHullVariantOnBuffer</c>, <c>kHullVariantOnChip</c> etc.
        uint8_t m_paddingTo32[3]; ///< Padding.
    };

    /// Stores data as different class types, depending on the type of shader.
    // NOTE(review): reconstructed as an anonymous union (every view is 4 bytes);
    // the `union` keyword line is missing from this copy of the file.
    union
    {
        uint32_t m_u32; ///< An unsigned 32 bit integer.
        VsInfo m_vsInfo; ///< The vertex shader information.
        DsInfo m_dsInfo; ///< The domain shader information.
        GsInfo m_gsInfo; ///< The geometry shader information.
        HsInfo m_hsInfo; ///< The hull shader information.
    };
};
/// Per-shader-type system attributes, stored in a single 12-byte slot.
class SystemAttributes
{
public:
    /// Represents CS pipeline stage information.
    class CsInfo
    {
    public:
        uint16_t m_numThreads[3]; ///< The number of threads.
    };

    /// Represents GS pipeline stage information.
    class GsInfo
    {
    public:
        uint16_t m_instance; ///< The instance of the GS Shader.
        uint16_t m_maxVertexCount; ///< The maximum number of vertices count.
        uint8_t m_inputType; ///< The GS Input Type (<c>PsslGsIoType</c>) such as triangle, line, point, adjacent tri + line, or patch.
        uint8_t m_outputType; ///< The GS Output Type (<c>PsslGsIoType</c>) such as triangles, lines, or points.
        uint8_t m_patchSize; ///< The patch size in case of patch topology.
    };

    /// Represents DS pipeline stage information.
    class DsInfo
    {
    public:
        uint8_t m_domainPatchType; ///< The <c>PsslHsDsPatchType</c>: triangle, quad, or isoline.
        uint8_t m_inputControlPoints; ///< The number of points in the input patch.
    };

    /// Represents HS pipeline stage information.
    class HsInfo
    {
    public:
        uint8_t m_domainPatchType; ///< The <c>PsslHsDsPatchType</c>: triangle, quad, or isoline.
        uint8_t m_inputControlPoints; ///< The number of points in the input patch.
        uint8_t m_outputTopologyType; ///< The <c>PsslHsTopologyType</c>: point, line, cwtri, or ccwtri.
        uint8_t m_partitioningType; ///< The <c>PsslHsPartitioningType</c>: integer, powof2, odd_fractional, or even_fractional.
        uint8_t m_outputControlPoints; ///< The number of points in the output patch.
        uint8_t m_patchSize; ///< The size of patch.
        uint8_t m_paddingTo32[2]; ///< Padding.
        float m_maxTessFactor; ///< The maximum tessellation factor.
    };

    /// Stores data as different class types, depending on the type of shader.
    // NOTE(review): reconstructed as an anonymous union (the largest view,
    // HsInfo, matches the 12-byte m_u32 array); the `union` keyword line is
    // missing from this copy of the file.
    union
    {
        uint32_t m_u32[3]; ///< 12 bytes.
        CsInfo m_csInfo; ///< The compute shader information.
        GsInfo m_gsInfo; ///< The geometry shader information.
        DsInfo m_dsInfo; ///< The domain shader information.
        HsInfo m_hsInfo; ///< The hull shader information.
    };
};
/// Shader binary header: format/compiler versions, association hash, shader
/// and code type, code size, and the per-type stage and system attribute info.
class Header
{
public:
    uint8_t m_formatVersionMajor; ///< The version of shader binary format: major numbering.
    uint8_t m_formatVersionMinor; ///< The version of shader binary format: minor numbering.
    uint16_t m_compilerRevision; ///< The compiler type specific version of shader compiler: this is the svn revision for m_compilerType==kCompilerTypeOrbisPsslc or kCompilerTypeOrbisEsslc or for kCompilerTypeUnspecified (pre-SDK 2.500 versions of these compilers)
    uint32_t m_associationHash0; ///< The first word of the shader association hash.
    uint32_t m_associationHash1; ///< The second word of the shader association hash.
    uint8_t m_shaderType; ///< The <c>PsslShaderType</c>: VS, PS, GS, CS, HS, or DS.
    uint8_t m_codeType; ///< The <c>PsslCodeType</c>: IL, ISA, or SCU.
    uint8_t m_usesShaderResourceTable; ///< The shader uses resource table.
    uint8_t m_compilerType : 4; ///< The <c>PsslCompilerType</c>; 0
    uint8_t m_paddingTo32 : 4; // 0; reserved for future use
    uint32_t m_codeSize; ///< The size of code section.
    PipelineStage m_shaderTypeInfo; ///< The shader pipeline stage info.
    SystemAttributes m_shaderSystemAttributeInfo; ///< The shader system attribute info.
};
/// Header at the start of a shader file: identifier, version, shader type and
/// the size of the type-specific shader header that follows.
class ShaderFileHeader
{
public:
    uint32_t m_fileHeader; ///< File identifier. Should be equal to kShaderFileHeaderId
    uint16_t m_majorVersion; ///< Major version of the shader binary.
    uint16_t m_minorVersion; ///< Minor version of the shader binary.
    uint8_t m_type; ///< Type of shader. Comes from ShaderType.
    uint8_t m_shaderHeaderSizeInDW; ///< <c>\<Type\>Shader.computeSize()/4</c>. For example, see CsShader::computeSize().
    uint8_t m_shaderAuxData; ///< A flag that indicates whether shader auxiliary data is present after end of the shader data ( <c>sizeof(ShaderFileHeader) +</c>
                             ///< <c>m_shaderHeaderSizeInDW * 4 + ShaderCommonData::m_shaderSize +</c>
                             ///< <c>ShaderCommonData::m_embeddedConstantBufferSizeInDQW * 16)</c>. Set to 1 to indicate it is
                             ///< present. NOTE(review): the original sentence is truncated after "it is"; "present" is assumed.
    uint8_t m_targetGpuModes; ///< Union of all TargetGpuMode values for which this shader binary is valid.
    uint32_t m_reserved1; ///< Must be 0.
};
/// Data that starts every shader setup structure.
class ShaderCommonData
{
public:
    // Memory Layout:
    // - Shader setup data (starting with ShaderCommonData)
    // - n InputUsage (4 bytes each)
    // - immediateConstants
    uint32_t m_shaderSize : 23; ///< The size of the shader binary code block in bytes.
    uint32_t m_shaderIsUsingSrt : 1; ///< A bitflag that indicates if the shader is using a Shader Resource Table.
    uint32_t m_numInputUsageSlots : 8; ///< The number of InputUsageSlot entries following the main shader structure.
    uint16_t m_embeddedConstantBufferSizeInDQW; ///< The size of the embedded constant buffer in 16-byte units (<c>DQWORD</c>s).
    uint16_t m_scratchSizeInDWPerThread; ///< The scratch size required by each thread in 4-byte <c>DWORD</c>s.

    /** @brief Calculates and returns the size of the shader code including its embedded CB size in bytes.
        @return m_shaderSize plus m_embeddedConstantBufferSizeInDQW * 16.
     */
    uint32_t computeShaderCodeSizeInBytes() const
    {
        return m_shaderSize + m_embeddedConstantBufferSizeInDQW * 16;
    }
};
/// One entry of the input usage table that follows a shader structure.
#ifdef __cplusplus
class InputUsageSlot
#else // __cplusplus
typedef struct InputUsageSlot
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    uint8_t m_usageType; ///< From ShaderInputUsageType.
    uint8_t m_apiSlot; ///< API slot or chunk ID.
    uint8_t m_startRegister; ///< User data slot.

    // NOTE(review): the last byte is reconstructed as a union of the packed
    // flags and the SRT size so that a slot occupies 4 bytes; the
    // `union`/`struct` lines are missing from this copy of the file.
    union
    {
        struct
        {
            uint8_t m_registerCount : 1; ///< If 0, count is 4DW; if 1, count is 8DW. Other sizes are defined by the usage type.
            uint8_t m_resourceType : 1; ///< If 0, resource type <c>V#</c>; if 1, resource type <c>T#</c>, in case of a kShaderInputUsageImmResource.
            uint8_t m_reserved : 2; ///< Unused; must be set to zero.
            uint8_t m_chunkMask : 4; ///< Internal usage data.
        };
        uint8_t m_srtSizeInDWordMinusOne; ///< Size of the SRT data; used for kShaderInputUsageImmShaderResourceTable.
    };
#ifdef __cplusplus
};
#else // __cplusplus
} InputUsageSlot;
#endif // __cplusplus
/// One entry of a vertex shader's input semantic table.
#ifdef __cplusplus
class VertexInputSemantic
#else // __cplusplus
typedef struct VertexInputSemantic
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    uint8_t m_semantic; ///< Semantic identifier of this input.
    uint8_t m_vgpr; ///< NOTE(review): presumably the VGPR that receives the input -- confirm.
    uint8_t m_sizeInElements; ///< Size of the input, counted in elements.
    uint8_t m_reserved; ///< Unused; must be set to zero.
#ifdef __cplusplus
};
#else // __cplusplus
} VertexInputSemantic;
#endif // __cplusplus
/// One entry of a vertex shader's export semantic table.
#ifdef __cplusplus
class VertexExportSemantic
#else // __cplusplus
typedef struct VertexExportSemantic
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    uint8_t m_semantic; ///< Description to be specified.
    uint8_t m_outIndex : 5; ///< Description to be specified.
    uint8_t m_reserved : 1;
    uint8_t m_exportF16 : 2; ///< if (m_exportF16 == 0) this shader exports a 32-bit value to this parameter; if (m_exportF16 & 1) this shader exports a 16-bit float value to the low 16-bits of each channel; if (m_exportF16 & 2) this shader exports a 16-bit float value to the high 16-bits of each channel
#ifdef __cplusplus
};
#else // __cplusplus
} VertexExportSemantic;
#endif // __cplusplus
/// Register values for the VS shader stage.
#ifdef __cplusplus
class VsStageRegisters
#else // __cplusplus
typedef struct VsStageRegisters
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    uint32_t m_spiShaderPgmLoVs; ///< The pointer to shader program (bits 39:8).
    uint32_t m_spiShaderPgmHiVs; ///< The pointer to shader program (bits 47:40). This must be set to zero.
    uint32_t m_spiShaderPgmRsrc1Vs;
    uint32_t m_spiShaderPgmRsrc2Vs;
    uint32_t m_spiVsOutConfig;
    uint32_t m_spiShaderPosFormat;
    uint32_t m_paClVsOutCntl;
#ifdef __cplusplus
    /** @brief Patches the GPU address of the shader code.
        @param[in] gpuAddress This address to patch. This must be aligned to a 256-byte boundary.
     */
    void patchShaderGpuAddress(void *gpuAddress)
    {
        // Widen to 64 bits first: shifting a 32-bit uintptr_t by 40 would be undefined.
        const uint64_t address = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(gpuAddress));
        m_spiShaderPgmLoVs = static_cast<uint32_t>(address >> 8);
        m_spiShaderPgmHiVs = static_cast<uint32_t>(address >> 40);
    }

    /** @brief Compares the three output-related registers with those of another shader.
        @param[in] shader The register set to compare against.
        @return true when m_spiVsOutConfig, m_spiShaderPosFormat and m_paClVsOutCntl are all equal.
     */
    bool isSharingContext(const VsStageRegisters &shader) const
    {
        return m_spiVsOutConfig == shader.m_spiVsOutConfig
            && m_spiShaderPosFormat == shader.m_spiShaderPosFormat
            && m_paClVsOutCntl == shader.m_paClVsOutCntl;
    }
#endif // __cplusplus
#ifdef __cplusplus
};
#else // __cplusplus
} VsStageRegisters;
#endif // __cplusplus
/// Register values for the PS shader stage.
#ifdef __cplusplus
class PsStageRegisters
#else // __cplusplus
typedef struct PsStageRegisters
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    uint32_t m_spiShaderPgmLoPs; ///< A pointer to shader program (bits 39:8).
    uint32_t m_spiShaderPgmHiPs; ///< A pointer to shader program (bits 47:40). This must be set to zero.
    uint32_t m_spiShaderPgmRsrc1Ps;
    uint32_t m_spiShaderPgmRsrc2Ps;
    uint32_t m_spiShaderZFormat;
    uint32_t m_spiShaderColFormat;
    uint32_t m_spiPsInputEna;
    uint32_t m_spiPsInputAddr;
    uint32_t m_spiPsInControl;
    uint32_t m_spiBarycCntl;
    uint32_t m_dbShaderControl;
    uint32_t m_cbShaderMask;
#ifdef __cplusplus
    /** @brief Patches the GPU address of the shader code.
        @param[in] gpuAddress The address to patch; bits 39:8 go to the Lo register and bits 47:40 to the Hi register.
     */
    void patchShaderGpuAddress(void *gpuAddress)
    {
        // Widen to 64 bits first: shifting a 32-bit uintptr_t by 40 would be undefined.
        const uint64_t address = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(gpuAddress));
        m_spiShaderPgmLoPs = static_cast<uint32_t>(address >> 8);
        m_spiShaderPgmHiPs = static_cast<uint32_t>(address >> 40);
    }
#endif // __cplusplus
#ifdef __cplusplus
};
#else // __cplusplus
} PsStageRegisters;
#endif // __cplusplus
/// Inputs and outputs used when building a fetch shader.
#ifdef __cplusplus
class FetchShaderBuildState
#else // __cplusplus
typedef struct FetchShaderBuildState
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    // Filled up by: Generate[xx]FSBuildState functions
    uint16_t m_fetchShaderBufferSize;
    uint16_t m_fetchShaderFlags; ///< Description to be specified.
    uint8_t m_firstFreeSgpr; ///< Description to be specified.
    uint8_t m_vsharpSlotCount;
    uint16_t m_numElementsInInstancingData; ///< Description to be specified.
    uint32_t m_shaderModifier; // to be passed to set[x]sShader functions.
    uint32_t m_reserved0; ///< Unused; must be set to zero.
    const FetchShaderInstancingMode *m_fetchShaderInstancingData; ///< Description to be specified.

    // From the SC:
    uint8_t m_numInputSemantics;
    uint8_t m_vertexBaseUsgpr;
    uint8_t m_instanceBaseUsgpr;
    uint8_t m_reserved1; ///< Unused; must be set to zero.
    uint32_t m_numInputUsageSlots;
    const VertexInputSemantic *m_inputSemantics;
    const InputUsageSlot *m_inputUsageSlots;

    // Vertex Buffer Semantic Remap Table: <optional>
    uint32_t m_numElementsInRemapTable;
    uint32_t m_reserved2;
    const uint32_t *m_semanticsRemapTable; // vertex buffer index -> vertex buffer semantic (e.g. vbSemantic = m_semanticsRemapTable[vbIndex])
#ifdef __cplusplus
};
#else // __cplusplus
} FetchShaderBuildState;
#endif // __cplusplus
/// One entry of a pixel shader's input semantic table.
#ifdef __cplusplus
class PixelInputSemantic
#else // __cplusplus
typedef struct PixelInputSemantic
#endif // __cplusplus
{
#ifdef __cplusplus
public:
#endif // __cplusplus
    // NOTE(review): reconstructed as a union of two 16-bit views (the NEO view
    // skips the first 12 bits with an unnamed field); the `union`/`struct`
    // lines are missing from this copy of the file -- confirm upstream.
    union
    {
        struct
        {
            uint16_t m_semantic : 8; ///< The semantic, matched against the semantic value in the VertexExportSemantic table in the VS shader.
            uint16_t m_defaultValue : 2; ///< The default value supplied to the shader, if m_semantic is not matched in the VS shader. 0={0,0,0,0}, 1={0,0,0,1.0}, 2={1.0,1.0,1.0,0}, 3={1.0,1.0,1.0,1.0}
            uint16_t m_isFlatShaded : 1; ///< if (m_interpF16 == 0) A bitflag that specifies whether the value interpolation is constant in the shader. It is ignored if <c><i>m_isCustom</i></c> is set; otherwise, it indicates that a shader reads only { P0 } and that some handling of infinite values in the calculation of P1-P0 and P2-P0 can be disabled.
            uint16_t m_isLinear : 1; ///< A bitflag that specifies whether the value interpolation is linear in the shader. It is unused by the Gnm runtime.
            uint16_t m_isCustom : 1; ///< if (m_interpF16 == 0) A bitflag that specifies whether the value interpolation is custom in the shader. It determines whether hardware subtraction should be disabled, supplying { P0, P1, P2 } to the shader instead of { P0, P1-P0, P2-P0 }.
            uint16_t m_reserved : 3; ///< Unused; set to zero.
        };
        // NEO mode only:
        struct
        {
            uint16_t : 12; ///< Description to be specified.
            uint16_t m_defaultValueHi : 2; ///< if (m_interpF16 != 0) indicates the default value supplied to the shader for the upper 16-bits if m_semantic is not matched in the VS shader, and m_defaultValue indicates the default value for the lower 16-bits.
            uint16_t m_interpF16 : 2; ///< if (m_interpF16 == 0) this is a 32-bit float or custom value; if (m_interpF16 & 1) the low 16-bits of this parameter expect 16-bit float interpolation and/or default value; if (m_interpF16 & 2) the high 16-bits of this parameter expect 16-bit float interpolation and/or default value
        };
    };
#ifdef __cplusplus
};
#else // __cplusplus
} PixelInputSemantic;
#endif // __cplusplus
// Describes a vertex shader (VS stage) binary: the data common to all shader
// stages, the VS stage register values, and the element counts of the tables
// that are stored directly behind this object.
// As the accessors below show, the tables are laid out back to back:
//   [VsShader]
//   [InputUsageSlot       x m_common.m_numInputUsageSlots]
//   [VertexInputSemantic  x m_numInputSemantics]
//   [VertexExportSemantic x m_numExportSemantics]
class VsShader
ShaderCommonData m_common; ///< The common data for all shader stages.
VsStageRegisters m_vsStageRegisters; ///< The data to be loaded into the VS shader stage registers. Please see DrawCommandBuffer::setVsShader() for more information.
// not used if domain shader => vertex shader
uint8_t m_numInputSemantics; ///< The number of entries in the input semantic table.
uint8_t m_numExportSemantics; ///< The number of entries in the export semantic table.
uint8_t m_gsModeOrNumInputSemanticsCs; ///< Stores a union of VsShaderGsMode values for a VsShader or GsShader::getCopyShader(), which are translated into a GsMode constant. For CsVsShader::getVertexShader() with CsVsShader::getComputeShader()->m_version==0, the number of input semantic table entries to use for the CsVsShader::getComputeShader() fetch shader is stored.
uint8_t m_fetchControl; ///< The user registers that receive vertex and instance offsets for use in the fetch shader.
/** @brief Patches the GPU address of the shader code.
@param[in] gpuAddress This address to patch. This must be aligned to a 256-byte boundary.
void patchShaderGpuAddress(void *gpuAddress)
// Rebuilds the shader code address from the two VS program registers:
// m_spiShaderPgmHiVs is shifted left by 40 and m_spiShaderPgmLoVs by 8, so the
// low 8 bits of the result are always zero (consistent with the 256-byte
// alignment required by patchShaderGpuAddress()).
void *getBaseAddress() const
return (void *)((((uintptr_t)m_vsStageRegisters.m_spiShaderPgmHiVs) << 40) | (((uintptr_t)m_vsStageRegisters.m_spiShaderPgmLoVs) << 8));
/** @brief Gets a pointer to this shader's input usage slot table that immediately follows this shader's structure in memory.
@return A pointer to this shader's input usage slot table.
// `this + 1` is the first byte after the VsShader object itself.
const InputUsageSlot *getInputUsageSlotTable() const { return (const InputUsageSlot *)(this + 1); }
/** @brief Gets a pointer to this shader's input semantic table that immediately follows the input usage table in memory.
@return A pointer to this shader's input semantic table.
const VertexInputSemantic *getInputSemanticTable() const { return (const VertexInputSemantic *)(getInputUsageSlotTable() + m_common.m_numInputUsageSlots); }
/** @brief Gets a pointer to this shader's export semantic table that immediately follows the input semantic table in memory.
@return A pointer to this shader's export semantic table.
const VertexExportSemantic *getExportSemanticTable() const { return (const VertexExportSemantic *)(getInputSemanticTable() + m_numInputSemantics); }
/** @brief Computes the total size (in bytes) of the shader binary including this structure, the input usage table, and the input and export semantic tables.
@return The total size in bytes of this shader binary and its associated tables.
uint32_t computeSize() const
const uint32_t size = sizeof(VsShader) +
sizeof(InputUsageSlot) * m_common.m_numInputUsageSlots +
sizeof(VertexInputSemantic) * m_numInputSemantics +
sizeof(VertexExportSemantic) * m_numExportSemantics;
// Round the byte count up to the next multiple of 4.
return (size + 3) & ~3U;
/** @brief Gets the user register that contains the vertex offset.
@return The index of the register containing the vertex offset. A value of 0 indicates no register contains the vertex offset.
uint8_t getVertexOffsetUserRegister() const
// The vertex offset register index is the low nibble of m_fetchControl.
return m_fetchControl & 0xf;
/** @brief Gets the user register that contains the instance offset.
@return The index of the register containing the instance offset. A value of 0 indicates no register contains the instance offset.
uint8_t getInstanceOffsetUserRegister() const
// The instance offset register index is the high nibble of m_fetchControl.
return (m_fetchControl >> 4) & 0xf;
// Describes a pixel shader (PS stage) binary: the data common to all shader
// stages, the PS stage register values, and the element count of the input
// semantic table.
// As the accessors below show, the tables are laid out back to back:
//   [PsShader]
//   [InputUsageSlot     x m_common.m_numInputUsageSlots]
//   [PixelInputSemantic x m_numInputSemantics]
class PsShader
ShaderCommonData m_common; ///< The common data for all shader stages.
PsStageRegisters m_psStageRegisters; ///< The data to be loaded into the PS shader stage registers. Please see Gnm::DrawCommandBuffer::setPsShader() for more details.
uint8_t m_numInputSemantics; ///< The number of entries in the input semantic table.
uint8_t m_reserved[3]; ///< Unused
/** @brief Patches the GPU address of the shader code.
@param[in] gpuAddress The address to patch. This must be aligned to a 256-byte boundary.
void patchShaderGpuAddress(void *gpuAddress)
/** @brief Retrieves the GPU address of the shader code.
@return The address of the shader code.
void *getBaseAddress() const
// m_spiShaderPgmHiPs is shifted left by 40 and m_spiShaderPgmLoPs by 8, so the
// low 8 bits of the returned address are always zero.
return (void *)((((uintptr_t)m_psStageRegisters.m_spiShaderPgmHiPs) << 40) | (((uintptr_t)m_psStageRegisters.m_spiShaderPgmLoPs) << 8));
/** @brief Gets a pointer to this shader's input usage slot table that immediately follows this shader's structure in memory.
@return A pointer to this shader's input usage slot table.
// `this + 1` is the first byte after the PsShader object itself.
const InputUsageSlot *getInputUsageSlotTable() const { return (const InputUsageSlot *)(this + 1); }
/** @brief Gets a pointer to this shader's input semantic table that immediately follows the input usage table in memory.
@return A pointer to this shader's input semantic table.
const PixelInputSemantic *getPixelInputSemanticTable() const { return (const PixelInputSemantic *)(getInputUsageSlotTable() + m_common.m_numInputUsageSlots); }
/** @brief Computes the total size (in bytes) of the shader binary including this structure, the input usage table and the input semantic table.
@return The total size in bytes of this shader binary and its associated tables.
uint32_t computeSize() const
const uint32_t size = sizeof(PsShader) +
sizeof(InputUsageSlot) * m_common.m_numInputUsageSlots +
sizeof(PixelInputSemantic) * m_numInputSemantics;
// Round the byte count up to the next multiple of 4.
return (size + 3) & ~3U;
// Locates the shader structure and the GPU code inside a shader binary blob.
//
// The blob is read as consecutive records: a Header, immediately followed by a
// ShaderFileHeader, immediately followed by the ShaderCommonData. The 32-bit
// word stored right after the ShaderCommonData is a byte offset, relative to
// the start of the ShaderCommonData, at which the GPU code begins.
//
//   shaderInfo - receives the shader structure pointer, the code pointer and
//                the code size; must not be null.
//   data       - start of the shader binary blob; must not be null.
void parseShader(ShaderInfo *shaderInfo, const void* data)
{
	const Header *const fileHeader = static_cast<const Header*>(data);
	const ShaderFileHeader *const shaderHeader = reinterpret_cast<const ShaderFileHeader*>(fileHeader + 1);
	const ShaderCommonData *const common = reinterpret_cast<const ShaderCommonData*>(shaderHeader + 1);

	// First dword behind the common data: byte offset of the code, converted to dwords.
	const uint32_t codeOffsetInBytes = *reinterpret_cast<const uint32_t*>(common + 1);
	const uint32_t codeOffsetInDwords = codeOffsetInBytes / 4;

	ShaderCommonData *const mutableCommon = const_cast<ShaderCommonData*>(common);
	shaderInfo->m_shaderStruct = static_cast<void*>(mutableCommon);
	shaderInfo->m_gpuShaderCode = reinterpret_cast<uint32_t*>(mutableCommon) + codeOffsetInDwords;
	shaderInfo->m_gpuShaderCodeSize = common->computeShaderCodeSizeInBytes();
}
// Searches a shader binary for the 7-byte "OrbShdr" signature that starts a
// ShaderBinaryInfo record.
//
//   code    - first byte of the buffer to search; a null pointer yields nullptr.
//   maxSize - number of bytes of `code` that may be read. Callers that know the
//             size of their buffer should pass it so the search cannot run past
//             the end. The default (0xFFFFFFFF) keeps the historical scan limit
//             for existing single-argument callers, who must still guarantee
//             that the signature is present.
//
// Returns a pointer to the first occurrence of the signature, or nullptr when
// it does not occur within the first `maxSize` bytes.
ShaderBinaryInfo* findShaderBinInfo(uint8_t* code, size_t maxSize = 0xFFFFFFFF)
{
	static const char kSignature[] = "OrbShdr";
	const size_t kSignatureSize = sizeof(kSignature) - 1; // exclude the terminating NUL

	if (code == nullptr || maxSize < kSignatureSize)
	{
		return nullptr;
	}

	// Last offset at which a complete signature still fits inside the limit;
	// stopping here keeps the compare from reading beyond `maxSize`.
	const size_t lastStart = maxSize - kSignatureSize;
	for (size_t i = 0; i <= lastStart; ++i)
	{
		if (std::memcmp(code + i, kSignature, kSignatureSize) == 0)
		{
			return reinterpret_cast<ShaderBinaryInfo*>(code + i);
		}
	}
	return nullptr;
}
// Fills *outTable with the location (user-data SGPR or dword offset) of every
// shader input described by the input usage slots that precede the
// ShaderBinaryInfo footer `sb`.
//
// Memory layout read here (derived from the pointer arithmetic below):
//   [InputUsageSlot x sb->m_numInputUsageSlots][usage-mask dwords][... ShaderBinaryInfo]
// The usage masks start sb->m_chunkUsageBaseOffsetInDW * 4 bytes before `sb`,
// and the input usage slots end where the usage masks begin.
//
//   outTable - receives the result. It is first filled with 0xFF bytes, so any
//              field not written below keeps an all-ones value.
//   sb       - the ShaderBinaryInfo footer of the shader binary.
//
// The work is done in four steps: immediate resources, resource-table
// pointers, the deferred vertex-buffer chunk mask, and a final pass over the
// usage types.
//
// NOTE(review): the body uses shaderStage, gnmxShaderStruct, isDispatchDraw,
// semanticRemapTable and numElementsInSemanticRemapTable, none of which are
// parameters or locals of this function. They look like leftovers from a wider
// original signature (see the commented-out lines); confirm where they are
// declared.
void generateInputResourceOffsetTable(InputResourceOffsets* outTable, ShaderBinaryInfo* sb)
//SCE_GNM_ASSERT(shaderStage <= kShaderStageCount);
// Get resource info to populate ShaderResourceOffsets
//ShaderBinaryInfo const *shaderBinaryInfo = (ShaderBinaryInfo const*)((uintptr_t)shaderCode + shaderCodeSizeInBytes - sizeof(ShaderBinaryInfo));
ShaderBinaryInfo const *shaderBinaryInfo = sb;
//SCE_GNM_ASSERT((*(reinterpret_cast<uint64_t const*>(shaderBinaryInfo->m_signature)) & kShaderBinaryInfoSignatureMask) == kShaderBinaryInfoSignatureU64);
// Get usage masks and input usage slots
uint32_t const* usageMasks = reinterpret_cast<unsigned int const*>((unsigned char const*)shaderBinaryInfo - shaderBinaryInfo->m_chunkUsageBaseOffsetInDW * 4);
int32_t inputUsageSlotsCount = shaderBinaryInfo->m_numInputUsageSlots;
InputUsageSlot const* inputUsageSlots = (InputUsageSlot const*)usageMasks - inputUsageSlotsCount;
// Cache shader input information into the ShaderResource Offsets table
__builtin_memset(outTable, 0xFF, sizeof(InputResourceOffsets));
//outTable->shaderStage = isDispatchDraw && shaderStage == kShaderStageCs ? kShaderStageAsynchronousCompute : shaderStage;
//outTable->isSrtShader = isSrtUsed;
int32_t lastUserDataResourceSizeInDwords = 0;
uint16_t requiredMemorySizeInDwords = 0;
// Here we handle all immediate resources s[1:16] plus s[16:48] (extended user data)
// resources that go into the extended user data also have "immediate" usage type, although they are stored in a table (not loaded by the SPI)
for (int32_t i = 0; i < inputUsageSlotsCount; ++i)
uint8_t apiSlot = inputUsageSlots[i].m_apiSlot;
uint8_t startRegister = inputUsageSlots[i].m_startRegister;
bool isVSharp = (inputUsageSlots[i].m_resourceType == 0);
uint16_t vsharpFlag = (isVSharp) ? kResourceIsVSharp : 0;
// Registers at or above kMaxUserDataCount are expressed as a dword offset
// counted from kMaxUserDataCount; lower registers get offset 0.
uint16_t extendedRegisterOffsetInDwords = (startRegister >= kMaxUserDataCount) ?
(startRegister - kMaxUserDataCount) : 0;
// Keep the largest extended offset seen so far.
requiredMemorySizeInDwords = (requiredMemorySizeInDwords > extendedRegisterOffsetInDwords) ?
requiredMemorySizeInDwords : extendedRegisterOffsetInDwords;
// Handle immediate resources, including some pointer types
switch (inputUsageSlots[i].m_usageType)
case kShaderInputUsageImmGdsCounterRange:
outTable->appendConsumeCounterSgpr = startRegister;
case kShaderInputUsageImmGdsMemoryRange:
outTable->gdsMemoryRangeSgpr = startRegister;
case kShaderInputUsageImmLdsEsGsSize:
outTable->ldsEsGsSizeSgpr = startRegister;
case kShaderInputUsageSubPtrFetchShader:
SCE_GNM_ASSERT(apiSlot == 0);
outTable->fetchShaderPtrSgpr = startRegister;
case kShaderInputUsagePtrInternalGlobalTable:
SCE_GNM_ASSERT(apiSlot == 0);
outTable->globalInternalPtrSgpr = startRegister;
case kShaderInputUsagePtrExtendedUserData:
SCE_GNM_ASSERT(apiSlot == 1);
outTable->userExtendedData1PtrSgpr = startRegister;
case kShaderInputUsageImmGdsKickRingBufferOffset:
SCE_GNM_ASSERT(apiSlot == 0);
SCE_GNM_ASSERT(startRegister < kMaxUserDataCount);
outTable->gdsKickRingBufferOffsetSgpr = startRegister;
case kShaderInputUsageImmVertexRingBufferOffset:
SCE_GNM_ASSERT(apiSlot == 0);
SCE_GNM_ASSERT(startRegister < kMaxUserDataCount);
outTable->vertexRingBufferOffsetSgpr = startRegister;
case kShaderInputUsagePtrDispatchDraw:
SCE_GNM_ASSERT(apiSlot == 0);
outTable->dispatchDrawPtrSgpr = startRegister;
case kShaderInputUsageImmDispatchDrawInstances:
SCE_GNM_ASSERT(apiSlot == 0);
SCE_GNM_ASSERT(startRegister < kMaxUserDataCount);
outTable->dispatchDrawInstancesSgpr = startRegister;
// below resources can either be inside UserData or the EUD
// For each of them: a register below kMaxUserDataCount is stored with the
// kResourceInUserDataSgpr flag set, otherwise the extended dword offset is stored.
case kShaderInputUsageImmResource:
SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->resourceSlotCount);
outTable->resourceDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
(kResourceInUserDataSgpr | vsharpFlag | startRegister) : (vsharpFlag | extendedRegisterOffsetInDwords);
lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeResource;
case kShaderInputUsageImmRwResource:
SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->rwResourceSlotCount);
outTable->rwResourceDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
(kResourceInUserDataSgpr | vsharpFlag | startRegister) : (vsharpFlag | extendedRegisterOffsetInDwords);
lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeRwResource;
case kShaderInputUsageImmSampler:
SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->samplerSlotCount);
outTable->samplerDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
(kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords;
lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeSampler;
case kShaderInputUsageImmConstBuffer:
SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->constBufferSlotCount);
outTable->constBufferDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
(kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords;
lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeConstantBuffer;
case kShaderInputUsageImmVertexBuffer:
SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < outTable->vertexBufferSlotCount);
outTable->vertexBufferDwOffset[apiSlot] = (startRegister < kMaxUserDataCount) ?
(kResourceInUserDataSgpr | startRegister) : extendedRegisterOffsetInDwords;
lastUserDataResourceSizeInDwords = (startRegister < kMaxUserDataCount) ? 0 : kDwordSizeVertexBuffer;
// SRTs will always reside inside the Imm UserData (dwords 0-15), as opposed to the
// above resources which can exist in the EUD
case kShaderInputUsageImmShaderResourceTable:
SCE_GNM_ASSERT(apiSlot >= 0 && apiSlot < kMaxUserDataCount);
outTable->userSrtDataSgpr = inputUsageSlots[i].m_startRegister;
outTable->userSrtDataCount = inputUsageSlots[i].m_srtSizeInDWordMinusOne + 1;
// case kShaderInputUsagePtrSoBufferTable: // Only present in the VS copy-shader that doesn't have a footer
// outTable->streamOutPtrSgpr = startRegister;
// break;
// Make sure we can fit a T# (if required) in the last userOffset
requiredMemorySizeInDwords += lastUserDataResourceSizeInDwords;
// Now handle only pointers to resource-tables. Items handled below cannot be found more than once
// Note: in order to maintain binary compatibility, we can only put a new chunk mask for kShaderInputUsagePtrVertexBufferTable at the end of all other chunk masks
bool bUseVertexBufferTableChunkMask = false;
for (int32_t i = 0; i < inputUsageSlotsCount; ++i)
uint8_t maskChunks = inputUsageSlots[i].m_chunkMask;
// kNibbleToCount packs sixteen 4-bit entries; entry k is the number of set
// bits in k, so chunksCount is the number of mask dwords announced by the low
// nibble of maskChunks.
const uint64_t kNibbleToCount = 0x4332322132212110ull;
uint8_t chunksCount = (kNibbleToCount >> ((maskChunks & 0xF) * 4)) & 0xF;
SCE_GNM_ASSERT(usageMasks + chunksCount <= (uint32_t const*)shaderBinaryInfo);
// Lets fill the resource indices first
uint32_t usedApiSlots[kSlotCountResource]; // Use the size of the biggest resource table
uint32_t usedApiSlotCount;
// This thing will break if there's more than 1 table for any resource type
uint8_t startRegister = inputUsageSlots[i].m_startRegister;
// Each table case below records the pointer SGPR and the table's starting dword
// offset, assigns every used API slot an offset inside the table, and then
// grows requiredMemorySizeInDwords by (last used slot + 1) entries.
switch (inputUsageSlots[i].m_usageType)
case kShaderInputUsagePtrResourceTable:
SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
outTable->resourcePtrSgpr = startRegister;
outTable->resourceArrayDwOffset = requiredMemorySizeInDwords;
if (!(maskChunks & 0xF))
SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);
uint32_t maskArray[4] = { 0, 0, 0, 0 }; // Max 128 slots are supported in the kShaderInputUsagePtrResourceTable
if (maskChunks & 1) maskArray[0] = *usageMasks++; // get slots 0-31 which are set in Chunk 0
if (maskChunks & 2) maskArray[1] = *usageMasks++; // get slots 32-63 which are set in Chunk 1
if (maskChunks & 4) maskArray[2] = *usageMasks++; // get slots 64-95 which are set in Chunk 2
if (maskChunks & 8) maskArray[3] = *usageMasks++; // get slots 96-127 which are set in Chunk 3
SCE_GNM_ASSERT(usageMasks <= (uint32_t const*)shaderBinaryInfo);
usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->resourceSlotCount, maskArray, kSlotCountResource);
SCE_GNM_ASSERT(usedApiSlotCount > 0);
uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
for (uint8_t j = 0; j < usedApiSlotCount; j++)
uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
outTable->resourceDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeResource;
requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeResource;
case kShaderInputUsagePtrRwResourceTable:
SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
outTable->rwResourcePtrSgpr = startRegister;
outTable->rwResourceArrayDwOffset = requiredMemorySizeInDwords;
if (!(maskChunks & 1))
SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);
usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->rwResourceSlotCount, *usageMasks++, kSlotCountRwResource);
SCE_GNM_ASSERT(usedApiSlotCount > 0);
uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
for (uint8_t j = 0; j < usedApiSlotCount; j++)
uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
outTable->rwResourceDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeRwResource;
requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeRwResource;
case kShaderInputUsagePtrConstBufferTable:
SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
outTable->constBufferPtrSgpr = startRegister;
outTable->constBufferArrayDwOffset = requiredMemorySizeInDwords;
if (!(maskChunks & 1))
SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);
usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->constBufferSlotCount, *usageMasks++, kSlotCountConstantBuffer);
SCE_GNM_ASSERT(usedApiSlotCount > 0);
uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
for (uint8_t j = 0; j < usedApiSlotCount; j++)
uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
outTable->constBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeConstantBuffer;
requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeConstantBuffer;
case kShaderInputUsagePtrSamplerTable:
SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
outTable->samplerPtrSgpr = startRegister;
outTable->samplerArrayDwOffset = requiredMemorySizeInDwords;
if (!(maskChunks & 1))
SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);
usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->samplerSlotCount, *usageMasks++, kSlotCountSampler);
SCE_GNM_ASSERT(usedApiSlotCount > 0);
uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
for (uint8_t j = 0; j < usedApiSlotCount; j++)
uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
outTable->samplerDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeSampler;
requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeSampler;
case kShaderInputUsagePtrVertexBufferTable:
SCE_GNM_ASSERT(shaderStage == kShaderStageLs || shaderStage == kShaderStageEs || shaderStage == kShaderStageVs || shaderStage == kShaderStageCs);
SCE_GNM_ASSERT(inputUsageSlots[i].m_apiSlot == 0);
outTable->vertexBufferPtrSgpr = startRegister;
outTable->vertexBufferArrayDwOffset = requiredMemorySizeInDwords;
if (maskChunks & 1)
// Skip updating for the vertex buffer table below, since we are using the chunk mask at the end
// we'll update it after everything else.
bUseVertexBufferTableChunkMask = true;
// Without a chunk mask the used slots are taken from the input semantic table
// of the stage-specific shader structure instead.
SCE_GNM_ASSERT(gnmxShaderStruct != NULL);
const VertexInputSemantic* semanticTable = NULL;
usedApiSlotCount = 0;
if (shaderStage == kShaderStageVs && isDispatchDraw)
VsShader const* pVsShader = ((CsVsShader*)gnmxShaderStruct)->getVertexShader();
usedApiSlotCount = pVsShader->m_numInputSemantics;
semanticTable = pVsShader->getInputSemanticTable();
else if (shaderStage == kShaderStageVs)
usedApiSlotCount = ((VsShader*)gnmxShaderStruct)->m_numInputSemantics;
semanticTable = ((VsShader*)gnmxShaderStruct)->getInputSemanticTable();
else if (shaderStage == kShaderStageLs)
usedApiSlotCount = ((LsShader*)gnmxShaderStruct)->m_numInputSemantics;
semanticTable = ((LsShader*)gnmxShaderStruct)->getInputSemanticTable();
else if (shaderStage == kShaderStageEs)
usedApiSlotCount = ((EsShader*)gnmxShaderStruct)->m_numInputSemantics;
semanticTable = ((EsShader*)gnmxShaderStruct)->getInputSemanticTable();
if (shaderStage == kShaderStageCs && isDispatchDraw)
VsShader const* pVsShader = ((CsVsShader*)gnmxShaderStruct)->getVertexShader();
CsShader const* pCsShader = ((CsVsShader*)gnmxShaderStruct)->getComputeShader();
if (pCsShader->m_version >= kDdCsShaderVersion_IndependentCsFetchShader)
usedApiSlotCount = pCsShader->m_numInputSemantics;
semanticTable = pCsShader->getInputSemanticTable();
usedApiSlotCount = pVsShader->m_gsModeOrNumInputSemanticsCs;
semanticTable = pVsShader->getInputSemanticTable();
else if (shaderStage == kShaderStageCs)
CsShader const* pCsShader = ((CsShader*)gnmxShaderStruct);
if (pCsShader->m_version >= kDdCsShaderVersion_IndependentCsFetchShader)
usedApiSlotCount = pCsShader->m_numInputSemantics;
semanticTable = pCsShader->getInputSemanticTable();
usedApiSlotCount = 0;
semanticTable = NULL;
// Check if the shader uses any API-slot over the maximum count configured for the InputResourceOffset table
SCE_GNM_ASSERT(usedApiSlotCount > 0 && usedApiSlotCount <= outTable->vertexBufferSlotCount);
uint32_t usedApiSlots[kSlotCountVertexBuffer];
// First use what the shader generated
for (uint8_t i = 0; i < usedApiSlotCount; i++)
uint8_t semanticIndex = semanticTable[i].m_semantic;
SCE_GNM_ASSERT(semanticIndex >= 0 && semanticIndex < outTable->vertexBufferSlotCount);
usedApiSlots[i] = semanticIndex;
uint32_t firstUsedApiSlot = usedApiSlots[0];
uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
// If a semanticRemapTable has been provided, override the shaders defined usage slots to conform with the remapped layout
if (semanticRemapTable && numElementsInSemanticRemapTable != 0)
// Override values defined in the shader binary header above
SCE_GNM_ASSERT(usedApiSlotCount <= numElementsInSemanticRemapTable);
usedApiSlotCount = remapVertexBufferOffsetsWithSemanticTable(&usedApiSlots[0], firstUsedApiSlot, lastUsedApiSlot, outTable->vertexBufferSlotCount,
semanticRemapTable, numElementsInSemanticRemapTable);
// Generate the final dword offsets for the vertex buffer table
lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
for (uint8_t j = 0; j < usedApiSlotCount; j++)
uint16_t currentApiSlot = static_cast<uint16_t>(usedApiSlots[j]);
outTable->vertexBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeVertexBuffer;
requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeVertexBuffer;
// Note: this must be called after all other tables are processed above, as the vertex buffer table chunk mask (*usageMasks)
// is always stored at the end of the chunk mask table
if (bUseVertexBufferTableChunkMask)
uint32_t usedApiSlots[kSlotCountVertexBuffer];
uint32_t usedApiSlotCount;
SCE_GNM_ASSERT(usageMasks < (uint32_t const*)shaderBinaryInfo);
usedApiSlotCount = getUsedApiSlotsFromMask(&usedApiSlots[0], outTable->vertexBufferSlotCount, *usageMasks++, kSlotCountVertexBuffer);
SCE_GNM_ASSERT(usedApiSlotCount > 0);
uint32_t firstUsedApiSlot = usedApiSlots[0];
uint32_t lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
// If a semanticRemapTable has been provided, override the shaders defined usage slots to conform with the remapped layout
if (semanticRemapTable && numElementsInSemanticRemapTable != 0)
// Override values defined in the shader binary header above
SCE_GNM_ASSERT((uint32_t)usedApiSlotCount <= numElementsInSemanticRemapTable);
usedApiSlotCount = remapVertexBufferOffsetsWithSemanticTable(&usedApiSlots[0], firstUsedApiSlot, lastUsedApiSlot, outTable->vertexBufferSlotCount,
semanticRemapTable, numElementsInSemanticRemapTable);
// Generate the final dword offsets for the vertex buffer table
lastUsedApiSlot = usedApiSlots[usedApiSlotCount - 1];
for (uint8_t j = 0; j < usedApiSlotCount; j++)
int32_t currentApiSlot = usedApiSlots[j];
outTable->vertexBufferDwOffset[currentApiSlot] = requiredMemorySizeInDwords + currentApiSlot * kDwordSizeVertexBuffer;
requiredMemorySizeInDwords += (lastUsedApiSlot + 1) * kDwordSizeVertexBuffer;
// Final amount of memory the shader will use from the scratch and resource buffer
outTable->requiredBufferSizeInDwords = requiredMemorySizeInDwords;
// Checking for non handled input data
for (int32_t i = 0; i < inputUsageSlotsCount; ++i)
switch (inputUsageSlots[i].m_usageType)
case kShaderInputUsageImmResource:
case kShaderInputUsageImmRwResource:
case kShaderInputUsageImmSampler:
case kShaderInputUsageImmConstBuffer:
case kShaderInputUsageImmVertexBuffer:
case kShaderInputUsageImmShaderResourceTable:
case kShaderInputUsageSubPtrFetchShader:
case kShaderInputUsagePtrExtendedUserData:
case kShaderInputUsagePtrResourceTable:
case kShaderInputUsagePtrRwResourceTable:
case kShaderInputUsagePtrConstBufferTable:
case kShaderInputUsagePtrVertexBufferTable:
case kShaderInputUsagePtrSamplerTable:
case kShaderInputUsagePtrInternalGlobalTable:
case kShaderInputUsageImmGdsCounterRange:
case kShaderInputUsageImmGdsMemoryRange:
case kShaderInputUsageImmLdsEsGsSize:
case kShaderInputUsageImmGdsKickRingBufferOffset:
case kShaderInputUsageImmVertexRingBufferOffset:
case kShaderInputUsagePtrDispatchDraw:
case kShaderInputUsageImmDispatchDrawInstances:
// case kShaderInputUsagePtrSoBufferTable: // Only present in the VS copy-shader that doesn't have a footer
// Not handled yet
// NOTE(review): the statement below is a bare comma expression and has no
// effect; it looks like the argument list of an assert-style macro whose name
// was removed. Confirm whether unsupported usage types should be reported.
(false, "Input Usage Slot type %d is not supported by LCUE sce::generateInputResourceOffsetTable()", inputUsageSlots[i].m_usageType);
// Derives the fetch-shader build parameters for a vertex shader from its VS
// stage registers and stores them in *fsbs.
// The body is decompiler output; the trailing register names on the local
// declarations come from the decompiler.
//
//   fsbs                        - receives the computed build state.
//   pregs                       - VS stage registers; m_spiShaderPgmRsrc1Vs and
//                                 m_spiShaderPgmRsrc2Vs are read.
//   numInputs                   - number of fetch inputs; only used to size
//                                 m_fetchShaderBufferSize.
//   instancingData              - stored as-is in fsbs; may be null.
//   numElementsInInstancingData - recorded in fsbs only when instancingData is
//                                 non-null, otherwise 0 is recorded.
//   vertexBaseUserSgpr          - stored in fsbs; a non-zero value adds one
//                                 dword to the buffer size.
//   instanceBaseUsgpr           - stored in fsbs; a non-zero value adds one
//                                 dword to the buffer size.
//
// Returns the element count written to fsbs->m_numElementsInInstancingData.
// The count passes through the 16-bit local `result`, so it is narrowed to
// 16 bits on the way.
int generateVsFetchShaderBuildState(FetchShaderBuildState *fsbs, const VsStageRegisters *pregs, uint32_t numInputs,
const FetchShaderInstancingMode *instancingData, const uint32_t numElementsInInstancingData,
uint8_t vertexBaseUserSgpr, uint8_t instanceBaseUsgpr)
uint32_t spiShaderPgmRsrc2Vs; // ebp
uint32_t spiShaderPgmRsrc1Vs; // er14
uint32_t firstFreeSgpr; // ebp
_BOOL4 v10; // ebx
unsigned int vsharpSlotCount; // er15
int v12; // er14
int v13; // esi
__int16 result; // ax
bool noInstance; // zf
signed int v16; // ecx
spiShaderPgmRsrc1Vs = pregs->m_spiShaderPgmRsrc1Vs;
spiShaderPgmRsrc2Vs = pregs->m_spiShaderPgmRsrc2Vs;
// Sum several single-bit flags and the 5-bit field at bits 1..5 of RSRC2, then
// round the total up to a multiple of 4 (+3, then clear the two low bits).
// NOTE(review): `(unsigned __int8)spiShaderPgmRsrc2Vs >> 8` casts before
// shifting and therefore always contributes 0; confirm whether bit 8 was meant.
firstFreeSgpr = ((((unsigned __int8)(spiShaderPgmRsrc2Vs >> 7) | (unsigned __int8)(spiShaderPgmRsrc2Vs >> 12)) & 1)
+ ((spiShaderPgmRsrc2Vs >> 24) & 1)
+ ((spiShaderPgmRsrc2Vs >> 12) & 1)
+ ((spiShaderPgmRsrc2Vs >> 11) & 1)
+ ((spiShaderPgmRsrc2Vs >> 10) & 1)
+ ((spiShaderPgmRsrc2Vs >> 9) & 1)
+ ((unsigned __int8)spiShaderPgmRsrc2Vs >> 8)
+ ((spiShaderPgmRsrc2Vs >> 7) & 1)
+ (spiShaderPgmRsrc2Vs & 1)
+ ((spiShaderPgmRsrc2Vs >> 1) & 0x1F)
+ 3) & 0xFFFFFFFC;
// v10 is set when the slot count derived from RSRC1 and firstFreeSgpr would be
// zero; in that case vsharpSlotCount becomes 2 instead.
v10 = (((spiShaderPgmRsrc1Vs >> 3) & 0x78) + 8 - firstFreeSgpr) >> 2 == 0;
vsharpSlotCount = ((((spiShaderPgmRsrc1Vs >> 3) & 0x78) + 8 - firstFreeSgpr) >> 2) + 2 * v10;
// v12 keeps bits 24..25 of RSRC1; v13 is a 4-bit value (v10 plus bits 6 and up
// of RSRC1) placed at bit 6.
v12 = spiShaderPgmRsrc1Vs & 0x3000000;
v13 = (((_BYTE)v10 + (unsigned __int8)(pregs->m_spiShaderPgmRsrc1Vs >> 6)) & 0xF) << 6;
// `- ((vertexBaseUserSgpr < 1u) - 1)` adds 1 when vertexBaseUserSgpr is non-zero.
fsbs->m_fetchShaderBufferSize = 4
* ((numInputs + vsharpSlotCount - 1) / vsharpSlotCount
+ (instanceBaseUsgpr != 0)
+ numInputs
- ((vertexBaseUserSgpr < 1u)
- 1))
+ 8 * numInputs
+ 12;
fsbs->m_fetchShaderFlags = 0;
fsbs->m_firstFreeSgpr = firstFreeSgpr;
fsbs->m_vsharpSlotCount = vsharpSlotCount;
fsbs->m_fetchShaderInstancingData = instancingData;
result = 0;
noInstance = instancingData == 0LL;
if (instancingData)
result = numElementsInInstancingData;
// With instancing data both bits 24..25 are forced on in the shader modifier;
// without it the bits already present in RSRC1 (v12) are kept.
v16 = 0x3000000;
if (noInstance)
v16 = v12;
fsbs->m_numElementsInInstancingData = result;
fsbs->m_vertexBaseUsgpr = vertexBaseUserSgpr;
fsbs->m_instanceBaseUsgpr = instanceBaseUsgpr;
fsbs->m_reserved1 = 0;
fsbs->m_reserved2 = 0;
fsbs->m_reserved0 = 0;
fsbs->m_shaderModifier = v13 | v16;
return result;
// Emits the fetch shader described by *fsbs as a sequence of 32-bit words into
// the buffer `fs`.
// The body is decompiler output: the local names vN and the trailing
// register/stack annotations come from the decompiler.
//
//   fs   - destination buffer for the generated words. The caller must size it;
//          presumably from fsbs->m_fetchShaderBufferSize - confirm.
//   fsbs - build state; the input usage slots, input semantics, optional
//          semantic remap table, optional instancing data and the base user
//          SGPRs are read from it.
//
// The only return statement visible here returns 0.
//
// NOTE(review): the hexadecimal constants OR-ed into the output words
// (0x4A000000, 0x4A060600, 0xC0800100, 0xBF8C007F, 0xE0002000, 0xBF8C0000,
// 0xBE802000) are presumably GCN instruction encodings; confirm against the
// ISA documentation before changing any of them.
int generateFetchShader(uint32_t *fs, FetchShaderBuildState *fsbs)
FetchShaderBuildState *fsbs_; // r11
__int64 *v3; // rsi
char vbtStartRegister; // r14
uint16_t v5; // ax
const InputUsageSlot *inputUsageSlot; // rcx
unsigned __int64 i; // rdx
signed __int64 idx; // r9
unsigned __int64 inputSemaCount; // rdi
uint8_t vsharpSlotCount; // bl
_DWORD *pEleNumInRemapTable; // r15
__int64 v12; // rdx
unsigned int k; // er13
int shiftVbtStartRegister; // er14
unsigned __int64 m; // r10
unsigned int n; // er8
const uint32_t *semanticsRemapTable; // rbx
int v18; // er12
int semanticIdx; // edx
__int64 j; // rax
unsigned int firstFreeSgprAndVsharp; // eax
char v22; // cl
int v23; // edx
__int64 v24; // rcx
__int64 v25; // rax
unsigned int v26; // esi
__int64 v27; // rcx
const FetchShaderInstancingMode *fetchShaderInstancingData; // r12
const VertexInputSemantic *inputSemantics; // rdx
__int64 t; // r14
int v31; // er11
signed __int64 pSizeInElements; // r8
unsigned int v33; // edx
unsigned int v34; // er13
unsigned int v35; // er15
int vgpr; // edi
int sizeInElmt; // esi
unsigned int v38; // ebx
unsigned int v39; // ecx
__int64 v40; // rbx
__int64 v41; // rdi
__int64 v42; // rcx
__int64 v43; // rax
__int64 v44; // r9
unsigned __int64 numElementsInInstancingData; // [rsp+8h] [rbp-98h]
__int64 v47; // [rsp+10h] [rbp-90h]
signed __int64 v48; // [rsp+18h] [rbp-88h]
__int64 v49; // [rsp+20h] [rbp-80h]
FetchShaderBuildState *fsbs_bak; // [rsp+28h] [rbp-78h]
unsigned __int64 numInputSemantics; // [rsp+30h] [rbp-70h]
signed __int64 pEleNumInRemapTable_; // [rsp+38h] [rbp-68h]
int v53; // [rsp+44h] [rbp-5Ch]
int v54; // [rsp+48h] [rbp-58h]
int shiftVbtStartRegister_; // [rsp+4Ch] [rbp-54h]
uint32_t *fs_dst; // [rsp+50h] [rbp-50h]
uint8_t vsharpSlotCount_; // [rsp+5Fh] [rbp-41h]
__int64 v58; // [rsp+60h] [rbp-40h]
int v59; // [rsp+68h] [rbp-38h]
int v60; // [rsp+6Ch] [rbp-34h]
__int64 v61; // [rsp+70h] [rbp-30h]
fsbs_ = fsbs;
//v3 = (__int64 *)loc_96B;
fs_dst = fs;
vbtStartRegister = -1;
//v61 = *loc_96B;
v5 = fsbs_->m_fetchShaderFlags;
// v58, v59 and v60 are later read as one byte table indexed by the per-input
// instancing mode (see the `(unsigned __int8 *)&v58 + 4 * v35` access below).
// NOTE(review): that access relies on the three variables being adjacent in
// memory, as they were on the decompiled stack frame - confirm.
v58 = 0x300000000LL;
v59 = (v5 & 1) + 1;
v60 = 2;
// Look up the start register of the vertex buffer table pointer among the
// input usage slots; vbtStartRegister stays -1 when no such slot is found.
if (fsbs_->m_numInputUsageSlots)
inputUsageSlot = fsbs_->m_inputUsageSlots;
vbtStartRegister = -1;
i = 0LL;
if (inputUsageSlot[i].m_usageType == kShaderInputUsagePtrVertexBufferTable)
vbtStartRegister = inputUsageSlot[i].m_startRegister;
} while (i < fsbs_->m_numInputUsageSlots);
idx = 0LL;
// One word is written for each of the two base user SGPRs that is non-zero.
if (fsbs_->m_vertexBaseUsgpr)
fs[idx] = fsbs_->m_vertexBaseUsgpr | 0x4A000000;
if (fsbs_->m_instanceBaseUsgpr)
fs[idx] = fsbs_->m_instanceBaseUsgpr | 0x4A060600;
inputSemaCount = fsbs_->m_numInputSemantics;
if (fsbs_->m_numInputSemantics)
vsharpSlotCount = fsbs_->m_vsharpSlotCount;
pEleNumInRemapTable = &fsbs_->m_numElementsInRemapTable;
v12 = 0LL;
k = 0;
fsbs_bak = fsbs_;
numInputSemantics = fsbs_->m_numInputSemantics;
shiftVbtStartRegister = (vbtStartRegister & 0x7E) << 8;
pEleNumInRemapTable_ = (signed __int64)&fsbs_->m_numElementsInRemapTable;
shiftVbtStartRegister_ = shiftVbtStartRegister;
vsharpSlotCount_ = fsbs_->m_vsharpSlotCount;
if (vsharpSlotCount)
m = (unsigned int)v12;
n = 0;
while (1)
v12 = (unsigned int)m;
if (m >= inputSemaCount)
semanticsRemapTable = fsbs_->m_semanticsRemapTable;
v18 = m;
semanticIdx = fsbs_->m_inputSemantics[m].m_semantic;
// When a remap table is present, the semantic is replaced by the index of the
// matching remap table entry.
if (semanticsRemapTable && *pEleNumInRemapTable)
j = 0LL;
if (semanticsRemapTable[j] == semanticIdx)
} while ((unsigned int)j < *pEleNumInRemapTable);
LOBYTE(semanticIdx) = j;
// Reads m_firstFreeSgpr together with the byte that follows it as one 16-bit
// value: the low byte is the first free SGPR, the high byte is compared against
// the per-batch counter n below.
firstFreeSgprAndVsharp = *(unsigned __int16 *)&fsbs_->m_firstFreeSgpr;
v22 = firstFreeSgprAndVsharp + 4 * n++;
v23 = ((v22 & 0x7F) << 15) | shiftVbtStartRegister | 4 * (semanticIdx & 0x3F);
v24 = (unsigned int)idx;
fs_dst[v24] = v23 | 0xC0800100;
if (n >= firstFreeSgprAndVsharp >> 8)
v12 = (unsigned int)(v18 + 1);
vsharpSlotCount = vsharpSlotCount_;
v25 = (unsigned int)idx;
fs_dst[v25] = 0xBF8C007F;
if ((unsigned int)v12 > k)
v26 = *(_DWORD *)&fsbs_->m_firstFreeSgpr;
v27 = v12;
fetchShaderInstancingData = fsbs_->m_fetchShaderInstancingData;
inputSemantics = fsbs_->m_inputSemantics;
t = 0LL;
v49 = v27;
v53 = 2 * k;
v54 = 2 * v27;
v47 = k;
v31 = k - v27;
pSizeInElements = (signed __int64)&inputSemantics[k].m_sizeInElements;
v33 = idx;
numElementsInInstancingData = v26 >> 16;
v48 = (signed __int64)&fetchShaderInstancingData[k];
v34 = (v26 & 0xFFFFFFFC) << 14;
v35 = 0;
if (fetchShaderInstancingData && v47 + t < numElementsInInstancingData)
v35 = *(_DWORD *)(v48 + 4 * t);
// The byte just before m_sizeInElements is read as the destination VGPR and
// m_sizeInElements itself as the element count; elements are stepped in
// 4-byte strides.
vgpr = *(unsigned __int8 *)(pSizeInElements + 4 * t - 1);
sizeInElmt = *(unsigned __int8 *)(pSizeInElements + 4 * t);
v38 = v34;
v34 += 0x10000;
v39 = (v38 & 0x1F0000) + (*((unsigned __int8 *)&v58 + 4 * v35) | (vgpr << 8)) + 0x80000000;
// Two words are written per input: v40 indexes the first, v41 the second.
v40 = v33;
v41 = v33 + 1;
v33 += 2;
fs_dst[v40] = ((sizeInElmt << 18) + 0x1FC0000) & 0x1FC0000 | 0xE0002000;
fs_dst[v41] = v39;
} while (v31 + (_DWORD)t);
v12 = v49;
fsbs_ = fsbs_bak;
shiftVbtStartRegister = shiftVbtStartRegister_;
inputSemaCount = numInputSemantics;
pEleNumInRemapTable = (_DWORD *)pEleNumInRemapTable_;
vsharpSlotCount = vsharpSlotCount_;
LODWORD(idx) = v54 + idx - v53;
k = v49;
} while (k < (unsigned int)inputSemaCount);
//v3 = (__int64 *)loc_C1E;
pEleNumInRemapTable = &fsbs_->m_numElementsInRemapTable;
// Trailer: two fixed words followed by one count word. The count is the remap
// table element count when that is non-zero, otherwise the input semantic count.
v42 = (unsigned int)idx;
v43 = (unsigned int)(idx + 1);
v44 = (unsigned int)(idx + 2);
fs_dst[v42] = 0xBF8C0000;
fs_dst[v43] = 0xBE802000;
if (*pEleNumInRemapTable)
LODWORD(inputSemaCount) = *pEleNumInRemapTable;
fs_dst[v44] = inputSemaCount;
return 0;
// Reports the GPU mode assumed by this tool.
// The value is hard-coded to 1; generatePsShaderUsageTable() compares the
// result against 1 before it honours PixelInputSemantic::m_interpF16.
int getGpuMode()
{
	const int gpuMode = 1;
	return gpuMode;
}
// Builds one 32-bit entry per pixel shader input in inputTable by pairing each
// PixelInputSemantic with the VertexExportSemantic that has the same
// m_semantic value.
// The body is decompiler output: the local names vN and the trailing
// register/stack annotations come from the decompiler.
//
//   inputTable      - output array, indexed like psTable; presumably it must
//                     hold psTableNumItem entries - confirm.
//   vsTable         - vertex shader export semantics searched for a match.
//   vsTableNumItems - number of entries in vsTable.
//   psTable         - pixel shader input semantics.
//   psTableNumItem  - number of entries in psTable; nothing is done when 0.
//
// Two encodings are produced: one when getGpuMode() returns 1 and the input has
// a non-zero m_interpF16, and another for every other input.
//
// NOTE(review): both semantic structures are accessed through raw
// `*(_WORD *)&...` and `*((_BYTE *)&...) + 1` reads, so the result depends on
// the exact bit-field layout of PixelInputSemantic and VertexExportSemantic.
void generatePsShaderUsageTable(uint32_t *inputTable, const VertexExportSemantic *vsTable, uint32_t vsTableNumItems, const PixelInputSemantic *psTable, uint32_t psTableNumItem)
const PixelInputSemantic *psTab; // r12
uint32_t numVsExportSemantics; // ebx
__int64 idx; // r14
int v8; // ecx
char v9; // r8
__int64 v10; // rax
signed __int64 v11; // rax
unsigned __int64 v12; // rax
unsigned __int64 v13; // rcx
unsigned __int64 v14; // rdx
unsigned __int64 v15; // rax
__int64 matchVsIdx; // r15
char hasMatchedVsSema; // r13
const PixelInputSemantic *psTab_; // r8
unsigned int v19; // edx
unsigned int v20; // ecx
uint32_t numPsInputSemantics; // [rsp+4h] [rbp-3Ch]
const VertexExportSemantic *vsTab; // [rsp+10h] [rbp-30h]
psTab = psTable;
numVsExportSemantics = vsTableNumItems;
numPsInputSemantics = psTableNumItem;
if (psTableNumItem)
idx = 0LL;
vsTab = vsTable;
LODWORD(matchVsIdx) = 0;
hasMatchedVsSema = 0;
// Search the VS export table for the entry whose semantic equals the current
// PS input semantic; matchVsIdx is the index of that entry when one is found.
if (numVsExportSemantics)
matchVsIdx = 0LL;
while (vsTable[matchVsIdx].m_semantic != psTab[idx].m_semantic)
if ((unsigned int)++matchVsIdx >= numVsExportSemantics)
hasMatchedVsSema = 0;
goto LABEL_14;
hasMatchedVsSema = 1;
// Encoding used when getGpuMode() is 1 and the input requests 16-bit
// interpolation (m_interpF16 != 0).
if (getGpuMode() == 1 && psTab[idx].m_interpF16)
if (hasMatchedVsSema)
vsTable = vsTab;
// v9: low five bits of byte 1 of the matched VS entry.
// v8: m_interpF16 with the bits cleared that are set in the top two bits of
// that same byte.
v9 = *((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) & 0x1F;
v8 = (unsigned __int16)(psTab[idx].m_interpF16) & ~(unsigned __int8)(*((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) >> 6);
vsTable = vsTab;
v8 = 0;
v9 = 0x20;
inputTable[idx] = 0x80000;
v10 = (*(_WORD *)&psTab[idx] << 10) & 0x1000000;
inputTable[idx] = v10 + 0x80000;
v11 = ((*(_WORD *)&psTab[idx] << 10) & 0x2000000) + v10 + 0x80000;
inputTable[idx] = v11;
if (*(_WORD *)&psTab[idx] < 0xC000u)
v15 = ((unsigned __int8)v9 | (unsigned __int8)(32 * (v8 != 0))) & 0x3F | (unsigned __int64)v11;
inputTable[idx] = v15;
v14 = v15 | (((*(_WORD *)&psTab[idx] >> (4 * v8 & 4 ^ 0xC)) & 3LL) << 8);
v12 = ((unsigned __int8)v9 | (unsigned __int8)(32 * v8)) & 0x3F | (unsigned __int64)v11;
inputTable[idx] = v12;
v13 = v12 | ((unsigned __int64)(*((_BYTE *)&psTab[idx] + 1) & 3) << 8) | (v8 << 19) & 0x100000;
inputTable[idx] = v13;
v14 = v13 & 0xFFFFFFFFFF9FFFFFLL | (*(_WORD *)&psTab[idx] << 9) & 0x600000;
inputTable[idx] = v14;
// Encoding used for every other input.
inputTable[idx] = 0;
if (hasMatchedVsSema)
vsTable = vsTab;
psTab_ = &psTab[idx];
v19 = *((_BYTE *)&vsTab[(unsigned int)matchVsIdx] + 1) & 0x1F | ((unsigned int)*(_WORD *)&psTab[idx] >> 7) & 0x20;
inputTable[idx] = v19;
v20 = v19 | (((((unsigned int)*(_WORD *)&psTab[idx] >> 10) | ((unsigned int)*(_WORD *)&psTab[idx] >> 12)) & 1) << 10);
vsTable = vsTab;
psTab_ = &psTab[idx];
v20 = 32;
inputTable[idx] = v20;
// Bits 8..9 of the entry are replaced by the low two bits of byte 1 of the
// PS input semantic.
inputTable[idx] = v20 & 0xFFFFFCFF | ((*((_BYTE *)psTab_ + 1) & 3) << 8);
} while ((_DWORD)idx != numPsInputSemantics);
// V# Buffer Descriptor
// 128 bits
//
// Declared widths:
//   first  64 bits: 44 + 2 + 2 + 14 + 1 + 1 (uint64_t bit-fields)
//   next   32 bits: num_records (plain uint32_t)
//   last   32 bits: 3+3+3+3 + 3 + 4 + 2 + 2 + 1 + 1 + 1 + 1 + 3 + 2 (uint32_t bit-fields)
// Bit-field placement is implementation-defined; the static_assert below only
// pins the total size that the "128 bits" contract relies on.
struct VSharpBuffer
{
    uint64_t base : 44;          // base byte address (only 40 bits supported)
    uint64_t mtype_L1s : 2;      // mtype for scalar L1
    uint64_t mtype_L2 : 2;       // mtype for L2
    uint64_t stride : 14;        // bytes: 0..16383
    uint64_t cache_swizzle : 1;  // buffer access. optionally swizzle TC L1 cache banks
    uint64_t swizzle_en : 1;     // swizzle AOS according to stride, index_stride, and element_size, else linear (stride * index + offset)

    uint32_t num_records;        // in units of 'stride'

    // Destination channel select:
    // 0=0, 1=1, 4=R, 5=G, 6=B, 7=A
    uint32_t dst_sel_x : 3;
    uint32_t dst_sel_y : 3;
    uint32_t dst_sel_z : 3;
    uint32_t dst_sel_w : 3;

    uint32_t nfmt : 3;           // numeric data type (float, int, ...)
    uint32_t dfmt : 4;           // # of fields, size of each field. Note: dfmt=0 (invalid) is a special case that will disable buffer access via vector memory ops.
    uint32_t element_size : 2;   // 2, 4, 8, or 16 bytes. Used for swizzled buffer addressing
    uint32_t index_stride : 2;   // 8, 16, 32, or 64. Used for swizzled buffer addressing
    uint32_t addtid_en : 1;      // add thread id to the index for addr calc
    uint32_t reserved0 : 1;
    uint32_t hash_en : 1;        // 1 = buffer addresses are hashed for better cache perf
    uint32_t reserved1 : 1;
    uint32_t mtype : 3;          // mtype for L1
    uint32_t type : 2;           // value == 0 for buf. Overlaps upper 2 bits of 4-bit TYPE field in 128-bit T# resource
};

static_assert(sizeof(VSharpBuffer) == 16, "VSharpBuffer must be exactly 128 bits");

typedef VSharpBuffer GnmBuffer;
// T# Texture Descriptor Buffer
// 256 bits
//
// Declared widths, grouped into four 64-bit units:
//   unit 0: 38 + 2 + 12 + 6 + 4 + 2
//   unit 1: 14 + 14 + 3 + 1 + 3+3+3+3 + 4 + 4 + 5 + 1 + 1 + 1 + 4
//   unit 2: 13 + 14 + 5 + 13 + 13 + 6
//   unit 3: 12 + 8 + 1 + 43
// Bit-field placement is implementation-defined; the static_assert below only
// pins the total size that the "256 bits" contract relies on.
struct TSharpBuffer
{
    uint64_t baseaddr256 : 38;  // base 256-byte aligned address bits [39:8] (top 6 bits are not used)
    uint64_t mtype_L2 : 2;      // mtype for L2
    uint64_t min_lod : 12;      // fixed point 4.8 minimum LOD (0.0..15.0)
    uint64_t dfmt : 6;          // texture data format; num components, num bits
    uint64_t nfmt : 4;          // texture numeric format; value conversion
    uint64_t mtype_lsbs : 2;    // mtype for L1 (LSBs)

    uint64_t width : 14;        // texture width (0..16383)
    uint64_t height : 14;       // texture height (0..16383)

    // Specifies the scale factor applied to the perf_z, perf_mip,
    // aniso_bias, aniso_threshold, lod_bias_sec settings
    // specified in the associated S#:
    // 0=0/16, 1=2/16, 2=5/16, 3=7/16, 4=9/16, 5=11/16, 6=14/16, 7=16/16
    // The result after scaling is rounded down to the nearest
    // representable value for the given S# field.
    // (Note that perf_mod=0 effectively disables these S#
    // settings, while perf_mod=7 essentially eliminates the
    // dependency between the T# and S#.)
    uint64_t perf_mod : 3;
    uint64_t interlaced : 1;    // texture is interlaced

    // Destination channel select:
    // 0=0, 1=1, 4=R, 5=G, 6=B, 7=A
    uint64_t dst_sel_x : 3;
    uint64_t dst_sel_y : 3;
    uint64_t dst_sel_z : 3;
    uint64_t dst_sel_w : 3;

    uint64_t base_level : 4;
    uint64_t last_level : 4;
    uint64_t tiling_idx : 5;
    uint64_t pow2pad : 1;
    uint64_t mtype_msb : 1;
    uint64_t reserved0 : 1;
    uint64_t type : 4;

    uint64_t depth : 13;
    uint64_t pitch : 14;        // texture pitch in texels (0..16383); defaults to width
    uint64_t reserved1 : 5;
    uint64_t base_array : 13;   // first array index (13 bits: 0..8191)
    uint64_t last_array : 13;   // presumably the last array index (13 bits: 0..8191)
    uint64_t reserved2 : 6;

    uint64_t min_lod_warn : 12;
    uint64_t counter_bank_id : 8;
    uint64_t LOD_hdw_cnt_en : 1;
    uint64_t reserved3 : 43;
};

static_assert(sizeof(TSharpBuffer) == 32, "TSharpBuffer must be exactly 256 bits");

typedef TSharpBuffer GnmTexture;
// S# Sampler Descriptor Buffer
// 128 bits
//
// Declared widths, grouped into four 32-bit runs (two 64-bit units):
//   run 0: 3 + 3 + 3 + 3 + 3 + 1 + 3 + 1 + 1 + 6 + 1 + 1 + 2 + 1
//   run 1: 12 + 12 + 4 + 4
//   run 2: 14 + 6 + 2 + 2 + 2 + 2 + 4
//   run 3: 12 + 18 + 2
// Bit-field placement is implementation-defined; the static_assert below only
// pins the total size that the "128 bits" contract relies on.
struct SSharpBuffer
{
    uint64_t clamp_x : 3;
    uint64_t clamp_y : 3;
    uint64_t clamp_z : 3;
    uint64_t max_aniso_ratio : 3;
    uint64_t depth_compare_func : 3;
    uint64_t force_unorm_coords : 1;
    uint64_t aniso_threshold : 3;
    uint64_t mc_coord_trunc : 1;
    uint64_t force_degamma : 1;
    uint64_t aniso_bias : 6;
    uint64_t trunc_coord : 1;
    uint64_t disable_cube_wrap : 1;
    uint64_t filter_mode : 2;
    uint64_t reserved0 : 1;

    uint64_t min_lod : 12;
    uint64_t max_lod : 12;
    uint64_t perf_mip : 4;
    uint64_t perf_z : 4;

    uint64_t lod_bias : 14;
    uint64_t lod_bias_sec : 6;
    uint64_t xy_mag_filter : 2;
    uint64_t xy_min_filter : 2;
    uint64_t z_filter : 2;
    uint64_t mip_filter : 2;
    uint64_t reserved1 : 4;

    uint64_t border_color_ptr : 12;
    uint64_t reserved2 : 18;
    uint64_t border_color_type : 2;
};

static_assert(sizeof(SSharpBuffer) == 16, "SSharpBuffer must be exactly 128 bits");
int main(void)
ShaderInfo shaderInfo;
parseShader(&shaderInfo, s_vex_vv);
void *shaderBinary = malloc(shaderInfo.m_gpuShaderCodeSize);
void *shaderHeader = malloc(shaderInfo.m_vsShader->computeSize());
memcpy(shaderBinary, shaderInfo.m_gpuShaderCode, shaderInfo.m_gpuShaderCodeSize);
memcpy(shaderHeader, shaderInfo.m_vsShader, shaderInfo.m_vsShader->computeSize());
ShaderBinaryInfo* binInfo = findShaderBinInfo((uint8_t*)shaderBinary);
InputResourceOffsets table;
generateInputResourceOffsetTable(&table, binInfo);
size_t codeLength = binInfo->m_length + sizeof(ShaderBinaryInfo);
VsShader* m_shader = static_cast<VsShader*>(shaderHeader);
VsShader* vertexShader = m_shader;
FetchShaderBuildState fb = { 0 };
generateVsFetchShaderBuildState(&fb, (const VsStageRegisters*)&vertexShader->m_vsStageRegisters, vertexShader->m_numInputSemantics, nullptr, 0, fb.m_vertexBaseUsgpr, fb.m_instanceBaseUsgpr);
const InputUsageSlot *inputUsageSlots = vertexShader->getInputUsageSlotTable();
fb.m_numInputSemantics = vertexShader->m_numInputSemantics;
fb.m_inputSemantics = vertexShader->getInputSemanticTable();
fb.m_numInputUsageSlots = vertexShader->m_common.m_numInputUsageSlots;
fb.m_inputUsageSlots = inputUsageSlots;
fb.m_numElementsInRemapTable = 0;
fb.m_semanticsRemapTable = 0;
uint32_t *fs = (uint32_t *)malloc(fb.m_fetchShaderBufferSize);
generateFetchShader(fs, &fb);
std::ofstream fout("fetch_shader.bin");
fout.write((char*)fs, fb.m_fetchShaderBufferSize);
const InputUsageSlot* slot = m_shader->getInputUsageSlotTable();
const VertexInputSemantic* inputSema = m_shader->getInputSemanticTable();
const VertexExportSemantic* expSema = m_shader->getExportSemanticTable();
//ShaderInfo psShaderInfo;
//parseShader(&psShaderInfo, s_pix_p);
//void *psshaderBinary = malloc(psShaderInfo.m_gpuShaderCodeSize);
//void *psshaderHeader = malloc(psShaderInfo.m_psShader->computeSize());
//memcpy(psshaderBinary, psShaderInfo.m_gpuS haderCode, psShaderInfo.m_gpuShaderCodeSize);
//memcpy(psshaderHeader, psShaderInfo.m_psShader, psShaderInfo.m_psShader->computeSize());
//ShaderBinaryInfo* psbinInfo = findShaderBinInfo((uint8_t*)psshaderBinary);
//InputResourceOffsets pstable;
//generateInputResourceOffsetTable(&pstable, psbinInfo);
//size_t pscodeLength = psbinInfo->m_length + sizeof(ShaderBinaryInfo);
//PsShader* m_psshader = static_cast<PsShader*>(psshaderHeader);
//uint32_t psInputs[32] = {0};
// vertexShader->getExportSemanticTable(), vertexShader->m_numExportSemantics,
// m_psshader->getPixelInputSemanticTable(), m_psshader->m_numInputSemantics);
return binInfo->m_isSrt;