gpu: added CLUT cache implementation

2024-05-20 12:57:38 -04:00 · 2020-05-10 16:06:37 +02:00 · 2020-05-10 16:06:37 +02:00 · 93580f34c1
parent d35f1c612b
commit 93580f34c1
9 changed files with 67 additions and 23 deletions
--- a/.gitignore
+++ b/.gitignore
@ -62,6 +62,7 @@ spu.bin
 .vs/
 imgui.ini
 /vram.png
 todo.txt
 config.json
--- a/src/device/gpu/render/texture_utils.cpp
+++ b/src/device/gpu/render/texture_utils.cpp
@ -1,4 +1,4 @@
-#include "texture_utils.h"
+#include "color_depth.h"
 ColorDepth bitsToDepth(int bits) {
    switch (bits) {
--- a/src/device/gpu/color_depth.h
+++ b/src/device/gpu/color_depth.h
@ -0,0 +1,17 @@
 #pragma once
 // Texture colour depths used by the GPU code; NONE is what an unrecognised bit count maps to.
 enum class ColorDepth { NONE, BIT_4, BIT_8, BIT_16 };
 // Run-time overload taking the bit count as an argument (declared here, defined elsewhere).
 ColorDepth bitsToDepth(int bits);
 // Compile-time overload: 4, 8 and 16 select the matching depth, every other value yields NONE.
 template <int bits>
 constexpr ColorDepth bitsToDepth() {
    switch (bits) {
        case 4:
            return ColorDepth::BIT_4;
        case 8:
            return ColorDepth::BIT_8;
        case 16:
            return ColorDepth::BIT_16;
        default:
            return ColorDepth::NONE;
    }
 }
--- a/src/device/gpu/gpu.cpp
+++ b/src/device/gpu/gpu.cpp
@ -49,6 +49,8 @@ void GPU::reset() {
    drawingOffsetY = 0;
    gp0_e6._reg = 0;
    clutCachePos = ivec2(-1, -1);
 }
 void GPU::drawTriangle(const primitive::Triangle& triangle) {
@ -607,6 +609,7 @@ void GPU::writeGP0(uint32_t data) {
            }
        } else if (command == 0x01) {
            // Clear Cache
            clutCachePos = ivec2(-1, -1);
        } else if (command == 0x02) {
            // Fill rectangle
            cmd = Command::FillRectangle;
--- a/src/device/gpu/gpu.h
+++ b/src/device/gpu/gpu.h
@ -1,6 +1,7 @@
 #pragma once
 #include <array>
 #include <vector>
 #include "color_depth.h"
 #include "primitive.h"
 #include "psx_color.h"
 #include "registers.h"
@ -98,6 +99,11 @@ class GPU {
    std::array<uint16_t, VRAM_WIDTH * VRAM_HEIGHT> vram{};
    // TODO: Serialize?
    std::array<uint16_t, 256> clutCache{};
    ivec2 clutCachePos{-1, -1};
    ColorDepth clutCacheColorDepth = ColorDepth::NONE;
   private:
    // Hardware rendering
    std::vector<Vertex> vertices;
--- a/src/device/gpu/render/render_rectangle.cpp
+++ b/src/device/gpu/render/render_rectangle.cpp
@ -44,6 +44,8 @@ INLINE void rasterizeRectangle(gpu::GPU* gpu, const primitive::Rect& rect) {
        vStep = -1;
    }
    loadClutCacheIfRequired<bits>(gpu, rect.clut);
    int x, y, u, v;
    for (y = min.y, v = uv.y; y <= max.y; y++, v += vStep) {
        for (x = min.x, u = uv.x; x <= max.x; x++, u += uStep) {
@ -57,7 +59,7 @@ INLINE void rasterizeRectangle(gpu::GPU* gpu, const primitive::Rect& rect) {
                c = PSXColor(rect.color.r, rect.color.g, rect.color.b);
            } else {
                const ivec2 texel = maskTexel(ivec2(u, v), textureWindow);
-                c = fetchTex<bits>(gpu, texel, rect.texpage, rect.clut);
+                c = fetchTex<bits>(gpu, texel, rect.texpage);
                if (c.raw == 0x0000) continue;
                if constexpr (isBlended) {
--- a/src/device/gpu/render/render_triangle.cpp
+++ b/src/device/gpu/render/render_triangle.cpp
@ -200,6 +200,8 @@ void rasterizeTriangle(gpu::GPU* gpu, const primitive::Triangle& triangle) {
    const int area = orient2d(pos[0], pos[1], pos[2]);
    if (area == 0) return;
    loadClutCacheIfRequired<bits>(gpu, triangle.clut);
    ivec2 min(                                     //
        std::min({pos[0].x, pos[1].x, pos[2].x}),  //
        std::min({pos[0].y, pos[1].y, pos[2].y})   //
@ -278,7 +280,7 @@ void rasterizeTriangle(gpu::GPU* gpu, const primitive::Triangle& triangle) {
                } else {
                    const ivec2 uv(FROM_FP(attrib.u), FROM_FP(attrib.v));
                    const ivec2 texel = maskTexel(uv, textureWindow);
-                    c = fetchTex<bits>(gpu, texel, triangle.texpage, triangle.clut);
+                    c = fetchTex<bits>(gpu, texel, triangle.texpage);
                    if (c.raw == 0x0000) goto DONE;
                    if constexpr (isBlended) {
--- a/src/device/gpu/render/texture_utils.h
+++ b/src/device/gpu/render/texture_utils.h
@ -1,48 +1,55 @@
 #pragma once
 #include "device/gpu/gpu.h"
 #include "utils/macros.h"
 #include "../color_depth.h"
 #include "../primitive.h"
 #define gpuVRAM ((uint16_t(*)[gpu::VRAM_WIDTH])gpu->vram.data())
-enum class ColorDepth { NONE, BIT_4, BIT_8, BIT_16 };
 // Refreshes gpu->clutCache from VRAM when the cached palette cannot be reused.
 //   bits - texture colour depth of the primitive about to be drawn (template argument)
 //   gpu  - GPU whose clutCache / clutCachePos / clutCacheColorDepth are read and updated
 //   clut - VRAM position (x, y) of the first palette entry
 // Returns immediately for depths other than BIT_4 / BIT_8. Otherwise the cache is
 // reloaded when the requested depth compares greater than the cached depth or when
 // `clut` differs from gpu->clutCachePos; 256 entries are copied for BIT_8, 16 for BIT_4.
 // NOTE(review): the copy indexes VRAM with clut.x + i without masking to the VRAM
 // width, unlike the texel fetch helpers — confirm callers keep the range in bounds.
+template <ColorDepth bits>
 void loadClutCacheIfRequired(gpu::GPU* gpu, ivec2 clut) {
    // Only paletted textures should reload the color look-up table cache
    if constexpr (bits != ColorDepth::BIT_4 && bits != ColorDepth::BIT_8) {
        return;
    }
-ColorDepth bitsToDepth(int bits);
+    bool textureFormatRequireReload = bits > gpu->clutCacheColorDepth;
    bool clutPositionChanged = gpu->clutCachePos != clut;
-template <int bits>
+    if (!textureFormatRequireReload && !clutPositionChanged) {
-constexpr ColorDepth bitsToDepth() {
+        return;
-    if constexpr (bits == 4)
+    }
-        return ColorDepth::BIT_4;
+
-    else if constexpr (bits == 8)
+    gpu->clutCacheColorDepth = bits;
-        return ColorDepth::BIT_8;
+    gpu->clutCachePos = clut;
-    else if constexpr (bits == 16)
+
-        return ColorDepth::BIT_16;
+    constexpr int entries = (bits == ColorDepth::BIT_8) ? 256 : 16;
-    else
+    for (int i = 0; i < entries; i++) {
-        return ColorDepth::NONE;
+        gpu->clutCache[i] = gpuVRAM[clut.y][clut.x + i];
    }
 }
 namespace {
-// Using unsigned vectors allows compiler to generate slightly faster division code
 // Fetches one texel of a 4-bit paletted texture.
 // Four 4-bit palette indices are packed per 16-bit VRAM word: tex.x / 4 selects the word
 // (row and column wrapped with & 511 / & 1023), tex.x & 3 selects the nibble within it.
 // The added version returns the colour from gpu->clutCache instead of reading the
 // palette from VRAM at `clut`.
+INLINE uint16_t tex4bit(gpu::GPU* gpu, ivec2 tex, ivec2 texPage) {
 INLINE uint16_t tex4bit(gpu::GPU* gpu, ivec2 tex, ivec2 texPage, ivec2 clut) {
    uint16_t index = gpuVRAM[(texPage.y + tex.y) & 511][(texPage.x + tex.x / 4) & 1023];
    uint8_t entry = (index >> ((tex.x & 3) * 4)) & 0xf;
-    return gpuVRAM[clut.y][clut.x + entry];
+    return gpu->clutCache[entry];
 }
 // Fetches one texel of an 8-bit paletted texture.
 // Two 8-bit palette indices are packed per 16-bit VRAM word: tex.x / 2 selects the word
 // (row and column wrapped with & 511 / & 1023), tex.x & 1 selects the low or high byte.
 // The added version returns the colour from gpu->clutCache instead of reading the
 // palette from VRAM at `clut`.
-INLINE uint16_t tex8bit(gpu::GPU* gpu, ivec2 tex, ivec2 texPage, ivec2 clut) {
+INLINE uint16_t tex8bit(gpu::GPU* gpu, ivec2 tex, ivec2 texPage) {
    uint16_t index = gpuVRAM[(texPage.y + tex.y) & 511][(texPage.x + tex.x / 2) & 1023];
    uint8_t entry = (index >> ((tex.x & 1) * 8)) & 0xff;
-    return gpuVRAM[clut.y][clut.x + entry];
+    return gpu->clutCache[entry];
 }
 // Fetches one texel of a 16-bit texture: the VRAM word at texPage + tex is returned
 // as-is, with the row wrapped by & 511 and the column by & 1023. No palette is involved.
 INLINE uint16_t tex16bit(gpu::GPU* gpu, ivec2 tex, ivec2 texPage) {
    const auto row = (texPage.y + tex.y) & 511;
    const auto column = (texPage.x + tex.x) & 1023;
    return gpuVRAM[row][column];
 }
 template <ColorDepth bits>
-INLINE PSXColor fetchTex(gpu::GPU* gpu, ivec2 texel, const ivec2 texPage, const ivec2 clut) {
+INLINE PSXColor fetchTex(gpu::GPU* gpu, ivec2 texel, const ivec2 texPage) {
    if constexpr (bits == ColorDepth::BIT_4) {
-        return tex4bit(gpu, texel, texPage, clut);
+        return tex4bit(gpu, texel, texPage);
    } else if constexpr (bits == ColorDepth::BIT_8) {
-        return tex8bit(gpu, texel, texPage, clut);
+        return tex8bit(gpu, texel, texPage);
    } else if constexpr (bits == ColorDepth::BIT_16) {
        return tex16bit(gpu, texel, texPage);
    } else {
--- a/src/utils/vector.h
+++ b/src/utils/vector.h
@ -7,6 +7,8 @@ struct ivec2 {
    ivec2 operator-(const ivec2& b) const { return {x - b.x, y - b.y}; }
    ivec2 operator+(const ivec2& b) const { return {x + b.x, y + b.y}; }
    bool operator==(const ivec2& b) const { return x == b.x && y == b.y; }
    bool operator!=(const ivec2& b) const { return x != b.x || y != b.y; }
 };
 struct ivec3 {
@ -15,6 +17,8 @@ struct ivec3 {
    ivec3 operator-(const ivec3& b) const { return {x - b.x, y - b.y, z - b.z}; }
    // Component-wise sum; z is added like x and y (it was previously subtracted).
    ivec3 operator+(const ivec3& b) const { return {x + b.x, y + b.y, z + b.z}; }
    bool operator==(const ivec3& b) const { return x == b.x && y == b.y && z == b.z; }
    bool operator!=(const ivec3& b) const { return x != b.x || y != b.y || z != b.z; }
 };
 struct vec2 {
@ -27,6 +31,8 @@ struct vec2 {
    vec2 operator+(const vec2& b) const { return {x + b.x, y + b.y}; }
    vec2 operator*(const float b) const { return {x * b, y * b}; }
    vec2 operator/(const float b) const { return {x / b, y / b}; }
    bool operator==(const vec2& b) const { return x == b.x && y == b.y; }
    bool operator!=(const vec2& b) const { return x != b.x || y != b.y; }
    float length() const;
    static vec2 normalize(const vec2& v);