melonDS/src/GPU3D_Soft.h
Jaklyy 0b85038586 remove accuracy toggle
doesn't seem necessary
2024-05-09 09:45:36 -04:00

542 lines
17 KiB
C++

/*
Copyright 2016-2023 melonDS team
This file is part of melonDS.
melonDS is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
melonDS is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with melonDS. If not, see http://www.gnu.org/licenses/.
*/
#pragma once
#include "GPU.h"
#include "GPU3D.h"
#include "Platform.h"
#include <thread>
#include <atomic>
namespace melonDS
{
class SoftRenderer : public Renderer3D
{
public:
SoftRenderer(bool threaded = false) noexcept;
~SoftRenderer() override;
void Reset(GPU& gpu) override;
void SetThreaded(bool threaded, GPU& gpu) noexcept;
[[nodiscard]] bool IsThreaded() const noexcept { return Threaded; }
void VCount144(GPU& gpu) override;
void RenderFrame(GPU& gpu) override;
void RestartFrame(GPU& gpu) override;
u32* GetLine(int line) override;
void ScanlineSync(int line) override;
void SetupRenderThread(GPU& gpu);
void EnableRenderThread();
void StopRenderThread();
private:
friend void GPU3D::DoSavestate(Savestate* file) noexcept;
// Notes on the interpolator:
//
// This is a theory on how the DS hardware interpolates values. It matches hardware output
// in the tests I did, but the hardware may be doing it differently. You never know.
//
// Assuming you want to perspective-correctly interpolate a variable named A across two points
// in a typical rasterizer, you would calculate A/W and 1/W at each point, interpolate linearly,
// then divide A/W by 1/W to recover the correct A value.
//
// The DS GPU approximates interpolation by calculating a perspective-correct interpolation
// between 0 and 1, then using the result as a factor to linearly interpolate the actual
// vertex attributes. The factor has 9 bits of precision when interpolating along Y and
// 8 bits along X.
//
// There's a special path for when the two W values are equal: it directly does linear
// interpolation, avoiding precision loss from the aforementioned approximation.
// Which is desirable when using the GPU to draw 2D graphics.
template<int dir>
class Interpolator
{
public:
constexpr Interpolator() {}
constexpr Interpolator(s32 x0, s32 x1, s32 w0, s32 w1)
{
Setup(x0, x1, w0, w1);
}
constexpr void Setup(s32 x0, s32 x1, s32 w0, s32 w1)
{
this->x0 = x0;
this->x1 = x1;
this->xdiff = x1 - x0;
// calculate reciprocal for Z interpolation
// TODO eventually: use a faster reciprocal function?
if (this->xdiff != 0)
this->xrecip_z = (1<<22) / this->xdiff;
else
this->xrecip_z = 0;
// linear mode is used if both W values are equal and have
// low-order bits cleared (0-6 along X, 1-6 along Y)
u32 mask = dir ? 0x7E : 0x7F;
if ((w0 == w1) && !(w0 & mask) && !(w1 & mask))
this->linear = true;
else
this->linear = false;
if (dir)
{
// along Y
if ((w0 & 0x1) && !(w1 & 0x1))
{
this->w0n = w0 - 1;
this->w0d = w0 + 1;
this->w1d = w1;
}
else
{
this->w0n = w0 & 0xFFFE;
this->w0d = w0 & 0xFFFE;
this->w1d = w1 & 0xFFFE;
}
this->shift = 9;
}
else
{
// along X
this->w0n = w0;
this->w0d = w0;
this->w1d = w1;
this->shift = 8;
}
}
constexpr void SetX(s32 x)
{
x -= x0;
this->x = x;
if (xdiff != 0 && !linear)
{
s64 num = ((s64)x * w0n) << shift;
s32 den = (x * w0d) + ((xdiff-x) * w1d);
// this seems to be a proper division on hardware :/
// I haven't been able to find cases that produce imperfect output
if (den == 0) yfactor = 0;
else yfactor = (s32)(num / den);
}
}
constexpr s32 Interpolate(s32 y0, s32 y1) const
{
if (xdiff == 0 || y0 == y1) return y0;
if (!linear)
{
// perspective-correct approx. interpolation
if (y0 < y1)
return y0 + (((y1-y0) * yfactor) >> shift);
else
return y1 + (((y0-y1) * ((1<<shift)-yfactor)) >> shift);
}
else
{
// linear interpolation
if (y0 < y1)
return y0 + (s64)(y1-y0) * x / xdiff;
else
return y1 + (s64)(y0-y1) * (xdiff - x) / xdiff;
}
}
constexpr s32 InterpolateZ(s32 z0, s32 z1, bool wbuffer) const
{
if (xdiff == 0 || z0 == z1) return z0;
if (wbuffer)
{
// W-buffering: perspective-correct approx. interpolation
if (z0 < z1)
return z0 + (((s64)(z1-z0) * yfactor) >> shift);
else
return z1 + (((s64)(z0-z1) * ((1<<shift)-yfactor)) >> shift);
}
else
{
// Z-buffering: linear interpolation
// still doesn't quite match hardware...
s32 base = 0, disp = 0, factor = 0;
if (z0 < z1)
{
base = z0;
disp = z1 - z0;
factor = x;
}
else
{
base = z1;
disp = z0 - z1,
factor = xdiff - x;
}
if (dir)
{
int shift = 0;
while (disp > 0x3FF)
{
disp >>= 1;
shift++;
}
return base + ((((s64)disp * factor * xrecip_z) >> 22) << shift);
}
else
{
disp >>= 9;
return base + (((s64)disp * factor * xrecip_z) >> 13);
}
}
}
private:
s32 x0, x1, xdiff, x;
int shift;
bool linear;
s32 xrecip_z;
s32 w0n, w0d, w1d;
u32 yfactor;
};
template<int side>
class Slope
{
public:
constexpr Slope() {}
constexpr s32 SetupDummy(s32 x0)
{
dx = 0;
this->x0 = x0;
this->xmin = x0;
this->xmax = x0;
Increment = 0;
XMajor = false;
Interp.Setup(0, 0, 0, 0);
Interp.SetX(0);
xcov_incr = 0;
return x0;
}
constexpr s32 Setup(s32 x0, s32 x1, s32 y0, s32 y1, s32 w0, s32 w1, s32 y)
{
this->x0 = x0;
this->y = y;
if (x1 > x0)
{
this->xmin = x0;
this->xmax = x1-1;
this->Negative = false;
}
else if (x1 < x0)
{
this->xmin = x1;
this->xmax = x0-1;
this->Negative = true;
}
else
{
this->xmin = x0;
this->xmax = this->xmin;
this->Negative = false;
}
xlen = xmax+1 - xmin;
ylen = y1 - y0;
// slope increment has a 18-bit fractional part
// note: for some reason, x/y isn't calculated directly,
// instead, 1/y is calculated and then multiplied by x
// TODO: this is still not perfect (see for example x=169 y=33)
if (ylen == 0)
Increment = 0;
else if (ylen == xlen && xlen != 1)
Increment = 0x40000;
else
{
s32 yrecip = (1<<18) / ylen;
Increment = (x1-x0) * yrecip;
if (Increment < 0) Increment = -Increment;
}
XMajor = (Increment > 0x40000);
if constexpr (side)
{
// right
if (XMajor) dx = Negative ? (0x20000 + 0x40000) : (Increment - 0x20000);
else if (Increment != 0) dx = Negative ? 0x40000 : 0;
else dx = 0;
}
else
{
// left
if (XMajor) dx = Negative ? ((Increment - 0x20000) + 0x40000) : 0x20000;
else if (Increment != 0) dx = Negative ? 0x40000 : 0;
else dx = 0;
}
dx += (y - y0) * Increment;
s32 x = XVal();
int interpoffset = (Increment >= 0x40000) && (side ^ Negative);
Interp.Setup(y0-interpoffset, y1-interpoffset, w0, w1);
Interp.SetX(y);
// used for calculating AA coverage
if (XMajor) xcov_incr = (ylen << 10) / xlen;
return x;
}
constexpr s32 Step()
{
dx += Increment;
y++;
s32 x = XVal();
Interp.SetX(y);
return x;
}
constexpr s32 XVal() const
{
s32 ret = 0;
if (Negative) ret = x0 - (dx >> 18);
else ret = x0 + (dx >> 18);
if (ret < xmin) ret = xmin;
else if (ret > xmax) ret = xmax;
return ret;
}
template<bool swapped>
constexpr void EdgeParams_XMajor(s32* length, s32* coverage) const
{
// only do length calc for right side when swapped as it's
// only needed for aa calcs, as actual line spans are broken
if constexpr (!swapped || side)
{
if (side ^ Negative)
*length = (dx >> 18) - ((dx-Increment) >> 18);
else
*length = ((dx+Increment) >> 18) - (dx >> 18);
}
// for X-major edges, we return the coverage
// for the first pixel, and the increment for
// further pixels on the same scanline
s32 startx = dx >> 18;
if (Negative) startx = xlen - startx;
if (side) startx = startx - *length + 1;
s32 startcov = (((startx << 10) + 0x1FF) * ylen) / xlen;
*coverage = (1<<31) | ((startcov & 0x3FF) << 12) | (xcov_incr & 0x3FF);
if constexpr (swapped) *length = 1;
}
template<bool swapped>
constexpr void EdgeParams_YMajor(s32* length, s32* coverage) const
{
*length = 1;
if (Increment == 0)
{
// for some reason vertical edges' aa values
// are inverted too when the edges are swapped
if constexpr (swapped)
*coverage = 0;
else
*coverage = 31;
}
else
{
s32 cov = ((dx >> 9) + (Increment >> 10)) >> 4;
if ((cov >> 5) != (dx >> 18)) cov = 31;
cov &= 0x1F;
if constexpr (swapped)
{
if (side ^ Negative) cov = 0x1F - cov;
}
else
{
if (!(side ^ Negative)) cov = 0x1F - cov;
}
*coverage = cov;
}
}
template<bool swapped>
constexpr void EdgeParams(s32* length, s32* coverage) const
{
if (XMajor)
return EdgeParams_XMajor<swapped>(length, coverage);
else
return EdgeParams_YMajor<swapped>(length, coverage);
}
s32 Increment;
bool Negative;
bool XMajor;
Interpolator<1> Interp;
private:
s32 x0, xmin, xmax;
s32 xlen, ylen;
s32 dx;
s32 y;
s32 xcov_incr;
s32 ycoverage, ycov_incr;
};
template <typename T>
inline T ReadVRAM_Texture(u32 addr, const GPU& gpu) const
{
return *(T*)&gpu.VRAMFlat_Texture[addr & 0x7FFFF];
}
template <typename T>
inline T ReadVRAM_TexPal(u32 addr, const GPU& gpu) const
{
return *(T*)&gpu.VRAMFlat_TexPal[addr & 0x1FFFF];
}
u32 AlphaBlend(const GPU3D& gpu3d, u32 srccolor, u32 dstcolor, u32 alpha) const noexcept;
struct RendererPolygon
{
Polygon* PolyData;
Slope<0> SlopeL;
Slope<1> SlopeR;
s32 XL, XR;
u32 CurVL, CurVR;
u32 NextVL, NextVR;
};
RendererPolygon PolygonList[2048];
bool DoTimings(s32 cycles, s32* timingcounter);
bool CheckTimings(s32 cycles, s32* timingcounter);
u32 DoTimingsPixels(s32 pixels, s32* timingcounter);
void FindFirstPolyDoTimings(int npolys, s32 y, int* firstpolyeven, int* firstpolyodd, s32* timingcountereven, s32*timingcounterodd);
void TextureLookup(const GPU& gpu, u32 texparam, u32 texpal, s16 s, s16 t, u16* color, u8* alpha) const;
u32 RenderPixel(const GPU& gpu, const Polygon* polygon, u8 vr, u8 vg, u8 vb, s16 s, s16 t) const;
void PlotTranslucentPixel(const GPU3D& gpu3d, u32 pixeladdr, u32 color, u32 z, u32 polyattr, u32 shadow);
void SetupPolygonLeftEdge(RendererPolygon* rp, s32 y) const;
void SetupPolygonRightEdge(RendererPolygon* rp, s32 y) const;
void SetupPolygon(RendererPolygon* rp, Polygon* polygon) const;
void Step(RendererPolygon* rp);
void CheckSlope(RendererPolygon* rp, s32 y);
bool RenderShadowMaskScanline(const GPU3D& gpu3d, RendererPolygon* rp, s32 y, s32* timingcounter);
bool RenderPolygonScanline(const GPU& gpu, RendererPolygon* rp, s32 y, s32* timingcounter);
void RenderScanline(const GPU& gpu, s32 y, int firstpoly, int npolys, s32* timingcounter);
u32 CalculateFogDensity(const GPU3D& gpu3d, u32 pixeladdr) const;
bool CheckEdgeMarkingPixel(u32 polyid, u32 z, u32 pixeladdr);
bool CheckEdgeMarkingClearPlane(const GPU3D& gpu3d, u32 polyid, u32 z);
template <bool push> void ScanlineFinalPass(const GPU3D& gpu3d, s32 y, bool checkprev, bool checknext);
void ClearBuffers(const GPU& gpu);
void RenderPolygonsFast(GPU& gpu, Polygon** polygons, int npolys);
void RenderPolygonsTiming(GPU& gpu, Polygon** polygons, int npolys);
void RenderThreadFunc(GPU& gpu);
// counters for scanline rasterization timings
s32 ScanlineTimeout;
s32 RasterTiming;
// buffer dimensions are 258x194 to add a offscreen 1px border
// which simplifies edge marking tests
// buffer is duplicated to keep track of the two topmost pixels
// TODO: check if the hardware can accidentally plot pixels
// offscreen in that border
static constexpr int ScanlineWidth = 256;
static constexpr int NumScanlinesIntBuf = 192;
//static constexpr int NumScanlinesRD = 48;
static constexpr int NumScanlinesFinal = 192;
static constexpr int BufferSize = ScanlineWidth * NumScanlinesIntBuf;
//static constexpr int RDBufferSize = ScanlineWidth * NumScanlinesRD;
static constexpr int FinalBufferSize = ScanlineWidth * NumScanlinesFinal;
u32 ColorBuffer[BufferSize * 2];
u32 DepthBuffer[BufferSize * 2];
u32 AttrBuffer[BufferSize * 2];
//u32 RDBuffer[RDBufferSize]; // is this buffer ever initialized by hw before writing to it? what is its initial value? can you transfer 3d framebuffer data between games?
u32 FinalBuffer[FinalBufferSize];
// attribute buffer:
// bit0-3: edge flags (left/right/top/bottom)
// bit4: backfacing flag
// bit8-12: antialiasing alpha
// bit15: fog enable
// bit16-21: polygon ID for translucent pixels
// bit22: translucent flag
// bit24-29: polygon ID for opaque pixels
u8 StencilBuffer[256*2];
bool PrevIsShadowMask;
bool Enabled;
bool FrameIdentical;
// threading
bool Threaded;
Platform::Thread* RenderThread;
std::atomic_bool RenderThreadRunning;
std::atomic_bool RenderThreadRendering;
// Used by the main thread to tell the render thread to start rendering a frame
Platform::Semaphore* Sema_RenderStart;
// Used by the render thread to tell the main thread that it's done rendering a frame
Platform::Semaphore* Sema_RenderDone;
// Used to allow the main thread to read some scanlines
// before (the 3D portion of) the entire frame is rasterized.
Platform::Semaphore* Sema_ScanlineCount;
};
}