diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1860a2b5332..fcf74005e7f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1906,6 +1906,8 @@ set(GPU_SOURCES
 	GPU/Common/Draw2D.cpp
 	GPU/Common/Draw2D.h
 	GPU/Common/DepthBufferCommon.cpp
+	GPU/Common/DepthRaster.cpp
+	GPU/Common/DepthRaster.h
 	GPU/Common/TextureShaderCommon.cpp
 	GPU/Common/TextureShaderCommon.h
 	GPU/Common/DepalettizeShaderCommon.cpp
diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp
index 72fac52f2f01..5c4df7fca808 100644
--- a/Common/Data/Convert/ColorConv.cpp
+++ b/Common/Data/Convert/ColorConv.cpp
@@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 }
 
 #if PPSSPP_ARCH(SSE2)
-// fp64's improved version, see #19751
+// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
 static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
 		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c1 = _mm_load_si128(&srcp[i + 1]);
 
-		__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000
+		__m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
 		__m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000
 		__m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000
 		__m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000
diff --git a/Common/GPU/DataFormat.h b/Common/GPU/DataFormat.h
index 6f5cd2bc2eae..428146802a6c 100644
--- a/Common/GPU/DataFormat.h
+++ b/Common/GPU/DataFormat.h
@@ -74,6 +74,7 @@ bool DataFormatIsDepthStencil(DataFormat fmt);
 inline bool DataFormatIsColor(DataFormat fmt) {
 	return !DataFormatIsDepthStencil(fmt);
 }
+int DataFormatNumChannels(DataFormat fmt);  // Channel count of fmt; 0 for formats the implementation does not list.
 bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize);
 
 // Limited format support for now.
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index e3d135e8c0b1..b744235b892f 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -803,9 +803,15 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanBarrierBatch *postBarriers, Vu
 	}
 
 	VkComponentMapping r8AsAlpha[4] = { VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_R };
+	VkComponentMapping r8AsColor[4] = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE };  // R replicated into RGB, alpha forced to one.
 
+	VkComponentMapping *swizzle = nullptr;  // Stays null unless desc.swizzle matches one of the cases below.
+	switch (desc.swizzle) {
+	case TextureSwizzle::R8_AS_ALPHA: swizzle = r8AsAlpha; break;
+	case TextureSwizzle::R8_AS_GRAYSCALE: swizzle = r8AsColor; break;
+	}
 	VulkanBarrierBatch barrier;
-	if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, desc.swizzle == TextureSwizzle::R8_AS_ALPHA ? r8AsAlpha : nullptr)) {
+	if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, swizzle)) {
 		ERROR_LOG(Log::G3D, "Failed to create VulkanTexture: %dx%dx%d fmt %d, %d levels", width_, height_, depth_, (int)vulkanFormat, mipLevels_);
 		return false;
 	}
diff --git a/Common/GPU/thin3d.cpp b/Common/GPU/thin3d.cpp
index de8db83fb331..72fbe873e86a 100644
--- a/Common/GPU/thin3d.cpp
+++ b/Common/GPU/thin3d.cpp
@@ -118,6 +118,25 @@ bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize) {
 	}
 }
 
+int DataFormatNumChannels(DataFormat fmt) {  // 1 for the single-channel/depth formats below, 4 for the 8888 formats, else 0.
+	switch (fmt) {
+	case DataFormat::D16:
+	case DataFormat::D32F:
+	case DataFormat::R8_UNORM:
+	case DataFormat::R16_UNORM:
+	case DataFormat::R16_FLOAT:
+	case DataFormat::R32_FLOAT:
+		return 1;
+	case DataFormat::R8G8B8A8_UNORM:
+	case DataFormat::R8G8B8A8_UNORM_SRGB:
+	case DataFormat::B8G8R8A8_UNORM:
+	case DataFormat::B8G8R8A8_UNORM_SRGB:
+		return 4;
+	default:
+		return 0;  // Format not listed above; callers must treat 0 as "unknown".
+	}
+}
+
 RefCountedObject::~RefCountedObject() {
 	const int rc = refcount_.load();
 	_dbg_assert_msg_(rc == 0xDEDEDE, "Unexpected refcount %d in object of type '%s'", rc, name_);
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index ce62382718e2..f7a23795b017 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -643,6 +643,7 @@ typedef std::function= y, else 0x0000.
+inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
+	return _mm_cmple_epu16(y, x);  // x >= y is the same test as y <= x.
+}
+
+// Return 0xFFFF where x > y, else 0x0000.
+inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
+	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));  // (y <= x) with the x == y lanes cleared.
+}
+
+// Return 0xFFFF where x < y, else 0x0000.
+inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
+	return _mm_cmpgt_epu16(y, x);  // x < y is y > x with the operands swapped.
+}
+
 #endif
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index 0402f366297e..d23ce3b0e0b2 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -6,8 +6,6 @@
 
 #if PPSSPP_ARCH(SSE2)
 
-#include "fast_matrix.h"
-
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp
index 379042198011..cd2c50c48835 100644
--- a/Core/Compatibility.cpp
+++ b/Core/Compatibility.cpp
@@ -149,6 +149,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "DisableMemcpySlicing", &flags_.DisableMemcpySlicing);
 	CheckSetting(iniFile, gameID, "ForceEnableGPUReadback", &flags_.ForceEnableGPUReadback);
 	CheckSetting(iniFile, gameID, "UseFFMPEGFindStreamInfo", &flags_.UseFFMPEGFindStreamInfo);
+	CheckSetting(iniFile, gameID, "SoftwareRasterDepth", &flags_.SoftwareRasterDepth);
 }
 
 void Compatibility::CheckVRSettings(IniFile &iniFile, const std::string &gameID) {
diff --git a/Core/Compatibility.h b/Core/Compatibility.h
index 8a0e33af4d34..4688df37c055 100644
--- a/Core/Compatibility.h
+++ b/Core/Compatibility.h
@@ -112,6 +112,7 @@ struct CompatFlags {
 	bool DisableMemcpySlicing;
 	bool ForceEnableGPUReadback;
 	bool UseFFMPEGFindStreamInfo;
+	bool SoftwareRasterDepth;  // Filled from the "SoftwareRasterDepth" setting by Compatibility::CheckSettings.
 };
 
 struct VRCompat {
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
new file mode 100644
index 000000000000..ca8f81cedfb3
--- /dev/null
+++ b/GPU/Common/DepthRaster.cpp
@@ -0,0 +1,372 @@
+#include
+#include
+#include
+
+#include "Common/Math/CrossSIMD.h"
+#include "GPU/Common/DepthRaster.h"
+#include "GPU/Math3D.h"
+#include "Common/Math/math_util.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+
+// We only need to support these three modes.
+enum class ZCompareMode {
+	Greater,  // Most common
+	Less,  // Less common
+	Always,  // Fairly common
+};
+
+// Fills the rectangle spanned by (x1, y1) and (x2, y2) in a 16-bit depth buffer with depthValue.
+// dest is the top-left of the buffer and stride is its row pitch in pixels. The corners may be
+// given in any order; x2/y2 are exclusive after sorting. Only ZCompareMode::Always writes
+// anything so far, the other modes are still a no-op (TODO).
+// No clipping is done here, the caller must keep the rectangle inside the buffer.
+void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, ZCompareMode compareMode) {
+	// Swap coordinates if needed, we don't back-face-cull rects.
+	// We also ignore the UV rotation here.
+	if (x1 > x2) {
+		std::swap(x1, x2);
+	}
+	if (y1 > y2) {
+		std::swap(y1, y2);
+	}
+	if (x1 == x2 || y1 == y2) {
+		return;
+	}
+
+	if (compareMode != ZCompareMode::Always) {
+		// TODO: Greater/Less are not implemented for rectangles yet.
+		return;
+	}
+
+	const int width = x2 - x1;
+	const uint16_t value = (uint16_t)depthValue;
+	Vec8U16 valueX8 = Vec8U16::Splat(depthValue);
+	for (int y = y1; y < y2; y++) {
+		uint16_t *ptr = dest + stride * y + x1;
+		if (depthValue == 0) {
+			memset(ptr, 0, width * sizeof(uint16_t));
+			continue;
+		}
+		int w = width;
+		while (w >= 8) {
+			valueX8.Store(ptr);
+			ptr += 8;
+			w -= 8;
+		}
+		// Trailer: the remaining 0-7 pixels that don't fill a whole vector. Without this,
+		// the right edge of the rect (or all of a rect narrower than 8) was left unwritten.
+		for (; w > 0; w--) {
+			*ptr++ = value;
+		}
+	}
+}
+
+// Per-lane x offsets of the four pixels in one 4x1 pixel group.
+alignas(16) static const int zero123[4] = {0, 1, 2, 3};
+
+// One triangle edge function E(x, y) = A*x + B*y + C, evaluated for four horizontally adjacent pixels at once.
+struct Edge {
+	// Dimensions of our pixel group
+	static const int stepXSize = 4;
+	static const int stepYSize = 1;
+
+	// Increments to add to the edge values when moving one pixel group right / one row down.
+	Vec4S32 oneStepX;
+	Vec4S32 oneStepY;
+
+	// Sets up the edge from v0 to v1, stores the step deltas, and returns the edge function values
+	// for the four pixels starting at (p0x, p0y).
+	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
+		// Edge setup
+		int A = v0y - v1y;
+		int B = v1x - v0x;
+		int C = v0x * v1y - v0y * v1x;
+
+		// Step deltas
+		oneStepX = Vec4S32::Splat(A * stepXSize);
+		oneStepY = Vec4S32::Splat(B * stepYSize);
+
+		// x/y values for initial pixel block. Add horizontal offsets.
+		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
+		Vec4S32 y = Vec4S32::Splat(p0y);
+
+		// Edge function values at origin
+		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
+	}
+};
+
+// Adapted from Intel's depth rasterizer example.
+// Started with the scalar version, will SIMD-ify later.
+// x1/y1 etc are the scissor rect.
+// Rasterizes one triangle, given as three consecutive entries of tx/ty/tz, into depthBuf
+// (stride is the row pitch in pixels). Pixels are processed in 4x1 groups, so the covered x range
+// is rounded out to multiples of 4. Triangles whose signed area is <= 0 are skipped.
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) {
+	int tileStartX = x1;
+	int tileEndX = x2;
+
+	int tileStartY = y1;
+	int tileEndY = y2;
+
+	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
+	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
+	// are slow on SSE2.
+
+	// Convert to whole pixels for now. Later subpixel precision.
+	int v0x = tx[0];
+	int v0y = ty[0];
+	int v0z = tz[0];
+	int v1x = tx[1];
+	int v1y = ty[1];
+	int v1z = tz[1];
+	int v2x = tx[2];
+	int v2y = ty[2];
+	int v2z = tz[2];
+
+	// use fixed-point only for X and Y. Avoid work for Z and W.
+	// We use 4x1 tiles for simplicity.
+	int minX = std::max(std::min(std::min(v0x, v1x), v2x), tileStartX) & ~3;
+	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, tileEndX) & ~3;
+	int minY = std::max(std::min(std::min(v0y, v1y), v2y), tileStartY);
+	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
+	if (maxX == minX || maxY == minY) {
+		// No pixels, or outside screen.
+		return;
+	}
+
+	// TODO: Cull really small triangles here.
+	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
+	if (triArea <= 0) {
+		return;
+	}
+
+	float oneOverTriArea = 1.0f / (float)triArea;
+
+	Edge e01, e12, e20;
+
+	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
+	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
+	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+
+	// Prepare to interpolate Z
+	Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
+	Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
+	Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea);
+
+	// Rasterize
+	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) {
+		// Barycentric coordinates at start of row
+		Vec4S32 w0 = w0_row;
+		Vec4S32 w1 = w1_row;
+		Vec4S32 w2 = w2_row;
+
+		uint16_t *rowPtr = depthBuf + stride * y;
+
+		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) {
+			// If p is on or inside all edges for any pixels,
+			// render those pixels.
+			Vec4S32 signCalc = w0 | w1 | w2;
+			if (!AnyZeroSignBit(signCalc)) {
+				continue;
+			}
+
+			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+			// Compute the Z value for all four pixels.
+			// float depth = zz[0] + beta * zz[1] + gamma * zz[2];
+			Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2);
+
+			// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
+			switch (compareMode) {
+			case ZCompareMode::Greater:
+				// To implement the greater / greater-or-equal comparison, we can combine mask and max.
+				// It might be better to do the math in float space on x86 due to SSE2 deficiencies.
+				// We use AndNot to zero out Z results, before doing Max with the buffer.
+				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Less:  // UNTESTED
+				// This time, we OR the mask and use .Min.
+				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Always:  // UNTESTED
+				// This could be replaced with a vblend operation.
+				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
+				break;
+			}
+		}
+	}
+}
+
+// Reads the positions of vertices indexLowerBound..indexUpperBound (inclusive) straight from the
+// undecoded vertex data, transforms them by worldviewproj and writes four floats per vertex to dest.
+// Skinned and morphed vertex types are not supported. prim is currently unused.
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
+	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
+	_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
+
+	int vertexStride = dec->VertexSize();
+	int offset = dec->posoff;
+
+	Mat4F32 mat(worldviewproj);
+
+	const u8 *startPtr = (const u8 *)vertexData + indexLowerBound * vertexStride;
+	int count = indexUpperBound - indexLowerBound + 1;
+
+	switch (vertTypeID & GE_VTYPE_POS_MASK) {
+	case GE_VTYPE_POS_FLOAT:
+		for (int i = 0; i < count; i++) {
+			const float *data = (const float *)(startPtr + i * vertexStride + offset);
+			Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+		for (int i = 0; i < count; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)startPtr + i * vertexStride + offset));
+			Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
+		}
+		break;
+	case GE_VTYPE_POS_8BIT:
+		for (int i = 0; i < count; i++) {
+			const s8 *data = (const s8 *)startPtr + i * vertexStride + offset;
+			Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
+		}
+		break;
+	}
+}
+
+// Converts indexed clip-space triangles (4 floats per vertex in transformed) to integer screen-space
+// x/y/z, written to tx/ty/tz. count is the number of indices. Triangles with any vertex w < 0 are dropped.
+// When culling is disabled each triangle is emitted twice, once per winding, so up to 2 * count
+// entries may be written. Each store writes four ints, so the arrays need slack past the returned count.
+// Returns the number of vertices written.
+int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
+	bool cullEnabled = gstate.isCullEnabled();
+	GECullMode cullMode = gstate.getCullMode();
+
+	// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
+	// However, hard to find a common abstraction.
+	const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
+	const Vec4F32 viewportY = Vec4F32::Splat(gstate.getViewportYCenter());
+	const Vec4F32 viewportZ = Vec4F32::Splat(gstate.getViewportZCenter());
+	const Vec4F32 viewportScaleX = Vec4F32::Splat(gstate.getViewportXScale());
+	const Vec4F32 viewportScaleY = Vec4F32::Splat(gstate.getViewportYScale());
+	const Vec4F32 viewportScaleZ = Vec4F32::Splat(gstate.getViewportZScale());
+
+	const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX());  // We remove the 16 scale here
+	const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
+
+	int outCount = 0;
+
+	// XOR-ing the vertex offsets 1 and 2 with 3 swaps them (1 ^ 3 == 2, 2 ^ 3 == 1), which flips the winding.
+	int flipCull = 0;
+	if (cullEnabled && cullMode == GE_CULL_CW) {
+		flipCull = 3;
+	}
+	for (int i = 0; i < count; i += 3) {
+		const float *verts[3] = {
+			transformed + indexBuffer[i] * 4,
+			transformed + indexBuffer[i + (1 ^ flipCull)] * 4,
+			transformed + indexBuffer[i + (2 ^ flipCull)] * 4,
+		};
+
+		// Check if any vertex is behind the 0 plane.
+		if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) {
+			// Ditch this triangle. Later we should clip here.
+			continue;
+		}
+
+		// These names are wrong .. until we transpose.
+		Vec4F32 x = Vec4F32::Load(verts[0]);
+		Vec4F32 y = Vec4F32::Load(verts[1]);
+		Vec4F32 z = Vec4F32::Load(verts[2]);
+		Vec4F32 w = Vec4F32::Zero();
+		Vec4F32::Transpose(x, y, z, w);
+		// Now the names are accurate! Since we only have three vertices, the fourth member of each vector is zero
+		// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
+		Vec4F32 recipW = w.Recip();
+
+		x *= recipW;
+		y *= recipW;
+		z *= recipW;
+
+		Vec4S32 screen[3];
+		screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
+		screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
+		screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
+
+		screen[0].Store(tx + outCount);
+		screen[1].Store(ty + outCount);
+		screen[2].Store(tz + outCount);
+		outCount += 3;
+
+		if (!cullEnabled) {
+			// If culling is off, shuffle the three vectors to produce the opposite triangle, and store them after.
+
+			// HOWEVER! I realized that this is not the optimal layout, after all.
+			// We should group 4 triangles at a time and interleave them (so we first have all X of vertex 0,
+			// then all X of vertex 1, and so on). This seems solvable with another transpose, if we can easily
+			// collect four triangles at a time...
+
+			screen[0].SwapLowerElements().Store(tx + outCount);
+			screen[1].SwapLowerElements().Store(ty + outCount);
+			screen[2].SwapLowerElements().Store(tz + outCount);
+			outCount += 3;
+		}
+	}
+	return outCount;
+}
+
+// Splits count already screen-space vertices into separate integer x/y/z arrays.
+// x and y are truncated to int, z is converted to u16.
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
+	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
+
+	// TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
+	for (int i = 0; i < count; i++) {
+		tx[i] = (int)transformed[i].pos[0];
+		ty[i] = (int)transformed[i].pos[1];
+		tz[i] = (u16)transformed[i].pos[2];
+	}
+}
+
+// Rasterizes screen-space vertices.
+// Picks the compare mode from the current GE state and draws count vertices from tx/ty/tz as
+// rectangles (2 vertices each) or triangles (3 vertices each) into depth. x1/y1/x2/y2 is the
+// scissor rectangle, currently only applied to triangles. Returns without drawing when the
+// state can't affect depth (depth writes off, or an unsupported depth function).
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count) {
+	// Prim should now be either TRIANGLES or RECTs.
+	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
+
+	// TODO: Ignore draws where stencil operations are active? (gstate.isStencilTestEnabled())
+
+	ZCompareMode comp;
+	if (gstate.isModeClear()) {
+		// Clear mode writes unconditionally, so the depth function must not be consulted here.
+		// Checking it first used to drop clears issued while NEVER/EQUAL/NOTEQUAL was set.
+		if (!gstate.isClearModeDepthMask()) {
+			return;
+		}
+		comp = ZCompareMode::Always;
+	} else {
+		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
+			return;
+
+		// Ignore some useless compare modes.
+		switch (gstate.getDepthTestFunction()) {
+		case GE_COMP_ALWAYS:
+			comp = ZCompareMode::Always;
+			break;
+		case GE_COMP_LEQUAL:
+		case GE_COMP_LESS:
+			comp = ZCompareMode::Less;
+			break;
+		case GE_COMP_GEQUAL:
+		case GE_COMP_GREATER:
+			comp = ZCompareMode::Greater;  // Most common
+			break;
+		case GE_COMP_NEVER:
+		case GE_COMP_EQUAL:
+			// These will never have a useful effect in Z-only raster.
+			[[fallthrough]];
+		case GE_COMP_NOTEQUAL:
+			// This is highly unusual, let's just ignore it.
+			[[fallthrough]];
+		default:
+			return;
+		}
+	}
+
+	switch (prim) {
+	case GE_PRIM_RECTANGLES:
+		// The bound keeps a trailing unpaired vertex from being read as a rectangle.
+		for (int i = 0; i + 1 < count; i += 2) {
+			uint16_t z = tz[i + 1];  // depth from second vertex
+			// TODO: Should clip coordinates to the scissor rectangle.
+			// We remove the subpixel information here.
+			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp);
+		}
+		break;
+	case GE_PRIM_TRIANGLES:
+		// The bound keeps an incomplete trailing triangle from being read.
+		for (int i = 0; i + 2 < count; i += 3) {
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
+		}
+		break;
+	default:
+		_dbg_assert_(false);
+	}
+}
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
new file mode 100644
index 000000000000..e92c1a1348ed
--- /dev/null
+++ b/GPU/Common/DepthRaster.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "Common/CommonTypes.h"
+#include "GPU/ge_constants.h"
+
+struct DepthScreenVertex {
+	int x;
+	int y;
+	int z;
+};
+
+// Specialized, very limited depth-only rasterizer.
+// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
+// for effects like lens flare.
+// So, we can be quite inaccurate without any issues, and skip a lot of functionality.
+
+class VertexDecoder;
+struct TransformedVertex;
+
+int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 818021a79b3a..bbe36fb479e7 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -23,9 +23,11 @@
 #include "Common/LogReporting.h"
 #include "Common/Math/SIMDHeaders.h"
 #include "Common/Math/lin/matrix4x4.h"
+#include "Core/System.h"
 #include "Core/Config.h"
 #include "GPU/Common/DrawEngineCommon.h"
 #include "GPU/Common/SplineCommon.h"
+#include "GPU/Common/DepthRaster.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/Common/SoftwareTransformCommon.h"
 #include "GPU/ge_constants.h"
@@ -34,7 +36,11 @@
 #define QUAD_INDICES_MAX 65536
 
 enum {
-	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex)
+	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex),
+	DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4 * sizeof(float),  // Bytes: 4 floats per vertex. Was missing sizeof(float), so the buffer was 4x too small.
+	DEPTH_SCREENVERTS_COMPONENT_COUNT = VERTEX_BUFFER_MAX,
+	DEPTH_SCREENVERTS_COMPONENT_SIZE = DEPTH_SCREENVERTS_COMPONENT_COUNT * sizeof(int) + 384,  // Bytes per x/y/z array, plus slack for vector stores and zero padding.
+	DEPTH_SCREENVERTS_SIZE = DEPTH_SCREENVERTS_COMPONENT_SIZE * 3,
 };
 
 DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
@@ -46,6 +52,12 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
 	decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	indexGen.Setup(decIndex_);
+
+	useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth;
+	if (useDepthRaster_) {
+		depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+		depthScreenVerts_ = (int *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+	}
 }
 
 DrawEngineCommon::~DrawEngineCommon() {
@@ -53,6 +65,10 @@ DrawEngineCommon::~DrawEngineCommon() {
 	FreeMemoryPages(decIndex_, DECODED_INDEX_BUFFER_SIZE);
 	FreeMemoryPages(transformed_, TRANSFORMED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(transformedExpanded_, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
+	if (depthTransformed_) {
+		FreeMemoryPages(depthTransformed_, DEPTH_TRANSFORMED_SIZE);
+		FreeMemoryPages(depthScreenVerts_, DEPTH_SCREENVERTS_SIZE);
+	}
 	delete decJitCache_;
 	decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) {
 		delete decoder;
@@ -886,3 +902,104 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 		return false;
 	}
 }
+
+// Software depth raster for hardware-transformed draws: transforms the deferred draws' positions by
+// world * view * proj on the CPU, converts the indexed triangles (vertexCount indices in decIndex_) to
+// screen space and rasterizes them into the depth buffer at gstate.getDepthBufRawAddress().
+void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	if (vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {
+		return;
+	}
+
+	float world[16];
+	float view[16];
+	float worldview[16];
+	float worldviewproj[16];
+	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+	Matrix4ByMatrix4(worldview, world, view);
+	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);  // TODO: Include adjustments to the proj matrix?
+
+	// Decode.
+	int numDec = 0;
+	for (int i = 0; i < numDrawVerts_; i++) {
+		DeferredVerts &dv = drawVerts_[i];
+
+		int indexLowerBound = dv.indexLowerBound;
+		drawVertexOffsets_[i] = numDec - indexLowerBound;
+
+		int indexUpperBound = dv.indexUpperBound;
+		if (indexUpperBound + 1 - indexLowerBound + numDec >= VERTEX_BUFFER_MAX) {
+			// Hit our limit! Stop decoding in this draw.
+			break;
+		}
+
+		// Decode and transform the positions only. Skinned/morphed vertex types were rejected above.
+		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
+		numDec += indexUpperBound - indexLowerBound + 1;
+	}
+
+	int *tx = depthScreenVerts_;
+	int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
+
+	// Clip and triangulate using the index buffer.
+	int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
+	if (outVertCount & 15) {
+		// Zero padding
+		for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
+			tx[i] = 0;
+			ty[i] = 0;
+			tz[i] = 0;
+		}
+	}
+
+	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+		GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
+		tx, ty, tz, outVertCount);
+}
+
+// Software depth raster for software-transformed draws: splits count already screen-space vertices
+// into integer x/y/z arrays and rasterizes them as rectangles or triangles into the depth buffer.
+void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	_dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN);
+
+	int *tx = depthScreenVerts_;
+	int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
+
+	DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count);
+	if (count & 15) {
+		// Zero padding
+		for (int i = count; i < ((count + 16) & ~15); i++) {
+			tx[i] = 0;
+			ty[i] = 0;
+			tz[i] = 0;
+		}
+	}
+	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+		prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
+		tx, ty, tz, count);
+}
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 0f8ab8a7515a..053c4c31f55b 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -27,6 +27,7 @@
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/Common/IndexGenerator.h"
 #include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/DepthRaster.h"
 
 class VertexDecoder;
 
@@ -158,6 +159,11 @@ class DrawEngineCommon {
 		_dbg_assert_(numDrawVerts_ == 0 && numDrawInds_ == 0);
 	}
 
+	// temporary hack
+	uint8_t *GetTempSpace() {
+		return decoded_ + 12 * 65536;
+	}
+
 protected:
 	virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
 	void UpdatePlanes();
@@ -169,6 +175,9 @@ class DrawEngineCommon {
 
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
+	void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount);
+	void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
+
 	static inline int IndexSize(u32 vtype) {
 		const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
 		if (indexType == GE_VTYPE_IDX_16BIT) {
@@ -223,6 +232,11 @@ class DrawEngineCommon {
 	}
 
 	inline bool CollectedPureDraw() const {
+		// TODO: Do something faster.
+		if (useDepthRaster_) {
+			return false;
+		}
+
 		switch (seenPrims_) {
 		case 1 << GE_PRIM_TRIANGLE_STRIP:
 			return !anyCCWOrIndexed_ && numDrawInds_ == 1;
@@ -338,4 +352,10 @@ class DrawEngineCommon {
 	bool offsetOutsideEdge_;
 
 	GPUCommon *gpuCommon_;
+
+	// Software depth raster
+	bool useDepthRaster_ = false;
+
+	float *depthTransformed_ = nullptr;  // 4 floats per vertex; only allocated when useDepthRaster_ is set.
+	int *depthScreenVerts_ = nullptr;  // Three consecutive int arrays (x, y, z); only allocated when useDepthRaster_ is set.
 };
diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h
index 723f4caabd7c..48df11e97291 100644
--- a/GPU/Common/IndexGenerator.h
+++ b/GPU/Common/IndexGenerator.h
@@ -54,7 +54,7 @@ class IndexGenerator {
 	void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise);
 
 	// This is really the number of generated indices, or 3x the number of triangles.
-	int VertexCount() const { return inds_ - indsBase_; }
+	int VertexCount() const { return (int)(inds_ - indsBase_); }
 
 private:
 	// Points (why index these? code simplicity)
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 2793ad80eed5..4b905f03c956 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -122,7 +122,7 @@ class IndexConverter {
 // Reads decoded vertex formats in a convenient way. For software transform and debugging.
 class VertexReader {
 public:
-	VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
+	VertexReader(const u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
 
 	void ReadPos(float pos[3]) const {
 		// Only DEC_FLOAT_3 is supported.
@@ -297,8 +297,8 @@ class VertexReader { } private: - u8 *base_; - u8 *data_; + const u8 *base_; + const u8 *data_; DecVtxFormat decFmt_; int vtype_; }; diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index 5cb3ea9e6238..c27d08354936 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -346,6 +346,7 @@ + @@ -468,6 +469,7 @@ + diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index 610ba94cbe33..1529b974c13f 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -279,6 +279,9 @@ Debugger + + Common + @@ -554,6 +557,9 @@ Debugger + + Common + diff --git a/GPU/GPUState.h b/GPU/GPUState.h index b5318331750b..e2814d5a1ce7 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -227,7 +227,7 @@ struct GPUgstate { // Cull bool isCullEnabled() const { return cullfaceEnable & 1; } - int getCullMode() const { return cullmode & 1; } + GECullMode getCullMode() const { return (GECullMode)(cullmode & 1); } // Color Mask bool isClearModeColorMask() const { return (clearmode&0x100) != 0; } diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index f1279b855a69..ebe3d022df2a 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -370,6 +370,9 @@ void DrawEngineVulkan::Flush() { } else { renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount); } + if (useDepthRaster_) { + DepthRasterTransform(prim, dec_, dec_->VertexType(), vertexCount); + } } else { PROFILE_THIS_SCOPE("soft"); VertexDecoder *swDec = dec_; @@ -438,6 +441,12 @@ void DrawEngineVulkan::Flush() { swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale); swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result); + + // At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster. 
+ if (useDepthRaster_) { + DepthRasterPretransformed(prim, transformed_, numDecodedVerts_); + } + // Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values. // Games sometimes expect exact matches (see #12626, for example) for equal comparisons. if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f) diff --git a/GPU/ge_constants.h b/GPU/ge_constants.h index 1d123e162093..f4a366f80250 100644 --- a/GPU/ge_constants.h +++ b/GPU/ge_constants.h @@ -623,6 +623,11 @@ enum GEPatchPrimType GE_PATCHPRIM_UNKNOWN = 3, }; +enum GECullMode { + GE_CULL_CW = 0, + GE_CULL_CCW = 1, +}; + inline GEPrimitiveType PatchPrimToPrim(GEPatchPrimType type) { switch (type) { case GE_PATCHPRIM_TRIANGLES: return GE_PRIM_TRIANGLES; diff --git a/UI/ImDebugger/ImDebugger.cpp b/UI/ImDebugger/ImDebugger.cpp index 5f14a8529c0d..9cb082cf68bf 100644 --- a/UI/ImDebugger/ImDebugger.cpp +++ b/UI/ImDebugger/ImDebugger.cpp @@ -1552,4 +1552,5 @@ void ImConfig::SyncConfig(IniFile *ini, bool save) { sync.SetSection(ini->GetOrCreateSection("Settings")); sync.Sync("displayLatched", &displayLatched, false); + sync.Sync("realtimePixelPreview", &realtimePixelPreview, false); } diff --git a/UI/ImDebugger/ImDebugger.h b/UI/ImDebugger/ImDebugger.h index f1363d6a0f5f..c0fa9a037d56 100644 --- a/UI/ImDebugger/ImDebugger.h +++ b/UI/ImDebugger/ImDebugger.h @@ -153,6 +153,7 @@ struct ImConfig { int selectedMemCheck = -1; uint64_t selectedTexAddr = 0; + bool realtimePixelPreview = false; int breakCount = 0; bool displayLatched = false; @@ -170,12 +171,14 @@ enum class ImCmd { SHOW_IN_CPU_DISASM, SHOW_IN_GE_DISASM, SHOW_IN_MEMORY_VIEWER, // param is address, param2 is viewer index + SHOW_IN_PIXEL_VIEWER, // param is address, param2 is stride, |0x80000000 if depth, param3 is w/h }; struct ImCommand { ImCmd cmd; uint32_t param; uint32_t param2; + uint32_t param3; }; struct ImControl { diff --git 
a/UI/ImDebugger/ImGe.cpp b/UI/ImDebugger/ImGe.cpp index 415cd0a7e133..52783719a545 100644 --- a/UI/ImDebugger/ImGe.cpp +++ b/UI/ImDebugger/ImGe.cpp @@ -149,12 +149,44 @@ void ImGePixelViewerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInte if (ImGui::Button("Refresh")) { viewer_.Snapshot(); } + if (ImGui::Button("Show cur depth")) { + viewer_.addr = gstate.getDepthBufRawAddress() | 0x04000000; + viewer_.format = GE_FORMAT_DEPTH16; + viewer_.stride = gstate.DepthBufStride(); + viewer_.width = viewer_.stride; + viewer_.Snapshot(); + } + if (ImGui::Button("Show cur color")) { + viewer_.addr = gstate.getFrameBufAddress(); + viewer_.format = gstate.FrameBufFormat(); + viewer_.stride = gstate.FrameBufStride(); + viewer_.width = viewer_.stride; + viewer_.Snapshot(); + } + ImGui::Checkbox("Realtime", &cfg.realtimePixelPreview); } ImGui::EndChild(); + if (cfg.realtimePixelPreview) { + viewer_.Snapshot(); + } + ImGui::SameLine(); if (ImGui::BeginChild("right")) { + ImVec2 p0 = ImGui::GetCursorScreenPos(); viewer_.Draw(gpuDebug, draw); + if (ImGui::IsItemHovered()) { + int x = (int)(ImGui::GetMousePos().x - p0.x); + int y = (int)(ImGui::GetMousePos().y - p0.y); + char temp[128]; + if (viewer_.FormatValueAt(temp, sizeof(temp), x, y)) { + ImGui::Text("(%d, %d): %s", x, y, temp); + } else { + ImGui::Text("%d, %d: N/A", x, y); + } + } else { + ImGui::TextUnformatted("(no pixel hovered)"); + } } ImGui::EndChild(); ImGui::End(); @@ -211,6 +243,12 @@ bool ImGePixelViewer::FormatValueAt(char *buf, size_t bufSize, int x, int y) con snprintf(buf, bufSize, "%08x (raw: %04x)", RGBA5551ToRGBA8888(raw), raw); break; } + case GE_FORMAT_DEPTH16: + { + u16 raw = Memory::Read_U16(pixelAddr); + snprintf(buf, bufSize, "%0.4f (raw: %04x / %d)", (float)raw / 65535.0f, raw, raw); + break; + } default: snprintf(buf, bufSize, "N/A"); return false; @@ -356,6 +394,7 @@ bool ImGeReadbackViewer::Draw(GPUDebugInterface *gpuDebug, Draw::DrawContext *dr readbackFmt_ = 
Draw::DataFormat::R8G8B8A8_UNORM; break; case Draw::Aspect::DEPTH_BIT: + // TODO: Add fallback readbackFmt_ = Draw::DataFormat::D32F; break; case Draw::Aspect::STENCIL_BIT: @@ -385,14 +424,15 @@ bool ImGeReadbackViewer::Draw(GPUDebugInterface *gpuDebug, Draw::DrawContext *dr } } + Draw::DataFormat fmt = rbBpp == 1 ? Draw::DataFormat::R8_UNORM : Draw::DataFormat::R32_FLOAT; Draw::TextureDesc desc{ Draw::TextureType::LINEAR2D, - rbBpp == 1 ? Draw::DataFormat::R8_UNORM : Draw::DataFormat::R32_FLOAT, + fmt, (int)w, (int)h, 1, 1, false, - rbBpp == 1 ? Draw::TextureSwizzle::R8_AS_ALPHA : Draw::TextureSwizzle::DEFAULT, + Draw::DataFormatNumChannels(fmt) == 1 ? Draw::TextureSwizzle::R8_AS_GRAYSCALE: Draw::TextureSwizzle::DEFAULT, "PixelViewer temp", { texData }, nullptr, @@ -432,7 +472,9 @@ bool ImGeReadbackViewer::FormatValueAt(char *buf, size_t bufSize, int x, int y) case Draw::DataFormat::D32F: { const float *read = (const float *)(data_ + offset); - snprintf(buf, bufSize, "%0.4f", *read); + float value = *read; + int ivalue = (int)(value * 65535.0f); + snprintf(buf, bufSize, "%0.4f (raw: %04x / %d)", value, ivalue, ivalue); return true; } case Draw::DataFormat::S8: @@ -1060,6 +1102,7 @@ void ImGeDebuggerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInterfa DrawPreviewPrimitive(drawList, p0, previewPrim_, previewIndices_, previewVertices_, previewCount_, true, texW, texH); drawList->PopClipRect(); + } else { ImGui::Text("(no valid texture bound)"); // In software mode, we should just decode the texture here.
diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h index c642b620efb7..41dc6a8b1f94 100644 --- a/UI/ImDebugger/ImGe.h +++ b/UI/ImDebugger/ImGe.h @@ -71,14 +71,14 @@ struct ImGePixelViewer : public PixelLookup { } bool FormatValueAt(char *buf, size_t bufSize, int x, int y) const override; - uint32_t addr = 0x04000000; + uint32_t addr = 0x04110000; uint16_t stride = 512; uint16_t width = 480; uint16_t height = 272; - GEBufferFormat format = GE_FORMAT_565; + GEBufferFormat format = GE_FORMAT_DEPTH16; bool useAlpha = false; bool showAlpha = false; - float scale = 1.0f; + float scale = 20.0f; private: void UpdateTexture(Draw::DrawContext *draw); @@ -124,8 +124,6 @@ class ImGePixelViewerWindow { } private: - void UpdateTexture(Draw::DrawContext *draw); - ImGePixelViewer viewer_; }; diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj index 7bb4b346bd8f..a7ba27a14097 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj @@ -109,6 +109,7 @@ + @@ -177,6 +178,7 @@ + @@ -261,4 +263,4 @@ - + \ No newline at end of file diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters index 84b4c5d39630..31d14b549feb 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters @@ -80,6 +80,7 @@ Debugger + @@ -163,10 +164,11 @@ Debugger + {49bcf7f6-518a-4ecd-af55-bda3a344efe7} - + \ No newline at end of file diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 10ab9a5f77a3..dbd88097886f 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -530,6 +530,7 @@ EXEC_AND_LIB_FILES := \ $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \ $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \ $(SRC)/GPU/Common/DepthBufferCommon.cpp \ + $(SRC)/GPU/Common/DepthRaster.cpp \ $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \ $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \ $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \ diff --git a/assets/compat.ini b/assets/compat.ini index 
8a374753efb2..0452c5b85484 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -1228,8 +1228,10 @@ ULJS19067 = true ULAS42247 = true ULAS42318 = true +[SoftwareRasterDepth] + [DisableFirstFrameReadback] -# Wipeout Pure: Temporary workaround for lens flare flicker. See #13344 +# Wipeout Pure UCUS98612 = true UCJS10007 = true UCES00001 = true diff --git a/ext/imgui/imgui_impl_thin3d.cpp b/ext/imgui/imgui_impl_thin3d.cpp index bd302a55d51b..4c1ecba7a8e3 100644 --- a/ext/imgui/imgui_impl_thin3d.cpp +++ b/ext/imgui/imgui_impl_thin3d.cpp @@ -114,6 +114,10 @@ void ImGui_ImplThin3d_RenderDrawData(ImDrawData* draw_data, Draw::DrawContext *d boundSampler = bd->fontSampler; } else { size_t index = (size_t)pcmd->TextureId - TEX_ID_OFFSET; + if (index >= bd->tempTextures.size()) { + WARN_LOG(Log::System, "Missing temp texture %d (out of %d)", (int)index, (int)bd->tempTextures.size()); + continue; + } _dbg_assert_(index < bd->tempTextures.size()); switch (bd->tempTextures[index].type) { case RegisteredTextureType::Framebuffer: diff --git a/libretro/Makefile.common b/libretro/Makefile.common index 804a1d72199f..c1cb5a454fed 100644 --- a/libretro/Makefile.common +++ b/libretro/Makefile.common @@ -543,6 +543,7 @@ SOURCES_CXX += \ $(GPUDIR)/Common/TextureScalerCommon.cpp \ $(GPUDIR)/Common/SoftwareTransformCommon.cpp \ $(GPUDIR)/Common/DepthBufferCommon.cpp \ + $(GPUDIR)/Common/DepthRaster.cpp \ $(GPUDIR)/Common/StencilCommon.cpp \ $(GPUDIR)/Software/TransformUnit.cpp \ $(GPUDIR)/Software/SoftGpu.cpp \ diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index 45c664a29af8..a087d205b96b 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -1112,14 +1112,17 @@ bool TestSIMD() { EXPECT_EQ_INT(testdata[1], 0); __m128i a = _mm_set_epi16(0, 0x4444, 0, 0x3333, 0, 0x2222, 0, 0x1111); - __m128i b = _mm_set_epi16(0, 0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555); + __m128i b = _mm_set_epi16(0, (int16_t)0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555); __m128i c =
_mm_packu2_epi32_SSE2(a, b); - __m128i d = _mm_packus_epi32(a, b); + __m128i d = _mm_packu1_epi32_SSE2(b); - uint64_t testdata2[2]; + alignas(16) uint64_t testdata2[4]; _mm_store_si128((__m128i *)testdata2, c); + _mm_store_si128((__m128i *)testdata2 + 1, d); EXPECT_EQ_INT(testdata2[0], 0x4444333322221111); EXPECT_EQ_INT(testdata2[1], 0x8888777766665555); + EXPECT_EQ_INT(testdata2[2], 0x8888777766665555); + EXPECT_EQ_INT(testdata2[2], 0x8888777766665555); #endif return true; }