From 58adb379ea0019f2b803c22962bd8c997809a0a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Wed, 18 Dec 2024 18:35:55 +0100 Subject: [PATCH 01/28] GE debugger improvements --- Common/GPU/DataFormat.h | 1 + Common/GPU/Vulkan/thin3d_vulkan.cpp | 8 +++++- Common/GPU/thin3d.cpp | 19 +++++++++++++ Common/GPU/thin3d.h | 1 + UI/ImDebugger/ImDebugger.h | 2 ++ UI/ImDebugger/ImGe.cpp | 44 +++++++++++++++++++++++++++-- ext/imgui/imgui_impl_thin3d.cpp | 4 +++ 7 files changed, 75 insertions(+), 4 deletions(-) diff --git a/Common/GPU/DataFormat.h b/Common/GPU/DataFormat.h index 6f5cd2bc2eae..428146802a6c 100644 --- a/Common/GPU/DataFormat.h +++ b/Common/GPU/DataFormat.h @@ -74,6 +74,7 @@ bool DataFormatIsDepthStencil(DataFormat fmt); inline bool DataFormatIsColor(DataFormat fmt) { return !DataFormatIsDepthStencil(fmt); } +int DataFormatNumChannels(DataFormat fmt); bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize); // Limited format support for now. diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp index e3d135e8c0b1..b744235b892f 100644 --- a/Common/GPU/Vulkan/thin3d_vulkan.cpp +++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp @@ -803,9 +803,15 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanBarrierBatch *postBarriers, Vu } VkComponentMapping r8AsAlpha[4] = { VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_R }; + VkComponentMapping r8AsColor[4] = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE }; + VkComponentMapping *swizzle = nullptr; + switch (desc.swizzle) { + case TextureSwizzle::R8_AS_ALPHA: swizzle = r8AsAlpha; break; + case TextureSwizzle::R8_AS_GRAYSCALE: swizzle = r8AsColor; break; + } VulkanBarrierBatch barrier; - if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, desc.swizzle == TextureSwizzle::R8_AS_ALPHA ? r8AsAlpha : nullptr)) { + if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, swizzle)) { ERROR_LOG(Log::G3D, "Failed to create VulkanTexture: %dx%dx%d fmt %d, %d levels", width_, height_, depth_, (int)vulkanFormat, mipLevels_); return false; } diff --git a/Common/GPU/thin3d.cpp b/Common/GPU/thin3d.cpp index de8db83fb331..72fbe873e86a 100644 --- a/Common/GPU/thin3d.cpp +++ b/Common/GPU/thin3d.cpp @@ -118,6 +118,25 @@ bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize) { } } +int DataFormatNumChannels(DataFormat fmt) { + switch (fmt) { + case DataFormat::D16: + case DataFormat::D32F: + case DataFormat::R8_UNORM: + case DataFormat::R16_UNORM: + case DataFormat::R16_FLOAT: + case DataFormat::R32_FLOAT: + return 1; + case DataFormat::R8G8B8A8_UNORM: + case DataFormat::R8G8B8A8_UNORM_SRGB: + case DataFormat::B8G8R8A8_UNORM: + case DataFormat::B8G8R8A8_UNORM_SRGB: + return 4; + default: + return 0; + } +} + RefCountedObject::~RefCountedObject() { const int rc = refcount_.load(); _dbg_assert_msg_(rc == 0xDEDEDE, "Unexpected refcount %d in object of type '%s'", rc, name_); diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h index ce62382718e2..f7a23795b017 100644 --- a/Common/GPU/thin3d.h +++ b/Common/GPU/thin3d.h @@ -643,6 +643,7 @@ typedef std::functionPopClipRect(); + } else { ImGui::Text("(no valid texture bound)"); // In software mode, we should just decode the texture here. diff --git a/ext/imgui/imgui_impl_thin3d.cpp b/ext/imgui/imgui_impl_thin3d.cpp index bd302a55d51b..4c1ecba7a8e3 100644 --- a/ext/imgui/imgui_impl_thin3d.cpp +++ b/ext/imgui/imgui_impl_thin3d.cpp @@ -114,6 +114,10 @@ void ImGui_ImplThin3d_RenderDrawData(ImDrawData* draw_data, Draw::DrawContext *d boundSampler = bd->fontSampler; } else { size_t index = (size_t)pcmd->TextureId - TEX_ID_OFFSET; + if (index >= bd->tempTextures.size()) { + WARN_LOG(Log::System, "Missing temp texture %d (out of %d)", index, (int)bd->tempTextures.size()); + continue; + } _dbg_assert_(index < bd->tempTextures.size()); switch (bd->tempTextures[index].type) { case RegisteredTextureType::Framebuffer: From b442183259f5ff8e5a872b5a466602238686574a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 19 Dec 2024 10:12:58 +0100 Subject: [PATCH 02/28] Add "Realtime" checkbox to pixel viewer --- UI/ImDebugger/ImDebugger.cpp | 1 + UI/ImDebugger/ImDebugger.h | 1 + UI/ImDebugger/ImGe.cpp | 5 +++++ UI/ImDebugger/ImGe.h | 2 -- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/UI/ImDebugger/ImDebugger.cpp b/UI/ImDebugger/ImDebugger.cpp index 5f14a8529c0d..9cb082cf68bf 100644 --- a/UI/ImDebugger/ImDebugger.cpp +++ b/UI/ImDebugger/ImDebugger.cpp @@ -1552,4 +1552,5 @@ void ImConfig::SyncConfig(IniFile *ini, bool save) { sync.SetSection(ini->GetOrCreateSection("Settings")); sync.Sync("displayLatched", &displayLatched, false); + sync.Sync("realtimePixelPreview", &realtimePixelPreview, false); } diff --git a/UI/ImDebugger/ImDebugger.h b/UI/ImDebugger/ImDebugger.h index 5a8f84cf2d88..c0fa9a037d56 100644 --- a/UI/ImDebugger/ImDebugger.h +++ b/UI/ImDebugger/ImDebugger.h @@ -153,6 +153,7 @@ struct ImConfig { int selectedMemCheck = -1; uint64_t selectedTexAddr = 0; + bool realtimePixelPreview = false; int breakCount = 0; bool displayLatched = false; diff --git a/UI/ImDebugger/ImGe.cpp b/UI/ImDebugger/ImGe.cpp index 9503c7a08e30..52783719a545 100644 --- a/UI/ImDebugger/ImGe.cpp +++ b/UI/ImDebugger/ImGe.cpp @@ -163,9 +163,14 @@ void ImGePixelViewerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInte viewer_.width = viewer_.stride; viewer_.Snapshot(); } + ImGui::Checkbox("Realtime", &cfg.realtimePixelPreview); } ImGui::EndChild(); + if (cfg.realtimePixelPreview) { + viewer_.Snapshot(); + } + ImGui::SameLine(); if (ImGui::BeginChild("right")) { ImVec2 p0 = ImGui::GetCursorScreenPos(); diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h index c642b620efb7..227b0f0c1e52 100644 --- a/UI/ImDebugger/ImGe.h +++ b/UI/ImDebugger/ImGe.h @@ -124,8 +124,6 @@ class ImGePixelViewerWindow { } private: - void UpdateTexture(Draw::DrawContext *draw); - ImGePixelViewer viewer_; }; From c5ad81e3d513f47b1873f05b98d185c0a0e9710b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Tue, 17 Dec 2024 12:58:33 +0100 Subject: [PATCH 03/28] Add DepthRaster.cpp/h. Rasterize depth rectangles, some triangles --- CMakeLists.txt | 2 + Common/Math/CrossSIMD.h | 1 - Core/Compatibility.cpp | 1 + Core/Compatibility.h | 1 + GPU/Common/DepthRaster.cpp | 367 ++++++++++++++++++++++++++++ GPU/Common/DepthRaster.h | 12 + GPU/Common/DrawEngineCommon.h | 5 + GPU/Common/VertexDecoderCommon.h | 6 +- GPU/GPU.vcxproj | 2 + GPU/GPU.vcxproj.filters | 6 + GPU/GPUCommonHW.cpp | 9 + UI/ImDebugger/ImGe.h | 6 +- UWP/GPU_UWP/GPU_UWP.vcxproj | 4 +- UWP/GPU_UWP/GPU_UWP.vcxproj.filters | 4 +- android/jni/Android.mk | 1 + assets/compat.ini | 4 +- libretro/Makefile.common | 1 + 17 files changed, 422 insertions(+), 10 deletions(-) create mode 100644 GPU/Common/DepthRaster.cpp create mode 100644 GPU/Common/DepthRaster.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c1860a2b5332..fcf74005e7f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1906,6 +1906,8 @@ set(GPU_SOURCES GPU/Common/Draw2D.cpp GPU/Common/Draw2D.h GPU/Common/DepthBufferCommon.cpp + GPU/Common/DepthRaster.cpp + GPU/Common/DepthRaster.h GPU/Common/TextureShaderCommon.cpp GPU/Common/TextureShaderCommon.h GPU/Common/DepalettizeShaderCommon.cpp diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 6ad03b832217..94b2d3933b52 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -5,4 +5,3 @@ #pragma once #include "Common/Math/SIMDHeaders.h" - diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp index 379042198011..cd2c50c48835 100644 --- a/Core/Compatibility.cpp +++ b/Core/Compatibility.cpp @@ -149,6 +149,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) { CheckSetting(iniFile, gameID, "DisableMemcpySlicing", &flags_.DisableMemcpySlicing); CheckSetting(iniFile, gameID, "ForceEnableGPUReadback", &flags_.ForceEnableGPUReadback); CheckSetting(iniFile, gameID, "UseFFMPEGFindStreamInfo", &flags_.UseFFMPEGFindStreamInfo); + CheckSetting(iniFile, gameID, "SoftwareRasterDepth", &flags_.SoftwareRasterDepth); } void Compatibility::CheckVRSettings(IniFile &iniFile, const std::string &gameID) { diff --git a/Core/Compatibility.h b/Core/Compatibility.h index 8a0e33af4d34..4688df37c055 100644 --- a/Core/Compatibility.h +++ b/Core/Compatibility.h @@ -112,6 +112,7 @@ struct CompatFlags { bool DisableMemcpySlicing; bool ForceEnableGPUReadback; bool UseFFMPEGFindStreamInfo; + bool SoftwareRasterDepth; }; struct VRCompat { diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp new file mode 100644 index 000000000000..20ac069f8248 --- /dev/null +++ b/GPU/Common/DepthRaster.cpp @@ -0,0 +1,367 @@ +#include + +#include "Common/Math/CrossSIMD.h" +#include "GPU/Common/DepthRaster.h" +#include "GPU/Math3D.h" +#include "Common/Math/math_util.h" +#include "GPU/Common/VertexDecoderCommon.h" + +// TODO: Should respect the scissor rect. + +struct ScreenVert { + int x; + int y; + uint16_t z; + uint16_t behind; +}; + +void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) { + // Swap coordinates if needed, we don't back-face-cull rects. + // We also ignore the UV rotation here. + if (x1 > x2) { + std::swap(x1, x2); + } + if (y1 > y2) { + std::swap(y1, y2); + } + if (x1 == x2 || y1 == y2) { + return; + } + + __m128i valueX8 = _mm_set1_epi16(depthValue); + +#if PPSSPP_ARCH(SSE2) + for (int y = y1; y < y2; y++) { + __m128i *ptr = (__m128i *)(dest + stride * y + x1); + int w = x2 - x1; + + switch (depthCompare) { + case GE_COMP_ALWAYS: + while (w >= 8) { + _mm_storeu_si128(ptr, valueX8); + ptr++; + w -= 8; + } + break; + // TODO: Trailer + case GE_COMP_NEVER: + break; + default: + // TODO + break; + } + } +#elif PPSSPP_ARCH(ARM64_NEON) + +#else + // Do nothing for now +#endif +} + +using namespace Math3D; +struct int2 { + int x, y; + int2(float a, float b) { + x = (int)(a + 0.5f); + y = (int)(b + 0.5f); + } +}; + +// Adapted from Intel's depth rasterizer example. +// Started with the scalar version, will SIMD-ify later. +void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) { + int tileStartX = x1; + int tileEndX = x2; + + int tileStartY = y1; + int tileEndY = y2; + + // Convert to whole pixels for now. Later subpixel precision. + ScreenVert verts[3]; + verts[0].x = vertsSub[0].x >> 4; + verts[0].y = vertsSub[0].y >> 4; + verts[0].z = vertsSub[0].z; + verts[1].x = vertsSub[2].x >> 4; + verts[1].y = vertsSub[2].y >> 4; + verts[1].z = vertsSub[2].z; + verts[2].x = vertsSub[1].x >> 4; + verts[2].y = vertsSub[1].y >> 4; + verts[2].z = vertsSub[1].z; + + // use fixed-point only for X and Y. Avoid work for Z and W. + int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX) & int(0xFFFFFFFE); + int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX); + + int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY) & int(0xFFFFFFFE); + int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY); + + if (endX == startX || endY == startY) { + // No pixels + return; + } + + // Fab(x, y) = Ax + By + C = 0 + // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 + // Compute A = (ya - yb) for the 3 line segments that make up each triangle + int A0 = verts[1].y - verts[2].y; + int A1 = verts[2].y - verts[0].y; + int A2 = verts[0].y - verts[1].y; + + // Compute B = (xb - xa) for the 3 line segments that make up each triangle + int B0 = verts[2].x - verts[1].x; + int B1 = verts[0].x - verts[2].x; + int B2 = verts[1].x - verts[0].x; + + // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle + int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y; + int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y; + int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y; + + // Compute triangle area + int triArea = A0 * verts[0].x + B0 * verts[0].y + C0; + if (triArea <= 0) { + // Too small to rasterize or backface culled + // NOTE: Just disabling this check won't enable two-sided rendering. + // Since it's not that common, let's just queue the triangles with both windings. + return; + } + + float oneOverTriArea = (1.0f / float(triArea)); + + float zz[3]; + for (int vv = 0; vv < 3; vv++) { + zz[vv] = (float)verts[vv].z * oneOverTriArea; + } + + int rowIdx = (startY * stride + startX); + int col = startX; + int row = startY; + + // Calculate slopes at starting corner. + int alpha0 = (A0 * col) + (B0 * row) + C0; + int beta0 = (A1 * col) + (B1 * row) + C1; + int gama0 = (A2 * col) + (B2 * row) + C2; + + // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) + for (int r = startY; r < endY; r++, + row++, + rowIdx += stride, + alpha0 += B0, + beta0 += B1, + gama0 += B2) + { + // Compute barycentric coordinates + int idx = rowIdx; + int alpha = alpha0; + int beta = beta0; + int gama = gama0; + + for (int c = startX; c < endX; c++, + idx++, + alpha += A0, + beta += A1, + gama += A2) + { + int mask = alpha >= 0 && beta >= 0 && gama >= 0; + // Early out if all of this quad's pixels are outside the triangle. + if (!mask) { + continue; + } + // Compute barycentric-interpolated depth + float depth = alpha * zz[0] + beta * zz[1] + gama * zz[2]; + float previousDepthValue = (float)depthBuf[idx]; + + int depthMask; + switch (compareMode) { + case GE_COMP_EQUAL: depthMask = depth == previousDepthValue; break; + case GE_COMP_LESS: depthMask = depth < previousDepthValue; break; + case GE_COMP_LEQUAL: depthMask = depth <= previousDepthValue; break; + case GE_COMP_GEQUAL: depthMask = depth >= previousDepthValue; break; + case GE_COMP_GREATER: depthMask = depth > previousDepthValue; break; + case GE_COMP_NOTEQUAL: depthMask = depth != previousDepthValue; break; + case GE_COMP_ALWAYS: + default: + depthMask = 1; + break; + } + int finalMask = mask & depthMask; + depth = finalMask == 1 ? depth : previousDepthValue; + depthBuf[idx] = (u16)depth; + } //for each column + } // for each row +} + +// We ignore lots of primitive types for now. +void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData, + const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) { + + GEComparison compareMode = gstate.getDepthTestFunction(); + if (gstate.isModeClear()) { + if (!gstate.isClearModeDepthMask()) { + return; + } + compareMode = GE_COMP_ALWAYS; + } else { + if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()) + return; + } + + switch (prim) { + case GE_PRIM_INVALID: + case GE_PRIM_KEEP_PREVIOUS: + case GE_PRIM_LINES: + case GE_PRIM_LINE_STRIP: + case GE_PRIM_POINTS: + return; + default: + break; + } + + // TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without + // running the full decoder. + if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) { + return; + } + + bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0; + + // Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer. + float *verts = (float *)bufferData; + ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8); + + // Simple, most common case. + int vertexStride = dec->VertexSize(); + int offset = dec->posoff; + float factor = 1.0f; + switch (vertTypeID & GE_VTYPE_POS_MASK) { + case GE_VTYPE_POS_8BIT: + if (!isThroughMode) { + factor = 1.0f / 128.0f; + } + for (int i = 0; i < count; i++) { + const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + for (int j = 0; j < 3; j++) { + verts[i * 3 + j] = data[j] * factor; + } + } + break; + case GE_VTYPE_POS_16BIT: + if (!isThroughMode) { + factor = 1.0f / 32768.0f; + } + for (int i = 0; i < count; i++) { + const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); + for (int j = 0; j < 3; j++) { + verts[i * 3 + j] = data[j] * factor; + } + } + break; + case GE_VTYPE_POS_FLOAT: + for (int i = 0; i < count; i++) + memcpy(&verts[i * 3], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3); + break; + } + + // OK, we now have the coordinates. Let's transform, we can actually do this in-place. + if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) { + // TODO: This is very suboptimal. This should be one matrix multiplication per vertex. + + float viewportX = gstate.getViewportXCenter(); + float viewportY = gstate.getViewportYCenter(); + float viewportZ = gstate.getViewportZCenter(); + float viewportScaleX = gstate.getViewportXScale(); + float viewportScaleY = gstate.getViewportYScale(); + float viewportScaleZ = gstate.getViewportZScale(); + + bool allBehind = true; + + for (int i = 0; i < count; i++) { + float world[3]; + float view[3]; + float proj[4]; + Vec3ByMatrix43(world, verts + i * 3, gstate.worldMatrix); + Vec3ByMatrix43(view, world, gstate.viewMatrix); + Vec3ByMatrix44(proj, view, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? + + float w = proj[3]; + + bool inFront = w > 0.0f; + screenVerts[i].behind = !inFront; + if (inFront) { + allBehind = false; + } + + // Clip to the w=0 plane. + proj[0] /= w; + proj[1] /= w; + proj[2] /= w; + + // Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport + // and offset params. + float screen[3]; + screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16(); + screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16(); + screen[2] = (proj[2] * viewportScaleZ + viewportZ); + if (screen[2] < 0.0f) { + screen[2] = 0.0f; + } + if (screen[2] >= 65535.0f) { + screen[2] = 65535.0f; + } + screenVerts[i].x = screen[0]; + screenVerts[i].y = screen[1]; + screenVerts[i].z = screen[2]; + } + if (allBehind) { + // Cull the whole draw. + return; + } + } else { + for (int i = 0; i < count; i++) { + screenVerts[i].x = (int)verts[i * 3 + 0] << 4; + screenVerts[i].y = (int)verts[i * 3 + 1] << 4; + screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f); + } + } + + // Then we need to stitch primitives from strips, etc etc... + // For now we'll just do it tri by tri. Later let's be more efficient. + + switch (prim) { + case GE_PRIM_RECTANGLES: + for (int i = 0; i < count / 2; i++) { + uint16_t z = screenVerts[i + 1].z; // depth from second vertex + // We remove the subpixel information here. + DepthRasterRect(depth, depthStride, screenVerts[i].x >> 4, screenVerts[i].y >> 4, screenVerts[i + 1].x >> 4, screenVerts[i + 1].y >> 4, + z, compareMode); + } + break; + case GE_PRIM_TRIANGLES: + for (int i = 0; i < count / 3; i++) { + if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) { + continue; + } + DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode); + } + break; + case GE_PRIM_TRIANGLE_STRIP: + { + int wind = 2; + for (int i = 0; i < count - 2; i++) { + int i0 = i; + int i1 = i + wind; + wind ^= 3; + int i2 = i + wind; + if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) { + continue; + } + ScreenVert v[3]; + v[0] = screenVerts[i0]; + v[1] = screenVerts[i1]; + v[2] = screenVerts[i2]; + DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode); + } + break; + } + } +} diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h new file mode 100644 index 000000000000..01fa60e257d1 --- /dev/null +++ b/GPU/Common/DepthRaster.h @@ -0,0 +1,12 @@ +#pragma once + +#include "Common/CommonTypes.h" +#include "GPU/ge_constants.h" + +// Specialized, very limited depth-only rasterizer. +// Meant to run in parallel with hardware rendering, in games that read back the depth buffer +// for effects like lens flare. +// So, we can be quite inaccurate without any issues, and skip a lot of functionality. + +class VertexDecoder; +void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise); diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 0f8ab8a7515a..595ab929aab4 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -158,6 +158,11 @@ class DrawEngineCommon { _dbg_assert_(numDrawVerts_ == 0 && numDrawInds_ == 0); } + // temporary hack + uint8_t *GetTempSpace() { + return decoded_ + 12 * 65536; + } + protected: virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; } void UpdatePlanes(); diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h index 2793ad80eed5..4b905f03c956 100644 --- a/GPU/Common/VertexDecoderCommon.h +++ b/GPU/Common/VertexDecoderCommon.h @@ -122,7 +122,7 @@ class IndexConverter { // Reads decoded vertex formats in a convenient way. For software transform and debugging. class VertexReader { public: - VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {} + VertexReader(const u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {} void ReadPos(float pos[3]) const { // Only DEC_FLOAT_3 is supported. @@ -297,8 +297,8 @@ class VertexReader { } private: - u8 *base_; - u8 *data_; + const u8 *base_; + const u8 *data_; DecVtxFormat decFmt_; int vtype_; }; diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj index 5cb3ea9e6238..c27d08354936 100644 --- a/GPU/GPU.vcxproj +++ b/GPU/GPU.vcxproj @@ -346,6 +346,7 @@ + @@ -468,6 +469,7 @@ + diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters index 610ba94cbe33..1529b974c13f 100644 --- a/GPU/GPU.vcxproj.filters +++ b/GPU/GPU.vcxproj.filters @@ -279,6 +279,9 @@ Debugger + + Common + @@ -554,6 +557,9 @@ Debugger + + Common + diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index 9b5389c8750a..f5383cf6be59 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -13,6 +13,7 @@ #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/TextureCacheCommon.h" #include "GPU/Common/FramebufferManagerCommon.h" +#include "GPU/Common/DepthRaster.h" struct CommonCommandTableEntry { uint8_t cmd; @@ -1039,6 +1040,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) { canExtend = false; + } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { + DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), + verts, inds, prim, count, decoder, vertTypeID, false); } onePassed = true; } else { @@ -1117,6 +1122,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) { canExtend = false; + } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { + DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), + verts, inds, newPrim, count, decoder, vertTypeID, clockwise); } // As soon as one passes, assume we don't need to check the rest of this batch. onePassed = true; diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h index 227b0f0c1e52..41dc6a8b1f94 100644 --- a/UI/ImDebugger/ImGe.h +++ b/UI/ImDebugger/ImGe.h @@ -71,14 +71,14 @@ struct ImGePixelViewer : public PixelLookup { } bool FormatValueAt(char *buf, size_t bufSize, int x, int y) const override; - uint32_t addr = 0x04000000; + uint32_t addr = 0x04110000; uint16_t stride = 512; uint16_t width = 480; uint16_t height = 272; - GEBufferFormat format = GE_FORMAT_565; + GEBufferFormat format = GE_FORMAT_DEPTH16; bool useAlpha = false; bool showAlpha = false; - float scale = 1.0f; + float scale = 20.0f; private: void UpdateTexture(Draw::DrawContext *draw); diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj index 7bb4b346bd8f..a7ba27a14097 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj @@ -109,6 +109,7 @@ + @@ -177,6 +178,7 @@ + @@ -261,4 +263,4 @@ - + \ No newline at end of file diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters index 84b4c5d39630..31d14b549feb 100644 --- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters +++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters @@ -80,6 +80,7 @@ Debugger + @@ -163,10 +164,11 @@ Debugger + {49bcf7f6-518a-4ecd-af55-bda3a344efe7} - + \ No newline at end of file diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 10ab9a5f77a3..dbd88097886f 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -530,6 +530,7 @@ EXEC_AND_LIB_FILES := \ $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \ $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \ $(SRC)/GPU/Common/DepthBufferCommon.cpp \ + $(SRC)/GPU/Common/DepthRaster.cpp \ $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \ $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \ $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \ diff --git a/assets/compat.ini b/assets/compat.ini index 8a374753efb2..0452c5b85484 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -1228,8 +1228,10 @@ ULJS19067 = true ULAS42247 = true ULAS42318 = true +[SoftwareRasterDepth] + [DisableFirstFrameReadback] -# Wipeout Pure: Temporary workaround for lens flare flicker. See #13344 +# Wipeout Pure UCUS98612 = true UCJS10007 = true UCES00001 = true diff --git a/libretro/Makefile.common b/libretro/Makefile.common index 804a1d72199f..c1cb5a454fed 100644 --- a/libretro/Makefile.common +++ b/libretro/Makefile.common @@ -543,6 +543,7 @@ SOURCES_CXX += \ $(GPUDIR)/Common/TextureScalerCommon.cpp \ $(GPUDIR)/Common/SoftwareTransformCommon.cpp \ $(GPUDIR)/Common/DepthBufferCommon.cpp \ + $(GPUDIR)/Common/DepthRaster.cpp \ $(GPUDIR)/Common/StencilCommon.cpp \ $(GPUDIR)/Software/TransformUnit.cpp \ $(GPUDIR)/Software/SoftGpu.cpp \ From d27d8c9dae7497bec335fe6caf0a8f86475e4cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 19 Dec 2024 09:36:50 +0100 Subject: [PATCH 04/28] Remove subpixel precision. Some sketching. --- GPU/Common/DepthRaster.cpp | 198 ++++++++++++++++++++++++++++++------- 1 file changed, 161 insertions(+), 37 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 20ac069f8248..afeb4b64a107 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -1,4 +1,5 @@ #include +#include #include "Common/Math/CrossSIMD.h" #include "GPU/Common/DepthRaster.h" @@ -6,7 +7,91 @@ #include "Common/Math/math_util.h" #include "GPU/Common/VertexDecoderCommon.h" -// TODO: Should respect the scissor rect. +#if PPSSPP_ARCH(SSE2) + +struct Vec4S32 { + __m128i v; + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ _mm_add_epi32(v, other.v) }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ _mm_sub_epi32(v, other.v) }; + } + // This is really bad if we restrict ourselves to SSE2 only. + // If we have SSE4, we can do _mm_mullo_epi32. + // Let's avoid using it as much as possible. + // https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse + Vec4S32 operator *(Vec4S32 other) const { + __m128i a13 = _mm_shuffle_epi32(v, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(other.v, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(v, other.v); // (-,a2*b2,-,a0*b0) + __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) + __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0) + __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2) + return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) }; // (ab3,ab2,ab1,ab0) + } +}; + +struct Vec4F32 { + __m128 v; + + static Vec4F32 FromVec4S32(Vec4S32 other) { + return Vec4F32{ _mm_cvtepi32_ps(other.v) }; + } + + Vec4F32 operator +(Vec4F32 other) const { + return Vec4F32{ _mm_add_ps(v, other.v) }; + } + Vec4F32 operator -(Vec4F32 other) const { + return Vec4F32{ _mm_sub_ps(v, other.v) }; + } + Vec4F32 operator *(Vec4F32 other) const { + return Vec4F32{ _mm_mul_ps(v, other.v) }; + } +}; + +#elif PPSSPP_ARCH(ARM_NEON) + +struct Vec4S32 { + uint32x4_t v; + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ vaddq_s32(v, other.v) }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ vsubq_s32(v, other.v) }; + } + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ vmulq_s32(v, other.v) }; + } +}; + +struct Vec4F32 { + float32x4_t v; + + static Vec4F32 FromVec4S32(Vec4S32 other) { + return Vec4F32{ _mm_cvtepi32_ps(other.v) }; + } + + Vec4F32 operator +(Vec4F32 other) const { + return Vec4F32{ vaddq_f32(v, other.v) }; + } + Vec4F32 operator -(Vec4F32 other) const { + return Vec4F32{ vsubq_f32(v, other.v) }; + } + Vec4F32 operator *(Vec4F32 other) const { + return Vec4F32{ vmulq_f32(v, other.v) }; + } +}; + +#else + +struct Vec4S32 { + s32 v[4]; +}; + +#endif struct ScreenVert { int x; @@ -28,19 +113,21 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, return; } - __m128i valueX8 = _mm_set1_epi16(depthValue); - #if PPSSPP_ARCH(SSE2) + __m128i valueX8 = _mm_set1_epi16(depthValue); for (int y = y1; y < y2; y++) { __m128i *ptr = (__m128i *)(dest + stride * y + x1); int w = x2 - x1; - switch (depthCompare) { case GE_COMP_ALWAYS: - while (w >= 8) { - _mm_storeu_si128(ptr, valueX8); - ptr++; - w -= 8; + if (depthValue == 0) { + memset(ptr, 0, w * 2); + } else { + while (w >= 8) { + _mm_storeu_si128(ptr, valueX8); + ptr++; + w -= 8; + } } break; // TODO: Trailer @@ -51,8 +138,33 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, break; } } + #elif PPSSPP_ARCH(ARM64_NEON) + uint16x8_t valueX8 = vdupq_n_u16(depthValue); + for (int y = y1; y < y2; y++) { + uint16_t *ptr = (uint16_t *)(dest + stride * y + x1); + int w = x2 - x1; + switch (depthCompare) { + case GE_COMP_ALWAYS: + if (depthValue == 0) { + memset(ptr, 0, w * 2); + } else { + while (w >= 8) { + vst1q_u16(ptr, valueX8); + ptr += 8; + w -= 8; + } + } + break; + // TODO: Trailer + case GE_COMP_NEVER: + break; + default: + // TODO + break; + } + } #else // Do nothing for now #endif @@ -69,6 +181,7 @@ struct int2 { // Adapted from Intel's depth rasterizer example. // Started with the scalar version, will SIMD-ify later. +// x1/y1 etc are the scissor rect. void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) { int tileStartX = x1; int tileEndX = x2; @@ -76,29 +189,33 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int tileStartY = y1; int tileEndY = y2; + // BEGIN triangle setup. This should be done SIMD, four triangles at a time. + // Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls + // are slow on SSE2. + // Convert to whole pixels for now. Later subpixel precision. ScreenVert verts[3]; - verts[0].x = vertsSub[0].x >> 4; - verts[0].y = vertsSub[0].y >> 4; + verts[0].x = vertsSub[0].x; + verts[0].y = vertsSub[0].y; verts[0].z = vertsSub[0].z; - verts[1].x = vertsSub[2].x >> 4; - verts[1].y = vertsSub[2].y >> 4; + verts[1].x = vertsSub[2].x; + verts[1].y = vertsSub[2].y; verts[1].z = vertsSub[2].z; - verts[2].x = vertsSub[1].x >> 4; - verts[2].y = vertsSub[1].y >> 4; + verts[2].x = vertsSub[1].x; + verts[2].y = vertsSub[1].y; verts[2].z = vertsSub[1].z; // use fixed-point only for X and Y. Avoid work for Z and W. - int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX) & int(0xFFFFFFFE); + int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX); int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX); - int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY) & int(0xFFFFFFFE); + int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY); int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY); - if (endX == startX || endY == startY) { // No pixels return; } + // TODO: Cull really small triangles here. // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 @@ -126,13 +243,6 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, return; } - float oneOverTriArea = (1.0f / float(triArea)); - - float zz[3]; - for (int vv = 0; vv < 3; vv++) { - zz[vv] = (float)verts[vv].z * oneOverTriArea; - } - int rowIdx = (startY * stride + startX); int col = startX; int row = startY; @@ -140,7 +250,15 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, // Calculate slopes at starting corner. int alpha0 = (A0 * col) + (B0 * row) + C0; int beta0 = (A1 * col) + (B1 * row) + C1; - int gama0 = (A2 * col) + (B2 * row) + C2; + int gamma0 = (A2 * col) + (B2 * row) + C2; + + float oneOverTriArea = (1.0f / float(triArea)); + + // END triangle setup. + float zz[3]; + for (int vv = 0; vv < 3; vv++) { + zz[vv] = (float)verts[vv].z * oneOverTriArea; + } // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startY; r < endY; r++, @@ -148,27 +266,28 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, rowIdx += stride, alpha0 += B0, beta0 += B1, - gama0 += B2) + gamma0 += B2) { - // Compute barycentric coordinates int idx = rowIdx; + + // Restore row steppers. int alpha = alpha0; int beta = beta0; - int gama = gama0; + int gamma = gamma0; for (int c = startX; c < endX; c++, idx++, alpha += A0, beta += A1, - gama += A2) + gamma += A2) { - int mask = alpha >= 0 && beta >= 0 && gama >= 0; + int mask = alpha >= 0 && beta >= 0 && gamma >= 0; // Early out if all of this quad's pixels are outside the triangle. if (!mask) { continue; } // Compute barycentric-interpolated depth - float depth = alpha * zz[0] + beta * zz[1] + gama * zz[2]; + float depth = alpha * zz[0] + beta * zz[1] + gamma * zz[2]; float previousDepthValue = (float)depthBuf[idx]; int depthMask; @@ -224,6 +343,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i } bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0; + bool cullEnabled = false; + bool cullCCW = false; // Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer. float *verts = (float *)bufferData; @@ -244,7 +365,7 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i verts[i * 3 + j] = data[j] * factor; } } - break; + break; case GE_VTYPE_POS_16BIT: if (!isThroughMode) { factor = 1.0f / 32768.0f; @@ -264,6 +385,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i // OK, we now have the coordinates. Let's transform, we can actually do this in-place. if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) { + cullEnabled = gstate.isCullEnabled(); + // TODO: This is very suboptimal. This should be one matrix multiplication per vertex. float viewportX = gstate.getViewportXCenter(); @@ -308,8 +431,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i if (screen[2] >= 65535.0f) { screen[2] = 65535.0f; } - screenVerts[i].x = screen[0]; - screenVerts[i].y = screen[1]; + screenVerts[i].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. + screenVerts[i].y = screen[1] * (1.0f / 16.0f); screenVerts[i].z = screen[2]; } if (allBehind) { @@ -318,8 +441,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i } } else { for (int i = 0; i < count; i++) { - screenVerts[i].x = (int)verts[i * 3 + 0] << 4; - screenVerts[i].y = (int)verts[i * 3 + 1] << 4; + screenVerts[i].x = (int)verts[i * 3 + 0]; + screenVerts[i].y = (int)verts[i * 3 + 1]; screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f); } } @@ -331,8 +454,9 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i case GE_PRIM_RECTANGLES: for (int i = 0; i < count / 2; i++) { uint16_t z = screenVerts[i + 1].z; // depth from second vertex + // TODO: Should clip coordinates to the scissor rectangle. // We remove the subpixel information here. - DepthRasterRect(depth, depthStride, screenVerts[i].x >> 4, screenVerts[i].y >> 4, screenVerts[i + 1].x >> 4, screenVerts[i + 1].y >> 4, + DepthRasterRect(depth, depthStride, screenVerts[i].x, screenVerts[i].y, screenVerts[i + 1].x, screenVerts[i + 1].y, z, compareMode); } break; From 09afe363ca963df852da031ae3dc0d845817903f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 19 Dec 2024 10:23:30 +0100 Subject: [PATCH 05/28] One less operation in the inner loop --- GPU/Common/DepthRaster.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index afeb4b64a107..af20c9c89a41 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -254,11 +254,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, float oneOverTriArea = (1.0f / float(triArea)); - // END triangle setup. float zz[3]; - for (int vv = 0; vv < 3; vv++) { - zz[vv] = (float)verts[vv].z * oneOverTriArea; - } + zz[0] = (float)verts[0].z; + zz[1] = (float)(verts[1].z - verts[0].z) * oneOverTriArea; + zz[2] = (float)(verts[2].z - verts[0].z) * oneOverTriArea; + + // END triangle setup. // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startY; r < endY; r++, @@ -287,7 +288,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, continue; } // Compute barycentric-interpolated depth - float depth = alpha * zz[0] + beta * zz[1] + gamma * zz[2]; + float depth = zz[0] + beta * zz[1] + gamma * zz[2]; float previousDepthValue = (float)depthBuf[idx]; int depthMask; From 72c954d8c31176ca2502ba74af4c13d57f8a8c4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Thu, 19 Dec 2024 10:38:56 +0100 Subject: [PATCH 06/28] Add convenient wrappers --- GPU/Common/DepthRaster.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index af20c9c89a41..2bb83831bbb5 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "Common/Math/CrossSIMD.h" #include "GPU/Common/DepthRaster.h" @@ -18,18 +19,9 @@ struct Vec4S32 { Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; } - // This is really bad if we restrict ourselves to SSE2 only. - // If we have SSE4, we can do _mm_mullo_epi32. - // Let's avoid using it as much as possible. - // https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse + // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. Vec4S32 operator *(Vec4S32 other) const { - __m128i a13 = _mm_shuffle_epi32(v, 0xF5); // (-,a3,-,a1) - __m128i b13 = _mm_shuffle_epi32(other.v, 0xF5); // (-,b3,-,b1) - __m128i prod02 = _mm_mul_epu32(v, other.v); // (-,a2*b2,-,a0*b0) - __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) - __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0) - __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2) - return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) }; // (ab3,ab2,ab1,ab0) + return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0) } }; @@ -234,7 +226,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y; int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y; - // Compute triangle area + // Compute triangle area. + // TODO: Cull really small triangles here - we can just raise the comparison value below. int triArea = A0 * verts[0].x + B0 * verts[0].y + C0; if (triArea <= 0) { // Too small to rasterize or backface culled @@ -287,7 +280,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, if (!mask) { continue; } - // Compute barycentric-interpolated depth + // Compute barycentric-interpolated depth. Could also compute it incrementally. float depth = zz[0] + beta * zz[1] + gamma * zz[2]; float previousDepthValue = (float)depthBuf[idx]; From c92b3b6521cc0a6976ec2a3c0bf918cab0f5e62f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 08:48:16 +0100 Subject: [PATCH 07/28] Move prototype cross simd wrapper structs to CrossSIMD.h --- Common/Data/Convert/ColorConv.cpp | 4 +- Common/Math/CrossSIMD.h | 96 +++++++++++++++++++++++++++++++ Common/Math/SIMDHeaders.h | 22 +++++++ Common/Math/fast/fast_matrix.c | 2 - GPU/Common/DepthRaster.cpp | 77 ------------------------- 5 files changed, 120 insertions(+), 81 deletions(-) diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp index 72fac52f2f01..5c4df7fca808 100644 --- a/Common/Data/Convert/ColorConv.cpp +++ b/Common/Data/Convert/ColorConv.cpp @@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) { } #if PPSSPP_ARCH(SSE2) -// fp64's improved version, see #19751 +// fp64's improved SSE2 version, see #19751. SSE4 no longer required here. static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) { const __m128i maskRB = _mm_set1_epi32(0x00F800F8); const __m128i maskGA = _mm_set1_epi32(0x8000F800); @@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, __m128i c0 = _mm_load_si128(&srcp[i + 0]); __m128i c1 = _mm_load_si128(&srcp[i + 1]); - __m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 + __m128i rb0 = _mm_and_si128(c0, maskRB); // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane) __m128i rb1 = _mm_and_si128(c1, maskRB); // 00000000bbbbb00000000000rrrrr000 __m128i ga0 = _mm_and_si128(c0, maskGA); // a000000000000000ggggg00000000000 __m128i ga1 = _mm_and_si128(c1, maskGA); // a000000000000000ggggg00000000000 diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 94b2d3933b52..11ed21702486 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -5,3 +5,99 @@ #pragma once #include "Common/Math/SIMDHeaders.h" + +#if PPSSPP_ARCH(SSE2) + +struct Vec4S32 { + __m128i v; + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ _mm_add_epi32(v, other.v) }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ _mm_sub_epi32(v, other.v) }; + } + // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0) + } +}; + +struct Vec4F32 { + __m128 v; + + static Vec4F32 FromVec4S32(Vec4S32 other) { + return Vec4F32{ _mm_cvtepi32_ps(other.v) }; + } + + Vec4F32 operator +(Vec4F32 other) const { + return Vec4F32{ _mm_add_ps(v, other.v) }; + } + Vec4F32 operator -(Vec4F32 other) const { + return Vec4F32{ _mm_sub_ps(v, other.v) }; + } + Vec4F32 operator *(Vec4F32 other) const { + return Vec4F32{ _mm_mul_ps(v, other.v) }; + } +}; + +struct Vec4U16 { + __m128i v; // we only use the lower 64 bits. + static Vec4U16 Load(void *mem) { + return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; + } + void Store(void *mem) { + _mm_storel_epi64((__m128i *)mem, v); + } + static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { + return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; + } + static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { + return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; + } + Vec4U16 CompareLT(Vec4U16 other) { + return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; + } +}; + +#elif PPSSPP_ARCH(ARM_NEON) + +struct Vec4S32 { + int32x4_t v; + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ vaddq_s32(v, other.v) }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ vsubq_s32(v, other.v) }; + } + Vec4S32 operator *(Vec4S32 other) const { + return Vec4S32{ vmulq_s32(v, other.v) }; + } +}; + +struct Vec4F32 { + float32x4_t v; + + static Vec4F32 FromVec4S32(Vec4S32 other) { + return Vec4F32{ vcvtq_f32_s32(other.v) }; + } + + Vec4F32 operator +(Vec4F32 other) const { + return Vec4F32{ vaddq_f32(v, other.v) }; + } + Vec4F32 operator -(Vec4F32 other) const { + return Vec4F32{ vsubq_f32(v, other.v) }; + } + Vec4F32 operator *(Vec4F32 other) const { + return Vec4F32{ vmulq_f32(v, other.v) }; + } +}; + +#else + +struct Vec4S32 { + s32 v[4]; +}; + +#endif diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h index 8e812a7819e6..3f8500dfb273 100644 --- a/Common/Math/SIMDHeaders.h +++ b/Common/Math/SIMDHeaders.h @@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) { return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0))); } +// The below are not real SSE instructions in any generation, but should exist. + +// Return 0xFFFF where x <= y, else 0x0000. +inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) { + return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128()); +} + +// Return 0xFFFF where x >= y, else 0x0000. +inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) { + return _mm_cmple_epu16(y, x); +} + +// Return 0xFFFF where x > y, else 0x0000. +inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) { + return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x)); +} + +// Return 0xFFFF where x < y, else 0x0000. +inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) { + return _mm_cmpgt_epu16(y, x); +} + #endif diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c index 0402f366297e..d23ce3b0e0b2 100644 --- a/Common/Math/fast/fast_matrix.c +++ b/Common/Math/fast/fast_matrix.c @@ -6,8 +6,6 @@ #if PPSSPP_ARCH(SSE2) -#include "fast_matrix.h" - void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) { int i; __m128 a_col_1 = _mm_loadu_ps(a); diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 2bb83831bbb5..8068df066f29 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -8,83 +8,6 @@ #include "Common/Math/math_util.h" #include "GPU/Common/VertexDecoderCommon.h" -#if PPSSPP_ARCH(SSE2) - -struct Vec4S32 { - __m128i v; - - Vec4S32 operator +(Vec4S32 other) const { - return Vec4S32{ _mm_add_epi32(v, other.v) }; - } - Vec4S32 operator -(Vec4S32 other) const { - return Vec4S32{ _mm_sub_epi32(v, other.v) }; - } - // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. - Vec4S32 operator *(Vec4S32 other) const { - return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0) - } -}; - -struct Vec4F32 { - __m128 v; - - static Vec4F32 FromVec4S32(Vec4S32 other) { - return Vec4F32{ _mm_cvtepi32_ps(other.v) }; - } - - Vec4F32 operator +(Vec4F32 other) const { - return Vec4F32{ _mm_add_ps(v, other.v) }; - } - Vec4F32 operator -(Vec4F32 other) const { - return Vec4F32{ _mm_sub_ps(v, other.v) }; - } - Vec4F32 operator *(Vec4F32 other) const { - return Vec4F32{ _mm_mul_ps(v, other.v) }; - } -}; - -#elif PPSSPP_ARCH(ARM_NEON) - -struct Vec4S32 { - uint32x4_t v; - - Vec4S32 operator +(Vec4S32 other) const { - return Vec4S32{ vaddq_s32(v, other.v) }; - } - Vec4S32 operator -(Vec4S32 other) const { - return Vec4S32{ vsubq_s32(v, other.v) }; - } - Vec4S32 operator *(Vec4S32 other) const { - return Vec4S32{ vmulq_s32(v, other.v) }; - } -}; - -struct Vec4F32 { - float32x4_t v; - - static Vec4F32 FromVec4S32(Vec4S32 other) { - return Vec4F32{ _mm_cvtepi32_ps(other.v) }; - } - - Vec4F32 operator +(Vec4F32 other) const { - return Vec4F32{ vaddq_f32(v, other.v) }; - } - Vec4F32 operator -(Vec4F32 other) const { - return Vec4F32{ vsubq_f32(v, other.v) }; - } - Vec4F32 operator *(Vec4F32 other) const { - return Vec4F32{ vmulq_f32(v, other.v) }; - } -}; - -#else - -struct Vec4S32 { - s32 v[4]; -}; - -#endif - struct ScreenVert { int x; int y; From c7f0eabc6509002d440c428d4069b254278542e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 08:58:37 +0100 Subject: [PATCH 08/28] DepthRaster: Premultiply world-view-proj matrices --- GPU/Common/DepthRaster.cpp | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 8068df066f29..da85632c38ed 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -85,15 +85,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, #endif } -using namespace Math3D; -struct int2 { - int x, y; - int2(float a, float b) { - x = (int)(a + 0.5f); - y = (int)(b + 0.5f); - } -}; - // Adapted from Intel's depth rasterizer example. // Started with the scalar version, will SIMD-ify later. // x1/y1 etc are the scissor rect. @@ -127,7 +118,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY); int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY); if (endX == startX || endY == startY) { - // No pixels + // No pixels, or outside screen. return; } // TODO: Cull really small triangles here. @@ -300,6 +291,15 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i break; } + float world[16]; + float view[16]; + float worldview[16]; + float worldviewproj[16]; + ConvertMatrix4x3To4x4(world, gstate.worldMatrix); + ConvertMatrix4x3To4x4(view, gstate.viewMatrix); + Matrix4ByMatrix4(worldview, world, view); + Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); + // OK, we now have the coordinates. Let's transform, we can actually do this in-place. if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) { cullEnabled = gstate.isCullEnabled(); @@ -316,12 +316,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i bool allBehind = true; for (int i = 0; i < count; i++) { - float world[3]; - float view[3]; float proj[4]; - Vec3ByMatrix43(world, verts + i * 3, gstate.worldMatrix); - Vec3ByMatrix43(view, world, gstate.viewMatrix); - Vec3ByMatrix44(proj, view, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? + Vec3ByMatrix44(proj, verts + i * 3, worldviewproj); // TODO: Include adjustments to the proj matrix? float w = proj[3]; From dd315182723a0848977ea12c71c89e2c63d00397 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 09:09:18 +0100 Subject: [PATCH 09/28] DepthRaster: Merge the decode and transform steps --- GPU/Common/DepthRaster.cpp | 118 +++++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index da85632c38ed..7faefdb61114 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -255,56 +255,26 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i bool cullCCW = false; // Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer. - float *verts = (float *)bufferData; + float *transformed = (float *)bufferData; + ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8); // Simple, most common case. int vertexStride = dec->VertexSize(); int offset = dec->posoff; - float factor = 1.0f; - switch (vertTypeID & GE_VTYPE_POS_MASK) { - case GE_VTYPE_POS_8BIT: - if (!isThroughMode) { - factor = 1.0f / 128.0f; - } - for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; - for (int j = 0; j < 3; j++) { - verts[i * 3 + j] = data[j] * factor; - } - } - break; - case GE_VTYPE_POS_16BIT: - if (!isThroughMode) { - factor = 1.0f / 32768.0f; - } - for (int i = 0; i < count; i++) { - const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); - for (int j = 0; j < 3; j++) { - verts[i * 3 + j] = data[j] * factor; - } - } - break; - case GE_VTYPE_POS_FLOAT: - for (int i = 0; i < count; i++) - memcpy(&verts[i * 3], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3); - break; - } - - float world[16]; - float view[16]; - float worldview[16]; - float worldviewproj[16]; - ConvertMatrix4x3To4x4(world, gstate.worldMatrix); - ConvertMatrix4x3To4x4(view, gstate.viewMatrix); - Matrix4ByMatrix4(worldview, world, view); - Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // OK, we now have the coordinates. Let's transform, we can actually do this in-place. if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) { - cullEnabled = gstate.isCullEnabled(); + float world[16]; + float view[16]; + float worldview[16]; + float worldviewproj[16]; + ConvertMatrix4x3To4x4(world, gstate.worldMatrix); + ConvertMatrix4x3To4x4(view, gstate.viewMatrix); + Matrix4ByMatrix4(worldview, world, view); + Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? - // TODO: This is very suboptimal. This should be one matrix multiplication per vertex. + cullEnabled = gstate.isCullEnabled(); float viewportX = gstate.getViewportXCenter(); float viewportY = gstate.getViewportYCenter(); @@ -315,9 +285,39 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i bool allBehind = true; + float temp[3]; + for (int i = 0; i < count; i++) { + switch (vertTypeID & GE_VTYPE_POS_MASK) { + case GE_VTYPE_POS_8BIT: + for (int i = 0; i < count; i++) { + const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + for (int j = 0; j < 3; j++) { + temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere? + } + Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj); + } + break; + case GE_VTYPE_POS_16BIT: + for (int i = 0; i < count; i++) { + const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); + for (int j = 0; j < 3; j++) { + temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere? + } + Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj); + } + break; + case GE_VTYPE_POS_FLOAT: + for (int i = 0; i < count; i++) { + const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); + Vec3ByMatrix44(transformed + i * 4, data, worldviewproj); + } + break; + } + } + for (int i = 0; i < count; i++) { float proj[4]; - Vec3ByMatrix44(proj, verts + i * 3, worldviewproj); // TODO: Include adjustments to the proj matrix? + memcpy(proj, transformed + i * 4, 4 * sizeof(float)); float w = proj[3]; @@ -353,10 +353,38 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i return; } } else { + float factor = 1.0f; + switch (vertTypeID & GE_VTYPE_POS_MASK) { + case GE_VTYPE_POS_8BIT: + for (int i = 0; i < count; i++) { + const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + for (int j = 0; j < 3; j++) { + transformed[i * 4 + j] = data[j] * factor; + } + transformed[i * 4 + 3] = 1.0f; + } + break; + case GE_VTYPE_POS_16BIT: + for (int i = 0; i < count; i++) { + const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); + for (int j = 0; j < 3; j++) { + transformed[i * 4 + j] = data[j] * factor; + } + transformed[i * 4 + 3] = 1.0f; + } + break; + case GE_VTYPE_POS_FLOAT: + for (int i = 0; i < count; i++) { + memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3); + transformed[i * 4 + 3] = 1.0f; + } + break; + } + for (int i = 0; i < count; i++) { - screenVerts[i].x = (int)verts[i * 3 + 0]; - screenVerts[i].y = (int)verts[i * 3 + 1]; - screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f); + screenVerts[i].x = (int)transformed[i * 4 + 0]; + screenVerts[i].y = (int)transformed[i * 4 + 1]; + screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f); } } From bdb5f3a91b64d5db894338bff574ba1cbba9c93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 10:30:23 +0100 Subject: [PATCH 10/28] Reorganize the depth vertex pipeline for future optimizations --- GPU/Common/DepthRaster.cpp | 267 +++++++++++--------------------- GPU/Common/DepthRaster.h | 13 +- GPU/Common/DrawEngineCommon.cpp | 74 ++++++++- GPU/Common/DrawEngineCommon.h | 15 ++ GPU/GPUCommonHW.cpp | 9 -- GPU/Vulkan/DrawEngineVulkan.cpp | 9 ++ 6 files changed, 201 insertions(+), 186 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 7faefdb61114..e99160f89bc3 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -8,13 +8,6 @@ #include "Common/Math/math_util.h" #include "GPU/Common/VertexDecoderCommon.h" -struct ScreenVert { - int x; - int y; - uint16_t z; - uint16_t behind; -}; - void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) { // Swap coordinates if needed, we don't back-face-cull rects. // We also ignore the UV rotation here. @@ -88,7 +81,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, // Adapted from Intel's depth rasterizer example. // Started with the scalar version, will SIMD-ify later. // x1/y1 etc are the scissor rect. -void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) { +void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) { int tileStartX = x1; int tileEndX = x2; @@ -100,7 +93,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, // are slow on SSE2. // Convert to whole pixels for now. Later subpixel precision. - ScreenVert verts[3]; + DepthScreenVertex verts[3]; verts[0].x = vertsSub[0].x; verts[0].y = vertsSub[0].y; verts[0].z = vertsSub[0].z; @@ -218,179 +211,123 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, } // for each row } -// We ignore lots of primitive types for now. -void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData, - const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) { +void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) { + // TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder. + _dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0); - GEComparison compareMode = gstate.getDepthTestFunction(); - if (gstate.isModeClear()) { - if (!gstate.isClearModeDepthMask()) { - return; - } - compareMode = GE_COMP_ALWAYS; - } else { - if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()) - return; - } + int vertexStride = dec->VertexSize(); + int offset = dec->posoff; - switch (prim) { - case GE_PRIM_INVALID: - case GE_PRIM_KEEP_PREVIOUS: - case GE_PRIM_LINES: - case GE_PRIM_LINE_STRIP: - case GE_PRIM_POINTS: - return; - default: + float temp[3]; + switch (vertTypeID & GE_VTYPE_POS_MASK) { + case GE_VTYPE_POS_8BIT: + for (int i = 0; i < count; i++) { + const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + for (int j = 0; j < 3; j++) { + temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere? + } + Vec3ByMatrix44(dest + i * 4, temp, worldviewproj); + } + break; + case GE_VTYPE_POS_16BIT: + for (int i = 0; i < count; i++) { + const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); + for (int j = 0; j < 3; j++) { + temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere? + } + Vec3ByMatrix44(dest + i * 4, temp, worldviewproj); + } + break; + case GE_VTYPE_POS_FLOAT: + for (int i = 0; i < count; i++) { + const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); + Vec3ByMatrix44(dest + i * 4, data, worldviewproj); + } break; } +} - // TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without - // running the full decoder. - if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) { - return; +void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) { + for (int i = 0; i < count; i++) { + screenVerts[i].x = (int)transformed[i].pos[0]; + screenVerts[i].y = (int)transformed[i].pos[1]; + screenVerts[i].z = (u16)transformed[i].pos[2]; } +} - bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0; - bool cullEnabled = false; - bool cullCCW = false; - - // Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer. - float *transformed = (float *)bufferData; +int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) { + bool cullEnabled = gstate.isCullEnabled(); - ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8); + const float viewportX = gstate.getViewportXCenter(); + const float viewportY = gstate.getViewportYCenter(); + const float viewportZ = gstate.getViewportZCenter(); + const float viewportScaleX = gstate.getViewportXScale(); + const float viewportScaleY = gstate.getViewportYScale(); + const float viewportScaleZ = gstate.getViewportZScale(); - // Simple, most common case. - int vertexStride = dec->VertexSize(); - int offset = dec->posoff; + bool cullCCW = false; // OK, we now have the coordinates. Let's transform, we can actually do this in-place. - if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) { - float world[16]; - float view[16]; - float worldview[16]; - float worldviewproj[16]; - ConvertMatrix4x3To4x4(world, gstate.worldMatrix); - ConvertMatrix4x3To4x4(view, gstate.viewMatrix); - Matrix4ByMatrix4(worldview, world, view); - Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? - - cullEnabled = gstate.isCullEnabled(); - - float viewportX = gstate.getViewportXCenter(); - float viewportY = gstate.getViewportYCenter(); - float viewportZ = gstate.getViewportZCenter(); - float viewportScaleX = gstate.getViewportXScale(); - float viewportScaleY = gstate.getViewportYScale(); - float viewportScaleZ = gstate.getViewportZScale(); - - bool allBehind = true; - - float temp[3]; - for (int i = 0; i < count; i++) { - switch (vertTypeID & GE_VTYPE_POS_MASK) { - case GE_VTYPE_POS_8BIT: - for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; - for (int j = 0; j < 3; j++) { - temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere? - } - Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj); - } - break; - case GE_VTYPE_POS_16BIT: - for (int i = 0; i < count; i++) { - const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); - for (int j = 0; j < 3; j++) { - temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere? - } - Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj); - } - break; - case GE_VTYPE_POS_FLOAT: - for (int i = 0; i < count; i++) { - const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); - Vec3ByMatrix44(transformed + i * 4, data, worldviewproj); - } - break; - } - } - for (int i = 0; i < count; i++) { - float proj[4]; - memcpy(proj, transformed + i * 4, 4 * sizeof(float)); + int outCount = 0; - float w = proj[3]; + for (int i = 0; i < count; i += 3) { + const float *verts[3] = { + transformed + indexBuffer[i] * 4, + transformed + indexBuffer[i + 1] * 4, + transformed + indexBuffer[i + 2] * 4, + }; - bool inFront = w > 0.0f; - screenVerts[i].behind = !inFront; - if (inFront) { - allBehind = false; - } + // Check if any vertex is behind the 0 plane. + if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) { + // Ditch this triangle. Later we should clip here. + continue; + } + + for (int c = 0; c < 3; c++) { + const float *src = verts[c]; + float invW = 1.0f / src[3]; - // Clip to the w=0 plane. - proj[0] /= w; - proj[1] /= w; - proj[2] /= w; + float x = src[0] * invW; + float y = src[1] * invW; + float z = src[2] * invW; - // Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport - // and offset params. float screen[3]; - screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16(); - screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16(); - screen[2] = (proj[2] * viewportScaleZ + viewportZ); + screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16(); + screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16(); + screen[2] = (z * viewportScaleZ + viewportZ); if (screen[2] < 0.0f) { screen[2] = 0.0f; } if (screen[2] >= 65535.0f) { screen[2] = 65535.0f; } - screenVerts[i].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. - screenVerts[i].y = screen[1] * (1.0f / 16.0f); - screenVerts[i].z = screen[2]; + screenVerts[outCount].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. + screenVerts[outCount].y = screen[1] * (1.0f / 16.0f); + screenVerts[outCount].z = screen[2]; + + outCount++; } - if (allBehind) { - // Cull the whole draw. + } + return outCount; +} + +// Rasterizes screen-space vertices. +void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) { + // Prim should now be either TRIANGLES or RECTs. + _dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES); + + GEComparison compareMode = gstate.getDepthTestFunction(); + if (gstate.isModeClear()) { + if (!gstate.isClearModeDepthMask()) { return; } + compareMode = GE_COMP_ALWAYS; } else { - float factor = 1.0f; - switch (vertTypeID & GE_VTYPE_POS_MASK) { - case GE_VTYPE_POS_8BIT: - for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; - for (int j = 0; j < 3; j++) { - transformed[i * 4 + j] = data[j] * factor; - } - transformed[i * 4 + 3] = 1.0f; - } - break; - case GE_VTYPE_POS_16BIT: - for (int i = 0; i < count; i++) { - const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); - for (int j = 0; j < 3; j++) { - transformed[i * 4 + j] = data[j] * factor; - } - transformed[i * 4 + 3] = 1.0f; - } - break; - case GE_VTYPE_POS_FLOAT: - for (int i = 0; i < count; i++) { - memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3); - transformed[i * 4 + 3] = 1.0f; - } - break; - } - - for (int i = 0; i < count; i++) { - screenVerts[i].x = (int)transformed[i * 4 + 0]; - screenVerts[i].y = (int)transformed[i * 4 + 1]; - screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f); - } + if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()) + return; } - // Then we need to stitch primitives from strips, etc etc... - // For now we'll just do it tri by tri. Later let's be more efficient. - switch (prim) { case GE_PRIM_RECTANGLES: for (int i = 0; i < count / 2; i++) { @@ -403,30 +340,10 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i break; case GE_PRIM_TRIANGLES: for (int i = 0; i < count / 3; i++) { - if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) { - continue; - } DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode); } break; - case GE_PRIM_TRIANGLE_STRIP: - { - int wind = 2; - for (int i = 0; i < count - 2; i++) { - int i0 = i; - int i1 = i + wind; - wind ^= 3; - int i2 = i + wind; - if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) { - continue; - } - ScreenVert v[3]; - v[0] = screenVerts[i0]; - v[1] = screenVerts[i1]; - v[2] = screenVerts[i2]; - DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode); - } - break; - } + default: + _dbg_assert_(false); } } diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h index 01fa60e257d1..d639103aaaa2 100644 --- a/GPU/Common/DepthRaster.h +++ b/GPU/Common/DepthRaster.h @@ -3,10 +3,21 @@ #include "Common/CommonTypes.h" #include "GPU/ge_constants.h" +struct DepthScreenVertex { + int x; + int y; + uint16_t z; +}; + // Specialized, very limited depth-only rasterizer. // Meant to run in parallel with hardware rendering, in games that read back the depth buffer // for effects like lens flare. // So, we can be quite inaccurate without any issues, and skip a lot of functionality. class VertexDecoder; -void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise); +struct TransformedVertex; + +int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count); +void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID); +void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count); +void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count); diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 818021a79b3a..df80511b4691 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -23,9 +23,11 @@ #include "Common/LogReporting.h" #include "Common/Math/SIMDHeaders.h" #include "Common/Math/lin/matrix4x4.h" +#include "Core/System.h" #include "Core/Config.h" #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/SplineCommon.h" +#include "GPU/Common/DepthRaster.h" #include "GPU/Common/VertexDecoderCommon.h" #include "GPU/Common/SoftwareTransformCommon.h" #include "GPU/ge_constants.h" @@ -34,7 +36,9 @@ #define QUAD_INDICES_MAX 65536 enum { - TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex) + TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex), + DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4, + DEPTH_SCREENVERTS_SIZE = VERTEX_BUFFER_MAX * sizeof(DepthScreenVertex), }; DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { @@ -46,6 +50,12 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex_); + + useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth; + if (useDepthRaster_) { + depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); + depthScreenVerts_ = (DepthScreenVertex *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); + } } DrawEngineCommon::~DrawEngineCommon() { @@ -53,6 +63,10 @@ DrawEngineCommon::~DrawEngineCommon() { FreeMemoryPages(decIndex_, DECODED_INDEX_BUFFER_SIZE); FreeMemoryPages(transformed_, TRANSFORMED_VERTEX_BUFFER_SIZE); FreeMemoryPages(transformedExpanded_, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE); + if (depthTransformed_) { + FreeMemoryPages(depthTransformed_, DEPTH_TRANSFORMED_SIZE); + FreeMemoryPages(depthScreenVerts_, DEPTH_SCREENVERTS_SIZE); + } delete decJitCache_; decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) { delete decoder; @@ -886,3 +900,61 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const { return false; } } + +void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID) { + switch (prim) { + case GE_PRIM_INVALID: + case GE_PRIM_KEEP_PREVIOUS: + case GE_PRIM_LINES: + case GE_PRIM_LINE_STRIP: + case GE_PRIM_POINTS: + return; + default: + break; + } + + if (vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) { + return; + } + + float world[16]; + float view[16]; + float worldview[16]; + float worldviewproj[16]; + ConvertMatrix4x3To4x4(world, gstate.worldMatrix); + ConvertMatrix4x3To4x4(view, gstate.viewMatrix); + Matrix4ByMatrix4(worldview, world, view); + Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? + + // Decode. + int numDec = 0; + for (int i = 0; i < numDrawVerts_; i++) { + DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, drawVerts_[i].verts, drawVerts_[i].vertexCount, dec, vertTypeID); + numDec += drawVerts_[i].vertexCount; + } + + // Clip and triangulate using the index buffer. + int outVertCount = DepthRasterClipIndexedTriangles(depthScreenVerts_, depthTransformed_, decIndex_, numDec); + + DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), + depthScreenVerts_, outVertCount); +} + +void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) { + switch (prim) { + case GE_PRIM_INVALID: + case GE_PRIM_KEEP_PREVIOUS: + case GE_PRIM_LINES: + case GE_PRIM_LINE_STRIP: + case GE_PRIM_POINTS: + return; + default: + break; + } + + DepthRasterConvertTransformed(depthScreenVerts_, inVerts, count); + DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), + depthScreenVerts_, count); +} diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 595ab929aab4..64e8478cd06e 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -27,6 +27,7 @@ #include "GPU/Common/GPUStateUtils.h" #include "GPU/Common/IndexGenerator.h" #include "GPU/Common/VertexDecoderCommon.h" +#include "GPU/Common/DepthRaster.h" class VertexDecoder; @@ -174,6 +175,9 @@ class DrawEngineCommon { void ApplyFramebufferRead(FBOTexState *fboTexState); + void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID); + void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count); + static inline int IndexSize(u32 vtype) { const u32 indexType = (vtype & GE_VTYPE_IDX_MASK); if (indexType == GE_VTYPE_IDX_16BIT) { @@ -228,6 +232,11 @@ class DrawEngineCommon { } inline bool CollectedPureDraw() const { + // TODO: Do something faster. + if (useDepthRaster_) { + return false; + } + switch (seenPrims_) { case 1 << GE_PRIM_TRIANGLE_STRIP: return !anyCCWOrIndexed_ && numDrawInds_ == 1; @@ -343,4 +352,10 @@ class DrawEngineCommon { bool offsetOutsideEdge_; GPUCommon *gpuCommon_; + + // Software depth raster + bool useDepthRaster_ = false; + + float *depthTransformed_ = nullptr; + DepthScreenVertex *depthScreenVerts_ = nullptr; }; diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index f5383cf6be59..9b5389c8750a 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -13,7 +13,6 @@ #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/TextureCacheCommon.h" #include "GPU/Common/FramebufferManagerCommon.h" -#include "GPU/Common/DepthRaster.h" struct CommonCommandTableEntry { uint8_t cmd; @@ -1040,10 +1039,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) { canExtend = false; - } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { - DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), - gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), - verts, inds, prim, count, decoder, vertTypeID, false); } onePassed = true; } else { @@ -1122,10 +1117,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) { canExtend = false; - } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { - DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), - gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), - verts, inds, newPrim, count, decoder, vertTypeID, clockwise); } // As soon as one passes, assume we don't need to check the rest of this batch. onePassed = true; diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index f1279b855a69..e1775722ee3e 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -370,6 +370,9 @@ void DrawEngineVulkan::Flush() { } else { renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount); } + if (useDepthRaster_) { + DepthRasterTransform(prim, dec_, dec_->VertexType()); + } } else { PROFILE_THIS_SCOPE("soft"); VertexDecoder *swDec = dec_; @@ -438,6 +441,12 @@ void DrawEngineVulkan::Flush() { swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale); swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result); + + // At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster. + if (useDepthRaster_) { + DepthRasterPretransformed(prim, transformed_, numDecodedVerts_); + } + // Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values. // Games sometimes expect exact matches (see #12626, for example) for equal comparisons. if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f) From bdf4b692073479b52631472e118c5aa0e500b08b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 11:05:28 +0100 Subject: [PATCH 11/28] Warning fixes, minor cleanup --- GPU/Common/DepthRaster.cpp | 18 ++++++++++-------- GPU/Common/DepthRaster.h | 2 +- GPU/Common/DrawEngineCommon.cpp | 4 +++- GPU/Common/IndexGenerator.h | 2 +- unittest/UnitTest.cpp | 2 +- 5 files changed, 16 insertions(+), 12 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index e99160f89bc3..4e5c99a12790 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -247,14 +247,6 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f } } -void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) { - for (int i = 0; i < count; i++) { - screenVerts[i].x = (int)transformed[i].pos[0]; - screenVerts[i].y = (int)transformed[i].pos[1]; - screenVerts[i].z = (u16)transformed[i].pos[2]; - } -} - int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) { bool cullEnabled = gstate.isCullEnabled(); @@ -312,6 +304,16 @@ int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float return outCount; } +void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count) { + _dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES); + + for (int i = 0; i < count; i++) { + screenVerts[i].x = (int)transformed[i].pos[0]; + screenVerts[i].y = (int)transformed[i].pos[1]; + screenVerts[i].z = (u16)transformed[i].pos[2]; + } +} + // Rasterizes screen-space vertices. void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) { // Prim should now be either TRIANGLES or RECTs. diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h index d639103aaaa2..50ef309d936c 100644 --- a/GPU/Common/DepthRaster.h +++ b/GPU/Common/DepthRaster.h @@ -19,5 +19,5 @@ struct TransformedVertex; int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count); void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID); -void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count); +void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count); void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count); diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index df80511b4691..60024326fbe5 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -953,7 +953,9 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra break; } - DepthRasterConvertTransformed(depthScreenVerts_, inVerts, count); + _dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN); + + DepthRasterConvertTransformed(depthScreenVerts_, prim, inVerts, count); DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), depthScreenVerts_, count); diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h index 723f4caabd7c..48df11e97291 100644 --- a/GPU/Common/IndexGenerator.h +++ b/GPU/Common/IndexGenerator.h @@ -54,7 +54,7 @@ class IndexGenerator { void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise); // This is really the number of generated indices, or 3x the number of triangles. - int VertexCount() const { return inds_ - indsBase_; } + int VertexCount() const { return (int)(inds_ - indsBase_); } private: // Points (why index these? code simplicity) diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index 45c664a29af8..fe3fba0af4b9 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -1112,7 +1112,7 @@ bool TestSIMD() { EXPECT_EQ_INT(testdata[1], 0); __m128i a = _mm_set_epi16(0, 0x4444, 0, 0x3333, 0, 0x2222, 0, 0x1111); - __m128i b = _mm_set_epi16(0, 0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555); + __m128i b = _mm_set_epi16(0, (int16_t)0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555); __m128i c = _mm_packu2_epi32_SSE2(a, b); __m128i d = _mm_packus_epi32(a, b); From de45960420c3455d68818dec4948ef220855a5fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 12:38:03 +0100 Subject: [PATCH 12/28] Reformat CrossSIMD.h for easier editing. Add some new methods. --- Common/Math/CrossSIMD.h | 102 +++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 11ed21702486..056cc5f0c7b5 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -11,53 +11,40 @@ struct Vec4S32 { __m128i v; - Vec4S32 operator +(Vec4S32 other) const { - return Vec4S32{ _mm_add_epi32(v, other.v) }; - } - Vec4S32 operator -(Vec4S32 other) const { - return Vec4S32{ _mm_sub_epi32(v, other.v) }; - } + static Vec4S32 Load(int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; } + static Vec4S32 LoadAligned(int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; } + void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); } + void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);} + + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; } + Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; } // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. - Vec4S32 operator *(Vec4S32 other) const { - return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; // (ab3,ab2,ab1,ab0) - } + Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0) }; struct Vec4F32 { __m128 v; - static Vec4F32 FromVec4S32(Vec4S32 other) { - return Vec4F32{ _mm_cvtepi32_ps(other.v) }; - } + static Vec4F32 Load(float *src) { return Vec4F32{ _mm_loadu_ps(src) }; } + static Vec4F32 LoadAligned(float *src) { return Vec4F32{ _mm_load_ps(src) }; } + void Store(float *dst) { _mm_storeu_ps(dst, v); } + void StoreAligned (float *dst) { _mm_store_ps(dst, v); } - Vec4F32 operator +(Vec4F32 other) const { - return Vec4F32{ _mm_add_ps(v, other.v) }; - } - Vec4F32 operator -(Vec4F32 other) const { - return Vec4F32{ _mm_sub_ps(v, other.v) }; - } - Vec4F32 operator *(Vec4F32 other) const { - return Vec4F32{ _mm_mul_ps(v, other.v) }; - } + static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; } + + Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; } + Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; } + Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; } }; struct Vec4U16 { __m128i v; // we only use the lower 64 bits. - static Vec4U16 Load(void *mem) { - return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; - } - void Store(void *mem) { - _mm_storel_epi64((__m128i *)mem, v); - } - static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { - return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; - } - static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { - return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; - } - Vec4U16 CompareLT(Vec4U16 other) { - return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; - } + static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; } + void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); } + + static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; } + static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; } + Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; } }; #elif PPSSPP_ARCH(ARM_NEON) @@ -65,33 +52,40 @@ struct Vec4U16 { struct Vec4S32 { int32x4_t v; - Vec4S32 operator +(Vec4S32 other) const { - return Vec4S32{ vaddq_s32(v, other.v) }; - } - Vec4S32 operator -(Vec4S32 other) const { - return Vec4S32{ vsubq_s32(v, other.v) }; - } - Vec4S32 operator *(Vec4S32 other) const { - return Vec4S32{ vmulq_s32(v, other.v) }; - } + static Vec4F32 Load(int *src) { return Vec4F32{ vld1q_s32(src) }; } + static Vec4F32 LoadAligned(int *src) { return Vec4F32{ vld1q_s32(src) }; } + void Store(int *dst) { vst1q_s32(dst, v); } + void StoreAligned(int *dst) { vst1q_s32(dst, v); } + + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; } + Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; } + Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } }; struct Vec4F32 { float32x4_t v; + static Vec4F32 Load(float *src) { return Vec4F32{ vld1q_f32(src) }; } + static Vec4F32 LoadAligned(float *src) { return Vec4F32{ vld1q_f32(src) }; } + void Store(float *dst) { vst1q_f32(dst, v); } + void StoreAligned(float *dst) { vst1q_f32(dst, v); } + static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ vcvtq_f32_s32(other.v) }; } - Vec4F32 operator +(Vec4F32 other) const { - return Vec4F32{ vaddq_f32(v, other.v) }; - } - Vec4F32 operator -(Vec4F32 other) const { - return Vec4F32{ vsubq_f32(v, other.v) }; - } - Vec4F32 operator *(Vec4F32 other) const { - return Vec4F32{ vmulq_f32(v, other.v) }; - } + Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; } + Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; } + Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; } +}; + +struct Vec4U16 { + uint16x4_t v; // we only use the lower 64 bits. + static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; } + void Store(uint16_t *mem) { vst1_u16(mem, v); } + static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmax_u16(a.v, b.v) }; } + static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmin_u16(a.v, b.v) }; } + Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } }; #else From 03b9f9805568e1b400ac8b6f00ce1f0a31517b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 13:19:20 +0100 Subject: [PATCH 13/28] Add more funcionality to CrossSIMD.h, like fast matrix mul and some conversion --- Common/Math/CrossSIMD.h | 137 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 10 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 056cc5f0c7b5..b240678d17a4 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -8,11 +8,32 @@ #if PPSSPP_ARCH(SSE2) +struct Mat4F32 { + Mat4F32(const float *matrix) { + col0 = _mm_loadu_ps(matrix); + col1 = _mm_loadu_ps(matrix + 4); + col2 = _mm_loadu_ps(matrix + 8); + col3 = _mm_loadu_ps(matrix + 12); + } + + void Transpose() { + _MM_TRANSPOSE4_PS(col0, col1, col2, col3); + } + + __m128 col0; + __m128 col1; + __m128 col2; + __m128 col3; +}; + struct Vec4S32 { __m128i v; - static Vec4S32 Load(int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; } - static Vec4S32 LoadAligned(int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; } + static Vec4S32 Zero() { return Vec4S32{ _mm_setzero_si128() }; } + static Vec4S32 Splat(int lane) { return Vec4S32{ _mm_set1_epi32(lane) }; } + + static Vec4S32 Load(const int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; } + static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; } void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); } void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);} @@ -25,21 +46,56 @@ struct Vec4S32 { struct Vec4F32 { __m128 v; - static Vec4F32 Load(float *src) { return Vec4F32{ _mm_loadu_ps(src) }; } - static Vec4F32 LoadAligned(float *src) { return Vec4F32{ _mm_load_ps(src) }; } + static Vec4F32 Zero() { return Vec4F32{ _mm_setzero_ps() }; } + static Vec4F32 Splat(float lane) { return Vec4F32{ _mm_set1_ps(lane) }; } + + static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; } + static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; } void Store(float *dst) { _mm_storeu_ps(dst, v); } void StoreAligned (float *dst) { _mm_store_ps(dst, v); } + static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes + __m128i value = _mm_loadl_epi64((const __m128i *)src); + // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend + return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value, value), 16)) }; + } + + static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes + __m128i value = _mm_loadl_epi64((const __m128i *)src); + __m128i value16 = _mm_unpacklo_epi8(value, value); + // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend + return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 16)) }; + } + static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; } Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; } Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; } Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; } + + Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; } + + inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { + return Vec4F32{ _mm_add_ps( + _mm_add_ps( + _mm_mul_ps(m.col0, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))), + _mm_mul_ps(m.col1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1))) + ), + _mm_add_ps( + _mm_mul_ps(m.col2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))), + m.col3) + ) + }; + } }; struct Vec4U16 { __m128i v; // we only use the lower 64 bits. - static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; } + + static Vec4U16 Zero() { return Vec4U16{ _mm_setzero_si128() }; } + // static Vec4U16 AllOnes() { return Vec4U16{ _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()) }; } + + static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; } void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); } static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; } @@ -49,11 +105,37 @@ struct Vec4U16 { #elif PPSSPP_ARCH(ARM_NEON) +struct Mat4F32 { + Mat4F32(const float *matrix) { + col0 = vld1q_f32(matrix); + col1 = vld1q_f32(matrix + 4); + col2 = vld1q_f32(matrix + 8); + col3 = vld1q_f32(matrix + 12); + } + void Transpose() { + float32x4_t temp0 = vzip1q_s32(col0, col2); + float32x4_t temp1 = vzip2q_s32(col0, col2); + float32x4_t temp2 = vzip1q_s32(col1, col3); + float32x4_t temp3 = vzip2q_s32(col1, col3); + col0 = vzip1q_s32(temp0, temp2); + col1 = vzip2q_s32(temp0, temp2); + col2 = vzip1q_s32(temp1, temp3); + col3 = vzip2q_s32(temp1, temp3); + } + float32x4_t col0; + float32x4_t col1; + float32x4_t col2; + float32x4_t col3; +}; + struct Vec4S32 { int32x4_t v; - static Vec4F32 Load(int *src) { return Vec4F32{ vld1q_s32(src) }; } - static Vec4F32 LoadAligned(int *src) { return Vec4F32{ vld1q_s32(src) }; } + static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; } + static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; } + + static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; } + static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; } void Store(int *dst) { vst1q_s32(dst, v); } void StoreAligned(int *dst) { vst1q_s32(dst, v); } @@ -65,11 +147,27 @@ struct Vec4S32 { struct Vec4F32 { float32x4_t v; - static Vec4F32 Load(float *src) { return Vec4F32{ vld1q_f32(src) }; } - static Vec4F32 LoadAligned(float *src) { return Vec4F32{ vld1q_f32(src) }; } + static Vec4F32 Zero() { return Vec4F32{ vdupq_n_f32(0.0f) }; } + static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; } + + static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; } + static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; } void Store(float *dst) { vst1q_f32(dst, v); } void StoreAligned(float *dst) { vst1q_f32(dst, v); } + static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes + int16x4_t value = vld1_s16(src); + // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend + return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) }; + } + + static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes + int8x8_t value = vld1_s8(src); + int16x4_t value16 = vget_low_s16(vmovl_s8(value)); + // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend + return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) }; + } + static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ vcvtq_f32_s32(other.v) }; } @@ -77,12 +175,31 @@ struct Vec4F32 { Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; } Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; } Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; } + + Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } + + inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { +#if PPSSPP_ARCH(ARM64_NEON) + float32x4_t sum = vaddq_f32( + vaddq_f32(vmulq_laneq_f32(m.col0, v, 0), vmulq_laneq_f32(m.col1, v, 1)), + vaddq_f32(vmulq_laneq_f32(m.col2, v, 2), m.col3)); +#else + float32x4_t sum = vaddq_f32( + vaddq_f32(vmulq_lane_f32(m.col0, vget_low_f32(v), 0), vmulq_lane_f32(m.col1, vget_low_f32(v), 1)), + vaddq_f32(vmulq_lane_f32(m.col2, vget_high_f32(v), 0), m.col3)); +#endif + return Vec4F32{ sum }; + } }; struct Vec4U16 { uint16x4_t v; // we only use the lower 64 bits. - static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; } + + static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; } + + static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; } void Store(uint16_t *mem) { vst1_u16(mem, v); } + static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmax_u16(a.v, b.v) }; } static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmin_u16(a.v, b.v) }; } Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } From 6a1010afb0e3b031c7c0c14eace5c7765c6b2900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 13:24:41 +0100 Subject: [PATCH 14/28] Use CrossSIMD to optimize DecodeAndTransformForDepthRaster Checked the output, the generated assembly is great! --- GPU/Common/DepthRaster.cpp | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 4e5c99a12790..23e506ff7be9 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -218,30 +218,25 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f int vertexStride = dec->VertexSize(); int offset = dec->posoff; - float temp[3]; + Mat4F32 mat(worldviewproj); + switch (vertTypeID & GE_VTYPE_POS_MASK) { - case GE_VTYPE_POS_8BIT: + case GE_VTYPE_POS_FLOAT: for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; - for (int j = 0; j < 3; j++) { - temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere? - } - Vec3ByMatrix44(dest + i * 4, temp, worldviewproj); + const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); + Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4); } break; case GE_VTYPE_POS_16BIT: for (int i = 0; i < count; i++) { const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); - for (int j = 0; j < 3; j++) { - temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere? - } - Vec3ByMatrix44(dest + i * 4, temp, worldviewproj); + Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4); } break; - case GE_VTYPE_POS_FLOAT: + case GE_VTYPE_POS_8BIT: for (int i = 0; i < count; i++) { - const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); - Vec3ByMatrix44(dest + i * 4, data, worldviewproj); + const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4); } break; } From 0b009c10bec5fb9703bf986a3288efc67c23f178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 15:42:52 +0100 Subject: [PATCH 15/28] CrossSIMD: Add reciprocal, clamp, swaplowerelements, etc --- Common/Math/CrossSIMD.h | 82 +++++++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index b240678d17a4..65fd02b3d3df 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -16,10 +16,6 @@ struct Mat4F32 { col3 = _mm_loadu_ps(matrix + 12); } - void Transpose() { - _MM_TRANSPOSE4_PS(col0, col1, col2, col3); - } - __m128 col0; __m128 col1; __m128 col2; @@ -37,6 +33,13 @@ struct Vec4S32 { void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); } void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);} + // Swaps the two lower elements. Useful for reversing triangles.. + Vec4S32 SwapLowerElements() { + return Vec4S32{ + _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1)) + }; + } + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; } // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. @@ -64,7 +67,7 @@ struct Vec4F32 { __m128i value = _mm_loadl_epi64((const __m128i *)src); __m128i value16 = _mm_unpacklo_epi8(value, value); // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend - return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 16)) }; + return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) }; } static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; } @@ -72,8 +75,20 @@ struct Vec4F32 { Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; } Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; } Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; } + void operator +=(Vec4F32 other) { v = _mm_add_ps(v, other.v); } + void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); } + void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); } + void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); } + Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; } Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; } + Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; } + + Vec4F32 Clamp(float lower, float higher) { + return Vec4F32{ + _mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher)) + }; + } inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { return Vec4F32{ _mm_add_ps( @@ -87,8 +102,14 @@ struct Vec4F32 { ) }; } + + static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { + _MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v); + } }; +inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; } + struct Vec4U16 { __m128i v; // we only use the lower 64 bits. @@ -112,16 +133,6 @@ struct Mat4F32 { col2 = vld1q_f32(matrix + 8); col3 = vld1q_f32(matrix + 12); } - void Transpose() { - float32x4_t temp0 = vzip1q_s32(col0, col2); - float32x4_t temp1 = vzip2q_s32(col0, col2); - float32x4_t temp2 = vzip1q_s32(col1, col3); - float32x4_t temp3 = vzip2q_s32(col1, col3); - col0 = vzip1q_s32(temp0, temp2); - col1 = vzip2q_s32(temp0, temp2); - col2 = vzip1q_s32(temp1, temp3); - col3 = vzip2q_s32(temp1, temp3); - } float32x4_t col0; float32x4_t col1; float32x4_t col2; @@ -139,6 +150,14 @@ struct Vec4S32 { void Store(int *dst) { vst1q_s32(dst, v); } void StoreAligned(int *dst) { vst1q_s32(dst, v); } + // Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles.. + // This is quite awkward on ARM64 :/ Maybe there's a better solution? + Vec4S32 SwapLowerElements() { + float32x2_t upper = vget_high_s32(v); + float32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v)); + return Vec4S32{ vcombine_s32(lowerSwapped, upper) }; + }; + Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; } Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } @@ -175,8 +194,39 @@ struct Vec4F32 { Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; } Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; } Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; } + void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); } + void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); } + void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); } + void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); } + Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; } + Vec4F32 Recip() { + float32x4_t recip = vrecpeq_f32(v); + // Use a couple Newton-Raphson steps to refine the estimate. + // May be able to get away with only one refinement, not sure! + recip = vmulq_f32(vrecpsq_f32(v, recip), recip); + recip = vmulq_f32(vrecpsq_f32(v, recip), recip); + return Vec4F32{ recip }; + } + + Vec4F32 Clamp(float lower, float higher) { + return Vec4F32{ + vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher)) + }; + } + + // One of many possible solutions. Sometimes we could also use vld4q_f32 probably.. + static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { + float32x4_t temp0 = vzip1q_s32(col0.v, col2.v); + float32x4_t temp1 = vzip2q_s32(col0.v, col2.v); + float32x4_t temp2 = vzip1q_s32(col1.v, col3.v); + float32x4_t temp3 = vzip2q_s32(col1.v, col3.v); + col0.v = vzip1q_s32(temp0, temp2); + col1.v = vzip2q_s32(temp0, temp2); + col2.v = vzip1q_s32(temp1, temp3); + col3.v = vzip2q_s32(temp1, temp3); + } inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { #if PPSSPP_ARCH(ARM64_NEON) @@ -192,6 +242,8 @@ struct Vec4F32 { } }; +inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; } + struct Vec4U16 { uint16x4_t v; // we only use the lower 64 bits. From 67078d439b3d66e4c6b4b9ed7986791d8068c1e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 15:28:56 +0100 Subject: [PATCH 16/28] Depth raster: Switch to a SoA data layout for the screen space verts --- GPU/Common/DepthRaster.cpp | 51 ++++++++++++++++----------------- GPU/Common/DepthRaster.h | 8 +++--- GPU/Common/DrawEngineCommon.cpp | 22 ++++++++++---- GPU/Common/DrawEngineCommon.h | 2 +- 4 files changed, 46 insertions(+), 37 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 23e506ff7be9..dbf3a2f06824 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -81,7 +81,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, // Adapted from Intel's depth rasterizer example. // Started with the scalar version, will SIMD-ify later. // x1/y1 etc are the scissor rect. -void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) { +void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, GEComparison compareMode) { int tileStartX = x1; int tileEndX = x2; @@ -94,15 +94,15 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, // Convert to whole pixels for now. Later subpixel precision. DepthScreenVertex verts[3]; - verts[0].x = vertsSub[0].x; - verts[0].y = vertsSub[0].y; - verts[0].z = vertsSub[0].z; - verts[1].x = vertsSub[2].x; - verts[1].y = vertsSub[2].y; - verts[1].z = vertsSub[2].z; - verts[2].x = vertsSub[1].x; - verts[2].y = vertsSub[1].y; - verts[2].z = vertsSub[1].z; + verts[0].x = tx[0]; + verts[0].y = ty[0]; + verts[0].z = tz[0]; + verts[1].x = tx[2]; + verts[1].y = ty[2]; + verts[1].z = tz[2]; + verts[2].x = tx[1]; + verts[2].y = ty[1]; + verts[2].z = tz[1]; // use fixed-point only for X and Y. Avoid work for Z and W. int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX); @@ -242,7 +242,7 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f } } -int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) { +int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) { bool cullEnabled = gstate.isCullEnabled(); const float viewportX = gstate.getViewportXCenter(); @@ -289,28 +289,28 @@ int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float if (screen[2] >= 65535.0f) { screen[2] = 65535.0f; } - screenVerts[outCount].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. - screenVerts[outCount].y = screen[1] * (1.0f / 16.0f); - screenVerts[outCount].z = screen[2]; - + tx[outCount] = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. + ty[outCount] = screen[1] * (1.0f / 16.0f); + tz[outCount] = screen[2]; outCount++; } } return outCount; } -void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count) { +void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count) { _dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES); + // TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways. for (int i = 0; i < count; i++) { - screenVerts[i].x = (int)transformed[i].pos[0]; - screenVerts[i].y = (int)transformed[i].pos[1]; - screenVerts[i].z = (u16)transformed[i].pos[2]; + tx[i] = (int)transformed[i].pos[0]; + ty[i] = (int)transformed[i].pos[1]; + tz[i] = (u16)transformed[i].pos[2]; } } // Rasterizes screen-space vertices. -void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) { +void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count) { // Prim should now be either TRIANGLES or RECTs. _dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES); @@ -327,17 +327,16 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr switch (prim) { case GE_PRIM_RECTANGLES: - for (int i = 0; i < count / 2; i++) { - uint16_t z = screenVerts[i + 1].z; // depth from second vertex + for (int i = 0; i < count; i += 2) { + uint16_t z = tz[i + 1]; // depth from second vertex // TODO: Should clip coordinates to the scissor rectangle. // We remove the subpixel information here. - DepthRasterRect(depth, depthStride, screenVerts[i].x, screenVerts[i].y, screenVerts[i + 1].x, screenVerts[i + 1].y, - z, compareMode); + DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, compareMode); } break; case GE_PRIM_TRIANGLES: - for (int i = 0; i < count / 3; i++) { - DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode); + for (int i = 0; i < count; i += 3) { + DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], compareMode); } break; default: diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h index 50ef309d936c..50e7a2577ddb 100644 --- a/GPU/Common/DepthRaster.h +++ b/GPU/Common/DepthRaster.h @@ -6,7 +6,7 @@ struct DepthScreenVertex { int x; int y; - uint16_t z; + int z; }; // Specialized, very limited depth-only rasterizer. @@ -17,7 +17,7 @@ struct DepthScreenVertex { class VertexDecoder; struct TransformedVertex; -int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count); +int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count); void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID); -void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count); -void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count); +void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count); +void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count); diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 60024326fbe5..8f857f20acb9 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -38,7 +38,9 @@ enum { TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex), DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4, - DEPTH_SCREENVERTS_SIZE = VERTEX_BUFFER_MAX * sizeof(DepthScreenVertex), + DEPTH_SCREENVERTS_COMPONENT_COUNT = VERTEX_BUFFER_MAX, + DEPTH_SCREENVERTS_COMPONENT_SIZE = DEPTH_SCREENVERTS_COMPONENT_COUNT * sizeof(int) + 384, + DEPTH_SCREENVERTS_SIZE = DEPTH_SCREENVERTS_COMPONENT_SIZE * 3, }; DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { @@ -54,7 +56,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth; if (useDepthRaster_) { depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); - depthScreenVerts_ = (DepthScreenVertex *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); + depthScreenVerts_ = (int *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); } } @@ -933,12 +935,16 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder numDec += drawVerts_[i].vertexCount; } + int *tx = depthScreenVerts_; + int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT; + int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2; + // Clip and triangulate using the index buffer. - int outVertCount = DepthRasterClipIndexedTriangles(depthScreenVerts_, depthTransformed_, decIndex_, numDec); + int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, numDec); DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), - depthScreenVerts_, outVertCount); + tx, ty, tz, outVertCount); } void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) { @@ -955,8 +961,12 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra _dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN); - DepthRasterConvertTransformed(depthScreenVerts_, prim, inVerts, count); + int *tx = depthScreenVerts_; + int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT; + int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2; + + DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count); DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), - depthScreenVerts_, count); + tx, ty, tz, count); } diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 64e8478cd06e..2df31f2fba96 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -357,5 +357,5 @@ class DrawEngineCommon { bool useDepthRaster_ = false; float *depthTransformed_ = nullptr; - DepthScreenVertex *depthScreenVerts_ = nullptr; + int *depthScreenVerts_ = nullptr; }; From 820e7369b9b07f975fbfabca2ebc175b45d3bbb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 16:26:51 +0100 Subject: [PATCH 17/28] Speed up DepthRasterClipIndexedTriangles with CrossSIMD --- GPU/Common/DepthRaster.cpp | 66 +++++++++++++++++---------------- GPU/Common/DrawEngineCommon.cpp | 16 ++++++++ 2 files changed, 50 insertions(+), 32 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index dbf3a2f06824..b0bbb44d2761 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -245,19 +245,21 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) { bool cullEnabled = gstate.isCullEnabled(); - const float viewportX = gstate.getViewportXCenter(); - const float viewportY = gstate.getViewportYCenter(); - const float viewportZ = gstate.getViewportZCenter(); - const float viewportScaleX = gstate.getViewportXScale(); - const float viewportScaleY = gstate.getViewportYScale(); - const float viewportScaleZ = gstate.getViewportZScale(); + // TODO: On ARM we can do better by keeping these in lanes instead of splatting. + // However, hard to find a common abstraction. + const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter()); + const Vec4F32 viewportY = Vec4F32::Splat(gstate.getViewportYCenter()); + const Vec4F32 viewportZ = Vec4F32::Splat(gstate.getViewportZCenter()); + const Vec4F32 viewportScaleX = Vec4F32::Splat(gstate.getViewportXScale()); + const Vec4F32 viewportScaleY = Vec4F32::Splat(gstate.getViewportYScale()); + const Vec4F32 viewportScaleZ = Vec4F32::Splat(gstate.getViewportZScale()); + + const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX()); // We remove the 16 scale here + const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY()); bool cullCCW = false; - // OK, we now have the coordinates. Let's transform, we can actually do this in-place. - int outCount = 0; - for (int i = 0; i < count; i += 3) { const float *verts[3] = { transformed + indexBuffer[i] * 4, @@ -271,29 +273,29 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran continue; } - for (int c = 0; c < 3; c++) { - const float *src = verts[c]; - float invW = 1.0f / src[3]; - - float x = src[0] * invW; - float y = src[1] * invW; - float z = src[2] * invW; - - float screen[3]; - screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16(); - screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16(); - screen[2] = (z * viewportScaleZ + viewportZ); - if (screen[2] < 0.0f) { - screen[2] = 0.0f; - } - if (screen[2] >= 65535.0f) { - screen[2] = 65535.0f; - } - tx[outCount] = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. - ty[outCount] = screen[1] * (1.0f / 16.0f); - tz[outCount] = screen[2]; - outCount++; - } + // These names are wrong .. until we transpose. + Vec4F32 x = Vec4F32::Load(verts[0]); + Vec4F32 y = Vec4F32::Load(verts[1]); + Vec4F32 z = Vec4F32::Load(verts[2]); + Vec4F32 w = Vec4F32::Zero(); + Vec4F32::Transpose(x, y, z, w); + // Now the names are accurate! Since we only have three vertices, the fourth member of each vector is zero + // and will not be stored (well it will be stored, but it'll be overwritten by the next vertex). + Vec4F32 recipW = w.Recip(); + + x *= recipW; + y *= recipW; + z *= recipW; + + Vec4F32 screen[3]; + screen[0] = (x * viewportScaleX + viewportX) - offsetX; + screen[1] = (y * viewportScaleY + viewportY) - offsetY; + screen[2] = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f); + + VecS32FromF32(screen[0]).Store(tx + outCount); + VecS32FromF32(screen[1]).Store(ty + outCount); + VecS32FromF32(screen[2]).Store(tz + outCount); + outCount += 3; } return outCount; } diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 8f857f20acb9..70da71a8fa09 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -941,6 +941,14 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder // Clip and triangulate using the index buffer. int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, numDec); + if (outVertCount & 15) { + // Zero padding + for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) { + tx[i] = 0; + ty[i] = 0; + tz[i] = 0; + } + } DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), @@ -966,6 +974,14 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2; DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count); + if (count & 15) { + // Zero padding + for (int i = count; i < ((count + 16) & ~15); i++) { + tx[i] = 0; + ty[i] = 0; + tz[i] = 0; + } + } DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), tx, ty, tz, count); From 65692d036e89b5f85f1c9fd5444849b9b029360e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 01:18:55 +0100 Subject: [PATCH 18/28] CrossSIMD: possible buildfix? --- Common/Math/CrossSIMD.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 65fd02b3d3df..945ea166b15e 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -108,7 +108,9 @@ struct Vec4F32 { } }; -inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; } +inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; } +inline Vec4S32 VecS32FromF32Round(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; } +inline Vec4F32 VecF32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; } struct Vec4U16 { __m128i v; // we only use the lower 64 bits. @@ -218,14 +220,14 @@ struct Vec4F32 { // One of many possible solutions. Sometimes we could also use vld4q_f32 probably.. static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { - float32x4_t temp0 = vzip1q_s32(col0.v, col2.v); - float32x4_t temp1 = vzip2q_s32(col0.v, col2.v); - float32x4_t temp2 = vzip1q_s32(col1.v, col3.v); - float32x4_t temp3 = vzip2q_s32(col1.v, col3.v); - col0.v = vzip1q_s32(temp0, temp2); - col1.v = vzip2q_s32(temp0, temp2); - col2.v = vzip1q_s32(temp1, temp3); - col3.v = vzip2q_s32(temp1, temp3); + float32x4_t temp0 = vzip1q_f32(col0.v, col2.v); + float32x4_t temp1 = vzip2q_f32(col0.v, col2.v); + float32x4_t temp2 = vzip1q_f32(col1.v, col3.v); + float32x4_t temp3 = vzip2q_f32(col1.v, col3.v); + col0.v = vzip1q_f32(temp0, temp2); + col1.v = vzip2q_f32(temp0, temp2); + col2.v = vzip1q_f32(temp1, temp3); + col3.v = vzip2q_f32(temp1, temp3); } inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { From a344d0225f2c2cc7337c0d841e4e48bac2b95969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 20:00:06 +0100 Subject: [PATCH 19/28] DepthRaster: Fix bug where we used the wrong vertex count. --- GPU/Common/DepthRaster.cpp | 13 +++++++++---- GPU/Common/DepthRaster.h | 2 +- GPU/Common/DrawEngineCommon.cpp | 20 ++++++++++++++++---- GPU/Common/DrawEngineCommon.h | 2 +- GPU/Vulkan/DrawEngineVulkan.cpp | 2 +- 5 files changed, 28 insertions(+), 11 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index b0bbb44d2761..0810f1778cde 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -161,6 +161,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, // END triangle setup. + // Here we should draw four triangles in a sequence. + // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startY; r < endY; r++, row++, @@ -211,7 +213,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, } // for each row } -void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) { +void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) { // TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder. _dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0); @@ -220,22 +222,25 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f Mat4F32 mat(worldviewproj); + const u8 *startPtr = (const u8 *)vertexData + indexLowerBound * vertexStride; + int count = indexUpperBound - indexLowerBound + 1; + switch (vertTypeID & GE_VTYPE_POS_MASK) { case GE_VTYPE_POS_FLOAT: for (int i = 0; i < count; i++) { - const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); + const float *data = (const float *)(startPtr + i * vertexStride + offset); Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4); } break; case GE_VTYPE_POS_16BIT: for (int i = 0; i < count; i++) { - const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); + const s16 *data = ((const s16 *)((const s8 *)startPtr + i * vertexStride + offset)); Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4); } break; case GE_VTYPE_POS_8BIT: for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + const s8 *data = (const s8 *)startPtr + i * vertexStride + offset; Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4); } break; diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h index 50e7a2577ddb..e92c1a1348ed 100644 --- a/GPU/Common/DepthRaster.h +++ b/GPU/Common/DepthRaster.h @@ -18,6 +18,6 @@ class VertexDecoder; struct TransformedVertex; int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count); -void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID); +void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID); void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count); void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count); diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 70da71a8fa09..bbe36fb479e7 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -903,7 +903,7 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const { } } -void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID) { +void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) { switch (prim) { case GE_PRIM_INVALID: case GE_PRIM_KEEP_PREVIOUS: @@ -931,8 +931,20 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder // Decode. int numDec = 0; for (int i = 0; i < numDrawVerts_; i++) { - DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, drawVerts_[i].verts, drawVerts_[i].vertexCount, dec, vertTypeID); - numDec += drawVerts_[i].vertexCount; + DeferredVerts &dv = drawVerts_[i]; + + int indexLowerBound = dv.indexLowerBound; + drawVertexOffsets_[i] = numDec - indexLowerBound; + + int indexUpperBound = dv.indexUpperBound; + if (indexUpperBound + 1 - indexLowerBound + numDec >= VERTEX_BUFFER_MAX) { + // Hit our limit! Stop decoding in this draw. + break; + } + + // Decode the verts (and at the same time apply morphing/skinning). Simple. + DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID); + numDec += indexUpperBound - indexLowerBound + 1; } int *tx = depthScreenVerts_; @@ -940,7 +952,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2; // Clip and triangulate using the index buffer. - int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, numDec); + int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount); if (outVertCount & 15) { // Zero padding for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) { diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 2df31f2fba96..053c4c31f55b 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -175,7 +175,7 @@ class DrawEngineCommon { void ApplyFramebufferRead(FBOTexState *fboTexState); - void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID); + void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount); void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count); static inline int IndexSize(u32 vtype) { diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index e1775722ee3e..ebe3d022df2a 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -371,7 +371,7 @@ void DrawEngineVulkan::Flush() { renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount); } if (useDepthRaster_) { - DepthRasterTransform(prim, dec_, dec_->VertexType()); + DepthRasterTransform(prim, dec_, dec_->VertexType(), vertexCount); } } else { PROFILE_THIS_SCOPE("soft"); From f886578c0e84cd28e482d3f3da9d4e7e80d543c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 20:08:33 +0100 Subject: [PATCH 20/28] DepthRaster: Fix backface culling --- GPU/Common/DepthRaster.cpp | 45 +++++++++++++++++++++++++------------- GPU/GPUState.h | 2 +- GPU/ge_constants.h | 5 +++++ 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 0810f1778cde..9532e7b3cad7 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -97,12 +97,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, verts[0].x = tx[0]; verts[0].y = ty[0]; verts[0].z = tz[0]; - verts[1].x = tx[2]; - verts[1].y = ty[2]; - verts[1].z = tz[2]; - verts[2].x = tx[1]; - verts[2].y = ty[1]; - verts[2].z = tz[1]; + verts[1].x = tx[1]; + verts[1].y = ty[1]; + verts[1].z = tz[1]; + verts[2].x = tx[2]; + verts[2].y = ty[2]; + verts[2].z = tz[2]; // use fixed-point only for X and Y. Avoid work for Z and W. int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX); @@ -249,6 +249,7 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) { bool cullEnabled = gstate.isCullEnabled(); + GECullMode cullMode = gstate.getCullMode(); // TODO: On ARM we can do better by keeping these in lanes instead of splatting. // However, hard to find a common abstraction. @@ -265,11 +266,16 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran bool cullCCW = false; int outCount = 0; + + int flipCull = 0; + if (cullEnabled && cullMode == GE_CULL_CW) { + flipCull = 3; + } for (int i = 0; i < count; i += 3) { const float *verts[3] = { transformed + indexBuffer[i] * 4, - transformed + indexBuffer[i + 1] * 4, - transformed + indexBuffer[i + 2] * 4, + transformed + indexBuffer[i + (1 ^ flipCull)] * 4, + transformed + indexBuffer[i + (2 ^ flipCull)] * 4, }; // Check if any vertex is behind the 0 plane. @@ -292,15 +298,24 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran y *= recipW; z *= recipW; - Vec4F32 screen[3]; - screen[0] = (x * viewportScaleX + viewportX) - offsetX; - screen[1] = (y * viewportScaleY + viewportY) - offsetY; - screen[2] = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f); + Vec4S32 screen[3]; + screen[0] = VecS32FromF32((x * viewportScaleX + viewportX) - offsetX); + screen[1] = VecS32FromF32((y * viewportScaleY + viewportY) - offsetY); + screen[2] = VecS32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f)); - VecS32FromF32(screen[0]).Store(tx + outCount); - VecS32FromF32(screen[1]).Store(ty + outCount); - VecS32FromF32(screen[2]).Store(tz + outCount); + screen[0].Store(tx + outCount); + screen[1].Store(ty + outCount); + screen[2].Store(tz + outCount); outCount += 3; + + if (!cullEnabled) { + // If culling is off, shuffle the three vectors to produce the opposite triangle, and store them after. + // Or on ARM we might be better off just storing individual elements... + screen[0].SwapLowerElements().Store(tx + outCount); + screen[1].SwapLowerElements().Store(ty + outCount); + screen[2].SwapLowerElements().Store(tz + outCount); + outCount += 3; + } } return outCount; } diff --git a/GPU/GPUState.h b/GPU/GPUState.h index b5318331750b..e2814d5a1ce7 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -227,7 +227,7 @@ struct GPUgstate { // Cull bool isCullEnabled() const { return cullfaceEnable & 1; } - int getCullMode() const { return cullmode & 1; } + GECullMode getCullMode() const { return (GECullMode)(cullmode & 1); } // Color Mask bool isClearModeColorMask() const { return (clearmode&0x100) != 0; } diff --git a/GPU/ge_constants.h b/GPU/ge_constants.h index 1d123e162093..f4a366f80250 100644 --- a/GPU/ge_constants.h +++ b/GPU/ge_constants.h @@ -623,6 +623,11 @@ enum GEPatchPrimType GE_PATCHPRIM_UNKNOWN = 3, }; +enum GECullMode { + GE_CULL_CW = 0, + GE_CULL_CCW = 1, +}; + inline GEPrimitiveType PatchPrimToPrim(GEPatchPrimType type) { switch (type) { case GE_PATCHPRIM_TRIANGLES: return GE_PRIM_TRIANGLES; From ad1809875a8f8da561171209c94aac135f32868a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 20:28:15 +0100 Subject: [PATCH 21/28] Minor sign check optimization --- GPU/Common/DepthRaster.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 9532e7b3cad7..ef7c4bc78f20 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -184,10 +184,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, beta += A1, gamma += A2) { - int mask = alpha >= 0 && beta >= 0 && gamma >= 0; + int mask = alpha | beta | gamma; // Early out if all of this quad's pixels are outside the triangle. - if (!mask) { + if (mask < 0) { continue; + } else { + mask = 1; } // Compute barycentric-interpolated depth. Could also compute it incrementally. float depth = zz[0] + beta * zz[1] + gamma * zz[2]; From d1b50ea543c4ad62a5c5b523d73653c37865252e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 01:19:10 +0100 Subject: [PATCH 22/28] Comment --- GPU/Common/DepthRaster.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index ef7c4bc78f20..1da6e6ac9a2b 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -184,12 +184,10 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, beta += A1, gamma += A2) { - int mask = alpha | beta | gamma; + int mask = (alpha | beta | gamma) >= 0; // Early out if all of this quad's pixels are outside the triangle. - if (mask < 0) { + if (!mask) { continue; - } else { - mask = 1; } // Compute barycentric-interpolated depth. Could also compute it incrementally. float depth = zz[0] + beta * zz[1] + gamma * zz[2]; @@ -312,7 +310,12 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran if (!cullEnabled) { // If culling is off, shuffle the three vectors to produce the opposite triangle, and store them after. - // Or on ARM we might be better off just storing individual elements... + + // HOWEVER! I realized that this is not the optimal layout, after all. + // We should group 4 triangles at a time and interleave them (so we first have all X of vertex 0, + // then all X of vertex 1, and so on). This seems solvable with another transpose, if we can easily + // collect four triangles at a time... + screen[0].SwapLowerElements().Store(tx + outCount); screen[1].SwapLowerElements().Store(ty + outCount); screen[2].SwapLowerElements().Store(tz + outCount); From 2051d55c90202570ae1170c114c3750745f45440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 10:56:56 +0100 Subject: [PATCH 23/28] CrossSIMD: Add a bunch more functonality for use by the rasterizer --- Common/Math/CrossSIMD.h | 100 ++++++++++++++++++++++++++++++++++---- Common/Math/SIMDHeaders.h | 18 +++++++ unittest/UnitTest.cpp | 7 ++- 3 files changed, 113 insertions(+), 12 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 945ea166b15e..f723ea57d6d6 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -39,13 +39,29 @@ struct Vec4S32 { _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1)) }; } + Vec4S32 SignBits32ToMask() { + return Vec4S32{ + _mm_srai_epi32(v, 31) + }; + } Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; } + Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; } + Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ _mm_and_si128(v, other.v) }; } + Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ _mm_xor_si128(v, other.v) }; } + // TODO: andnot + void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); } + void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); } + // NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow. Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0) }; +inline bool AnyZeroSignBit(Vec4S32 value) { + return _mm_movemask_ps(_mm_castsi128_ps(value.v)) != 0xF; +} + struct Vec4F32 { __m128 v; @@ -108,9 +124,8 @@ struct Vec4F32 { } }; -inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; } -inline Vec4S32 VecS32FromF32Round(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; } -inline Vec4F32 VecF32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; } +inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; } +inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; } struct Vec4U16 { __m128i v; // we only use the lower 64 bits. @@ -121,11 +136,36 @@ struct Vec4U16 { static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; } void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); } - static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; } - static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; } + // NOTE: 16-bit signed saturation! Will work for a lot of things, but not all. + static Vec4U16 FromVec4S32(Vec4S32 v) { + return Vec4U16{ _mm_packu_epi32_SSE2(v.v)}; + } + static Vec4U16 FromVec4F32(Vec4F32 v) { + return Vec4U16{ _mm_packu_epi32_SSE2(_mm_cvtps_epi32(v.v)) }; + } + + Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ _mm_or_si128(v, other.v) }; } + Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ _mm_and_si128(v, other.v) }; } + Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ _mm_xor_si128(v, other.v) }; } + + Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; } + Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; } Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; } }; +Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { + __m128i temp = _mm_srai_epi32(v.v, 31); + return Vec4U16 { + _mm_packs_epi32(temp, temp) + }; +} + +Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) { + return Vec4U16{ + _mm_andnot_si128(inverted.v, a.v) // NOTE: with andnot, the first parameter is inverted, and then and is performed. + }; +} + #elif PPSSPP_ARCH(ARM_NEON) struct Mat4F32 { @@ -163,6 +203,12 @@ struct Vec4S32 { Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; } Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; } Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; } + Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; } + Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; } + Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; } + + void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); } + void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); } }; struct Vec4F32 { @@ -178,14 +224,12 @@ struct Vec4F32 { static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes int16x4_t value = vld1_s16(src); - // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) }; } static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes int8x8_t value = vld1_s8(src); int16x4_t value16 = vget_low_s16(vmovl_s8(value)); - // 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) }; } @@ -244,7 +288,15 @@ struct Vec4F32 { } }; -inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; } +inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; } +inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ vcvtq_f32_s32(f.v) }; } + +inline bool AnyZeroSignBit(Vec4S32 value) { + // Very suboptimal, let's optimize later. + int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v)); + int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1); + return (mask & 0x80000000) != 0; +} struct Vec4U16 { uint16x4_t v; // we only use the lower 64 bits. @@ -254,15 +306,43 @@ struct Vec4U16 { static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; } void Store(uint16_t *mem) { vst1_u16(mem, v); } - static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmax_u16(a.v, b.v) }; } - static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmin_u16(a.v, b.v) }; } + static Vec4U16 FromVec4S32(Vec4S32 v) { + return Vec4U16{ vmovn_u16(v.v) }; + } + static Vec4U16 FromVec4F32(Vec4F32 v) { + return Vec4U16{ vmovn_u16(vcvtq_u32_f32(v.v)) }; + } + + Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; } + Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ vand_u16(v, other.v) }; } + Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ veor_u16(v, other.v) }; } + + Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; } + Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; } Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; } }; +Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { + int32x4_t sign_mask = vshrq_n_s32(v.v, 31); + uint16x4_t result = vreinterpret_u16_s16(vmovn_s32(sign_mask)); + return Vec4U16{ result }; +} + +Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) { + return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) }; +} + #else struct Vec4S32 { s32 v[4]; + + Vec4S32 operator +(Vec4S32 other) const { + return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } }; + } + Vec4S32 operator -(Vec4S32 other) const { + return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } }; + } }; #endif diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h index 3f8500dfb273..9705dca55434 100644 --- a/Common/Math/SIMDHeaders.h +++ b/Common/Math/SIMDHeaders.h @@ -118,6 +118,24 @@ inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) { return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0))); } +#ifdef __cplusplus + +alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 }; +alignas(16) static const uint32_t g_sign16[4] = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 }; + +// Alternate solution to the above, not sure if faster or slower. +// SSE2 replacement for half of _mm_packus_epi32 but without the saturation. +// Not ideal! pshufb would make this faster but that's SSSE3. +inline __m128i _mm_packu1_epi32_SSE2(const __m128i v0) { + // Toggle the sign bit, pack, then toggle back. + __m128i toggled = _mm_sub_epi32(v0, _mm_load_si128((const __m128i *)g_sign32)); + __m128i temp = _mm_packs_epi32(toggled, toggled); + __m128i restored = _mm_add_epi16(temp, _mm_load_si128((const __m128i *)g_sign16)); + return restored; +} + +#endif + // SSE2 replacement for the entire _mm_packus_epi32 but without the saturation. // Not ideal! pshufb would make this faster but that's SSSE3. inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) { diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp index fe3fba0af4b9..a087d205b96b 100644 --- a/unittest/UnitTest.cpp +++ b/unittest/UnitTest.cpp @@ -1114,12 +1114,15 @@ bool TestSIMD() { __m128i a = _mm_set_epi16(0, 0x4444, 0, 0x3333, 0, 0x2222, 0, 0x1111); __m128i b = _mm_set_epi16(0, (int16_t)0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555); __m128i c = _mm_packu2_epi32_SSE2(a, b); - __m128i d = _mm_packus_epi32(a, b); + __m128i d = _mm_packu1_epi32_SSE2(b); - uint64_t testdata2[2]; + uint64_t testdata2[4]; _mm_store_si128((__m128i *)testdata2, c); + _mm_store_si128((__m128i *)testdata2 + 1, d); EXPECT_EQ_INT(testdata2[0], 0x4444333322221111); EXPECT_EQ_INT(testdata2[1], 0x8888777766665555); + EXPECT_EQ_INT(testdata2[2], 0x8888777766665555); + EXPECT_EQ_INT(testdata2[2], 0x8888777766665555); #endif return true; } From 399570e4113b76f7b915389dc90209666e540da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 13:21:20 +0100 Subject: [PATCH 24/28] CrossSIMD: make the transpose function compatible with ARM32 --- Common/Math/CrossSIMD.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index f723ea57d6d6..982c859439c1 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -264,6 +264,8 @@ struct Vec4F32 { // One of many possible solutions. Sometimes we could also use vld4q_f32 probably.. static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) { +#if PPSSPP_ARCH(ARM64_NEON) + // Only works on ARM64 float32x4_t temp0 = vzip1q_f32(col0.v, col2.v); float32x4_t temp1 = vzip2q_f32(col0.v, col2.v); float32x4_t temp2 = vzip1q_f32(col1.v, col3.v); @@ -272,6 +274,14 @@ struct Vec4F32 { col1.v = vzip2q_f32(temp0, temp2); col2.v = vzip1q_f32(temp1, temp3); col3.v = vzip2q_f32(temp1, temp3); +#else + float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v); + float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v); + col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0])); + col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1])); + col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0])); + col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1])); +#endif } inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) { From 73ae6da75793ca00b25f272542a8536a2e6abe2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 11:28:29 +0100 Subject: [PATCH 25/28] Reimplement the depth rasterizer with SIMD. --- GPU/Common/DepthRaster.cpp | 263 ++++++++++++++++++++----------------- 1 file changed, 145 insertions(+), 118 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 1da6e6ac9a2b..7075ebd9f77f 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -8,7 +8,14 @@ #include "Common/Math/math_util.h" #include "GPU/Common/VertexDecoderCommon.h" -void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) { +// We only need to support these three modes. +enum class ZCompareMode { + Greater, // Most common + Less, // Less common + Always, // Fairly common +}; + +void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, ZCompareMode compareMode) { // Swap coordinates if needed, we don't back-face-cull rects. // We also ignore the UV rotation here. if (x1 > x2) { @@ -26,8 +33,8 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, for (int y = y1; y < y2; y++) { __m128i *ptr = (__m128i *)(dest + stride * y + x1); int w = x2 - x1; - switch (depthCompare) { - case GE_COMP_ALWAYS: + switch (compareMode) { + case ZCompareMode::Always: if (depthValue == 0) { memset(ptr, 0, w * 2); } else { @@ -39,8 +46,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, } break; // TODO: Trailer - case GE_COMP_NEVER: - break; default: // TODO break; @@ -53,8 +58,8 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, uint16_t *ptr = (uint16_t *)(dest + stride * y + x1); int w = x2 - x1; - switch (depthCompare) { - case GE_COMP_ALWAYS: + switch (compareMode) { + case ZCompareMode::Always: if (depthValue == 0) { memset(ptr, 0, w * 2); } else { @@ -66,8 +71,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, } break; // TODO: Trailer - case GE_COMP_NEVER: - break; default: // TODO break; @@ -78,10 +81,39 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, #endif } +alignas(16) static const int zero123[4] = {0, 1, 2, 3}; + +struct Edge { + // Dimensions of our pixel group + static const int stepXSize = 4; + static const int stepYSize = 1; + + Vec4S32 oneStepX; + Vec4S32 oneStepY; + + Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) { + // Edge setup + int A = v0y - v1y; + int B = v1x - v0x; + int C = v0x * v1y - v0y * v1x; + + // Step deltas + oneStepX = Vec4S32::Splat(A * stepXSize); + oneStepY = Vec4S32::Splat(B * stepYSize); + + // x/y values for initial pixel block. Add horizontal offsets. + Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123); + Vec4S32 y = Vec4S32::Splat(p0y); + + // Edge function values at origin + return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C); + } +}; + // Adapted from Intel's depth rasterizer example. // Started with the scalar version, will SIMD-ify later. // x1/y1 etc are the scissor rect. -void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, GEComparison compareMode) { +void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) { int tileStartX = x1; int tileEndX = x2; @@ -93,124 +125,90 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, // are slow on SSE2. // Convert to whole pixels for now. Later subpixel precision. - DepthScreenVertex verts[3]; - verts[0].x = tx[0]; - verts[0].y = ty[0]; - verts[0].z = tz[0]; - verts[1].x = tx[1]; - verts[1].y = ty[1]; - verts[1].z = tz[1]; - verts[2].x = tx[2]; - verts[2].y = ty[2]; - verts[2].z = tz[2]; + int v0x = tx[0]; + int v0y = ty[0]; + int v0z = tz[0]; + int v1x = tx[1]; + int v1y = ty[1]; + int v1z = tz[1]; + int v2x = tx[2]; + int v2y = ty[2]; + int v2z = tz[2]; // use fixed-point only for X and Y. Avoid work for Z and W. - int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX); - int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX); - - int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY); - int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY); - if (endX == startX || endY == startY) { + // We use 4x1 tiles for simplicity. + int minX = std::max(std::min(std::min(v0x, v1x), v2x), tileStartX) & ~3; + int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, tileEndX) & ~3; + int minY = std::max(std::min(std::min(v0y, v1y), v2y), tileStartY); + int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY); + if (maxX == minX || maxY == minY) { // No pixels, or outside screen. return; } + // TODO: Cull really small triangles here. - // Fab(x, y) = Ax + By + C = 0 - // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 - // Compute A = (ya - yb) for the 3 line segments that make up each triangle - int A0 = verts[1].y - verts[2].y; - int A1 = verts[2].y - verts[0].y; - int A2 = verts[0].y - verts[1].y; - - // Compute B = (xb - xa) for the 3 line segments that make up each triangle - int B0 = verts[2].x - verts[1].x; - int B1 = verts[0].x - verts[2].x; - int B2 = verts[1].x - verts[0].x; - - // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle - int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y; - int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y; - int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y; - - // Compute triangle area. - // TODO: Cull really small triangles here - we can just raise the comparison value below. - int triArea = A0 * verts[0].x + B0 * verts[0].y + C0; + Edge e01, e12, e20; + + Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY); + Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY); + Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY); + + int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y); if (triArea <= 0) { - // Too small to rasterize or backface culled - // NOTE: Just disabling this check won't enable two-sided rendering. - // Since it's not that common, let's just queue the triangles with both windings. return; } - - int rowIdx = (startY * stride + startX); - int col = startX; - int row = startY; - - // Calculate slopes at starting corner. - int alpha0 = (A0 * col) + (B0 * row) + C0; - int beta0 = (A1 * col) + (B1 * row) + C1; - int gamma0 = (A2 * col) + (B2 * row) + C2; - - float oneOverTriArea = (1.0f / float(triArea)); - - float zz[3]; - zz[0] = (float)verts[0].z; - zz[1] = (float)(verts[1].z - verts[0].z) * oneOverTriArea; - zz[2] = (float)(verts[2].z - verts[0].z) * oneOverTriArea; - - // END triangle setup. - - // Here we should draw four triangles in a sequence. - - // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) - for (int r = startY; r < endY; r++, - row++, - rowIdx += stride, - alpha0 += B0, - beta0 += B1, - gamma0 += B2) - { - int idx = rowIdx; - - // Restore row steppers. - int alpha = alpha0; - int beta = beta0; - int gamma = gamma0; - - for (int c = startX; c < endX; c++, - idx++, - alpha += A0, - beta += A1, - gamma += A2) - { - int mask = (alpha | beta | gamma) >= 0; - // Early out if all of this quad's pixels are outside the triangle. - if (!mask) { + float oneOverTriArea = 1.0f / (float)triArea; + + // Prepare to interpolate Z + Vec4F32 zz0 = Vec4F32::Splat((float)v0z); + Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea); + Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea); + + // Rasterize + for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) { + // Barycentric coordinates at start of row + Vec4S32 w0 = w0_row; + Vec4S32 w1 = w1_row; + Vec4S32 w2 = w2_row; + + uint16_t *rowPtr = depthBuf + stride * y; + + for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) { + // If p is on or inside all edges for any pixels, + // render those pixels. + Vec4S32 signCalc = w0 | w1 | w2; + if (!AnyZeroSignBit(signCalc)) { continue; } - // Compute barycentric-interpolated depth. Could also compute it incrementally. - float depth = zz[0] + beta * zz[1] + gamma * zz[2]; - float previousDepthValue = (float)depthBuf[idx]; - int depthMask; + Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x); + Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc); + // Now, the mask has 1111111 where we should preserve the contents of the depth buffer. + + // Compute the Z value for all four pixels. + // float depth = zz[0] + beta * zz[1] + gamma * zz[2]; + Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2); + + // TODO: Lift this switch out of the inner loop, or even out of the function with templating. switch (compareMode) { - case GE_COMP_EQUAL: depthMask = depth == previousDepthValue; break; - case GE_COMP_LESS: depthMask = depth < previousDepthValue; break; - case GE_COMP_LEQUAL: depthMask = depth <= previousDepthValue; break; - case GE_COMP_GEQUAL: depthMask = depth >= previousDepthValue; break; - case GE_COMP_GREATER: depthMask = depth > previousDepthValue; break; - case GE_COMP_NOTEQUAL: depthMask = depth != previousDepthValue; break; - case GE_COMP_ALWAYS: - default: - depthMask = 1; + case ZCompareMode::Greater: + // To implement the greater/greater-than comparison, we can combine mask and max. + // It might be better to do the math in float space on x86 due to SSE2 deficiencies. + // We use AndNot to zero out Z results, before doing Max with the buffer. + AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x); + break; + case ZCompareMode::Less: // UNTESTED + // This time, we OR the mask and use .Min. + (shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x); + break; + case ZCompareMode::Always: // UNTESTED + // This could be replaced with a vblend operation. + ((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x); break; } - int finalMask = mask & depthMask; - depth = finalMask == 1 ? depth : previousDepthValue; - depthBuf[idx] = (u16)depth; - } //for each column - } // for each row + } + } } void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) { @@ -299,9 +297,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran z *= recipW; Vec4S32 screen[3]; - screen[0] = VecS32FromF32((x * viewportScaleX + viewportX) - offsetX); - screen[1] = VecS32FromF32((y * viewportScaleY + viewportY) - offsetY); - screen[2] = VecS32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f)); + screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX); + screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY); + screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f)); screen[0].Store(tx + outCount); screen[1].Store(ty + outCount); @@ -341,12 +339,41 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr // Prim should now be either TRIANGLES or RECTs. _dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES); + // Ignore draws where stencil operations are active? + if (gstate.isStencilTestEnabled()) { + // return; + } + GEComparison compareMode = gstate.getDepthTestFunction(); + + ZCompareMode comp; + // Ignore some useless compare modes. + switch (compareMode) { + case GE_COMP_NEVER: + case GE_COMP_EQUAL: + // These will never have a useful effect in Z-only raster. + return; + case GE_COMP_ALWAYS: + comp = ZCompareMode::Always; + break; + case GE_COMP_LEQUAL: + case GE_COMP_LESS: + comp = ZCompareMode::Less; + break; + case GE_COMP_GEQUAL: + case GE_COMP_GREATER: + comp = ZCompareMode::Greater; // Most common + break; + case GE_COMP_NOTEQUAL: + // This is highly unusual, let's just ignore it. + return; + } + if (gstate.isModeClear()) { if (!gstate.isClearModeDepthMask()) { return; } - compareMode = GE_COMP_ALWAYS; + comp = ZCompareMode::Always; } else { if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()) return; @@ -358,12 +385,12 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr uint16_t z = tz[i + 1]; // depth from second vertex // TODO: Should clip coordinates to the scissor rectangle. // We remove the subpixel information here. - DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, compareMode); + DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp); } break; case GE_PRIM_TRIANGLES: for (int i = 0; i < count; i += 3) { - DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], compareMode); + DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp); } break; default: From 5df88fc1aab1feb92e60a6b14eec01f3b56e85b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 11:33:37 +0100 Subject: [PATCH 26/28] Convert the rect implementation to CrossSIMD --- Common/Math/CrossSIMD.h | 20 ++++++++++++++++++++ GPU/Common/DepthRaster.cpp | 33 ++------------------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 982c859439c1..83ac18add4ef 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -153,6 +153,16 @@ struct Vec4U16 { Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; } }; +struct Vec8U16 { + __m128i v; + + static Vec8U16 Zero() { return Vec8U16{ _mm_setzero_si128() }; } + static Vec8U16 Splat(uint16_t value) { return Vec8U16{ _mm_set1_epi16((int16_t)value) }; } + + static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ _mm_loadu_si128((__m128i *)mem) }; } + void Store(uint16_t *mem) { _mm_storeu_si128((__m128i *)mem, v); } +}; + Vec4U16 SignBits32ToMaskU16(Vec4S32 v) { __m128i temp = _mm_srai_epi32(v.v, 31); return Vec4U16 { @@ -342,6 +352,16 @@ Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) { return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) }; } +struct Vec8U16 { + uint16x8_t v; + + static Vec8U16 Zero() { return Vec8U16{ vdupq_n_u16(0) }; } + static Vec8U16 Splat(uint16_t value) { return Vec8U16{ vdupq_n_u16(value) }; } + + static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ vld1q_u16(mem) }; } + void Store(uint16_t *mem) { vst1q_u16(mem, v); } +}; + #else struct Vec4S32 { diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 7075ebd9f77f..4443974059e3 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -28,43 +28,17 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, return; } -#if PPSSPP_ARCH(SSE2) - __m128i valueX8 = _mm_set1_epi16(depthValue); - for (int y = y1; y < y2; y++) { - __m128i *ptr = (__m128i *)(dest + stride * y + x1); - int w = x2 - x1; - switch (compareMode) { - case ZCompareMode::Always: - if (depthValue == 0) { - memset(ptr, 0, w * 2); - } else { - while (w >= 8) { - _mm_storeu_si128(ptr, valueX8); - ptr++; - w -= 8; - } - } - break; - // TODO: Trailer - default: - // TODO - break; - } - } - -#elif PPSSPP_ARCH(ARM64_NEON) - uint16x8_t valueX8 = vdupq_n_u16(depthValue); + Vec8U16 valueX8 = Vec8U16::Splat(depthValue); for (int y = y1; y < y2; y++) { uint16_t *ptr = (uint16_t *)(dest + stride * y + x1); int w = x2 - x1; - switch (compareMode) { case ZCompareMode::Always: if (depthValue == 0) { memset(ptr, 0, w * 2); } else { while (w >= 8) { - vst1q_u16(ptr, valueX8); + valueX8.Store(ptr); ptr += 8; w -= 8; } @@ -76,9 +50,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, break; } } -#else - // Do nothing for now -#endif } alignas(16) static const int zero123[4] = {0, 1, 2, 3}; From 8cd86b47b5c49f39fd9490029bf8358133161a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 11:44:49 +0100 Subject: [PATCH 27/28] AnyZeroSignBit arm fix, more crosssimd fixes. Now works on ARM. --- Common/Math/CrossSIMD.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h index 83ac18add4ef..55e7b86fc7f5 100644 --- a/Common/Math/CrossSIMD.h +++ b/Common/Math/CrossSIMD.h @@ -309,19 +309,20 @@ struct Vec4F32 { }; inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; } -inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ vcvtq_f32_s32(f.v) }; } +inline Vec4F32 Vec4F32FromS32(Vec4S32 s) { return Vec4F32{ vcvtq_f32_s32(s.v) }; } inline bool AnyZeroSignBit(Vec4S32 value) { // Very suboptimal, let's optimize later. int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v)); int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1); - return (mask & 0x80000000) != 0; + return (mask & 0x80000000) == 0; } struct Vec4U16 { - uint16x4_t v; // we only use the lower 64 bits. + uint16x4_t v; // 64 bits. static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; } + static Vec4U16 Splat(uint16_t value) { return Vec4U16{ vdup_n_u16(value) }; } static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; } void Store(uint16_t *mem) { vst1_u16(mem, v); } @@ -330,7 +331,7 @@ struct Vec4U16 { return Vec4U16{ vmovn_u16(v.v) }; } static Vec4U16 FromVec4F32(Vec4F32 v) { - return Vec4U16{ vmovn_u16(vcvtq_u32_f32(v.v)) }; + return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(v.v))) }; } Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; } From 80cb57f8bb05d833b19ffac414cd71ddc7399109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Sat, 21 Dec 2024 12:57:12 +0100 Subject: [PATCH 28/28] Cleanup --- GPU/Common/DepthRaster.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 4443974059e3..ca8f81cedfb3 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -118,6 +118,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, } // TODO: Cull really small triangles here. + int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y); + if (triArea <= 0) { + return; + } + + float oneOverTriArea = 1.0f / (float)triArea; Edge e01, e12, e20; @@ -125,12 +131,6 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY); Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY); - int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y); - if (triArea <= 0) { - return; - } - float oneOverTriArea = 1.0f / (float)triArea; - // Prepare to interpolate Z Vec4F32 zz0 = Vec4F32::Splat((float)v0z); Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea); @@ -320,10 +320,6 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr ZCompareMode comp; // Ignore some useless compare modes. switch (compareMode) { - case GE_COMP_NEVER: - case GE_COMP_EQUAL: - // These will never have a useful effect in Z-only raster. - return; case GE_COMP_ALWAYS: comp = ZCompareMode::Always; break; @@ -335,8 +331,14 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr case GE_COMP_GREATER: comp = ZCompareMode::Greater; // Most common break; + case GE_COMP_NEVER: + case GE_COMP_EQUAL: + // These will never have a useful effect in Z-only raster. + [[fallthrough]]; case GE_COMP_NOTEQUAL: // This is highly unusual, let's just ignore it. + [[fallthrough]]; + default: return; }