From 58adb379ea0019f2b803c22962bd8c997809a0a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Wed, 18 Dec 2024 18:35:55 +0100
Subject: [PATCH 01/28] GE debugger improvements

---
 Common/GPU/DataFormat.h             |  1 +
 Common/GPU/Vulkan/thin3d_vulkan.cpp |  8 +++++-
 Common/GPU/thin3d.cpp               | 19 +++++++++++++
 Common/GPU/thin3d.h                 |  1 +
 UI/ImDebugger/ImDebugger.h          |  2 ++
 UI/ImDebugger/ImGe.cpp              | 44 +++++++++++++++++++++++++++--
 ext/imgui/imgui_impl_thin3d.cpp     |  4 +++
 7 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/Common/GPU/DataFormat.h b/Common/GPU/DataFormat.h
index 6f5cd2bc2eae..428146802a6c 100644
--- a/Common/GPU/DataFormat.h
+++ b/Common/GPU/DataFormat.h
@@ -74,6 +74,7 @@ bool DataFormatIsDepthStencil(DataFormat fmt);
 inline bool DataFormatIsColor(DataFormat fmt) {
 	return !DataFormatIsDepthStencil(fmt);
 }
+int DataFormatNumChannels(DataFormat fmt);
 bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize);
 
 // Limited format support for now.
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index e3d135e8c0b1..b744235b892f 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -803,9 +803,15 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanBarrierBatch *postBarriers, Vu
 	}
 
 	VkComponentMapping r8AsAlpha[4] = { VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_R };
+	VkComponentMapping r8AsColor[4] = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE };
 
+	VkComponentMapping *swizzle = nullptr;
+	switch (desc.swizzle) {
+	case TextureSwizzle::R8_AS_ALPHA: swizzle = r8AsAlpha; break;
+	case TextureSwizzle::R8_AS_GRAYSCALE: swizzle = r8AsColor; break;
+	}
 	VulkanBarrierBatch barrier;
-	if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, desc.swizzle == TextureSwizzle::R8_AS_ALPHA ? r8AsAlpha : nullptr)) {
+	if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, swizzle)) {
 		ERROR_LOG(Log::G3D,  "Failed to create VulkanTexture: %dx%dx%d fmt %d, %d levels", width_, height_, depth_, (int)vulkanFormat, mipLevels_);
 		return false;
 	}
diff --git a/Common/GPU/thin3d.cpp b/Common/GPU/thin3d.cpp
index de8db83fb331..72fbe873e86a 100644
--- a/Common/GPU/thin3d.cpp
+++ b/Common/GPU/thin3d.cpp
@@ -118,6 +118,25 @@ bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize) {
 	}
 }
 
+int DataFormatNumChannels(DataFormat fmt) {
+	switch (fmt) {
+	case DataFormat::D16:
+	case DataFormat::D32F:
+	case DataFormat::R8_UNORM:
+	case DataFormat::R16_UNORM:
+	case DataFormat::R16_FLOAT:
+	case DataFormat::R32_FLOAT:
+		return 1;
+	case DataFormat::R8G8B8A8_UNORM:
+	case DataFormat::R8G8B8A8_UNORM_SRGB:
+	case DataFormat::B8G8R8A8_UNORM:
+	case DataFormat::B8G8R8A8_UNORM_SRGB:
+		return 4;
+	default:
+		return 0;
+	}
+}
+
 RefCountedObject::~RefCountedObject() {
 	const int rc = refcount_.load();
 	_dbg_assert_msg_(rc == 0xDEDEDE, "Unexpected refcount %d in object of type '%s'", rc, name_);
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index ce62382718e2..f7a23795b017 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -643,6 +643,7 @@ typedef std::function<bool(uint8_t *data, const uint8_t *initData, uint32_t w, u
 enum class TextureSwizzle {
 	DEFAULT,
 	R8_AS_ALPHA,
+	R8_AS_GRAYSCALE,
 };
 
 struct TextureDesc {
diff --git a/UI/ImDebugger/ImDebugger.h b/UI/ImDebugger/ImDebugger.h
index f1363d6a0f5f..5a8f84cf2d88 100644
--- a/UI/ImDebugger/ImDebugger.h
+++ b/UI/ImDebugger/ImDebugger.h
@@ -170,12 +170,14 @@ enum class ImCmd {
 	SHOW_IN_CPU_DISASM,
 	SHOW_IN_GE_DISASM,
 	SHOW_IN_MEMORY_VIEWER,  // param is address, param2 is viewer index
+	SHOW_IN_PIXEL_VIEWER,  // param is address, param2 is stride, |0x80000000 if depth, param3 is w/h
 };
 
 struct ImCommand {
 	ImCmd cmd;
 	uint32_t param;
 	uint32_t param2;
+	uint32_t param3;
 };
 
 struct ImControl {
diff --git a/UI/ImDebugger/ImGe.cpp b/UI/ImDebugger/ImGe.cpp
index 415cd0a7e133..9503c7a08e30 100644
--- a/UI/ImDebugger/ImGe.cpp
+++ b/UI/ImDebugger/ImGe.cpp
@@ -149,12 +149,39 @@ void ImGePixelViewerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInte
 		if (ImGui::Button("Refresh")) {
 			viewer_.Snapshot();
 		}
+		if (ImGui::Button("Show cur depth")) {
+			viewer_.addr = gstate.getDepthBufRawAddress() | 0x04000000;
+			viewer_.format = GE_FORMAT_DEPTH16;
+			viewer_.stride = gstate.DepthBufStride();
+			viewer_.width = viewer_.stride;
+			viewer_.Snapshot();
+		}
+		if (ImGui::Button("Show cur color")) {
+			viewer_.addr = gstate.getFrameBufAddress();
+			viewer_.format = gstate.FrameBufFormat();
+			viewer_.stride = gstate.FrameBufStride();
+			viewer_.width = viewer_.stride;
+			viewer_.Snapshot();
+		}
 	}
 	ImGui::EndChild();
 
 	ImGui::SameLine();
 	if (ImGui::BeginChild("right")) {
+		ImVec2 p0 = ImGui::GetCursorScreenPos();
 		viewer_.Draw(gpuDebug, draw);
+		if (ImGui::IsItemHovered()) {
+			int x = (int)(ImGui::GetMousePos().x - p0.x);
+			int y = (int)(ImGui::GetMousePos().y - p0.y);
+			char temp[128];
+			if (viewer_.FormatValueAt(temp, sizeof(temp), x, y)) {
+				ImGui::Text("(%d, %d): %s", x, y, temp);
+			} else {
+				ImGui::Text("%d, %d: N/A", x, y);
+			}
+		} else {
+			ImGui::TextUnformatted("(no pixel hovered)");
+		}
 	}
 	ImGui::EndChild();
 	ImGui::End();
@@ -211,6 +238,12 @@ bool ImGePixelViewer::FormatValueAt(char *buf, size_t bufSize, int x, int y) con
 		snprintf(buf, bufSize, "%08x (raw: %04x)", RGBA5551ToRGBA8888(raw), raw);
 		break;
 	}
+	case GE_FORMAT_DEPTH16:
+	{
+		u16 raw = Memory::Read_U16(pixelAddr);
+		snprintf(buf, bufSize, "%0.4f (raw: %04x / %d)", (float)raw / 65535.0f, raw, raw);
+		break;
+	}
 	default:
 		snprintf(buf, bufSize, "N/A");
 		return false;
@@ -356,6 +389,7 @@ bool ImGeReadbackViewer::Draw(GPUDebugInterface *gpuDebug, Draw::DrawContext *dr
 			readbackFmt_ = Draw::DataFormat::R8G8B8A8_UNORM;
 			break;
 		case Draw::Aspect::DEPTH_BIT:
+			// TODO: Add fallback
 			readbackFmt_ = Draw::DataFormat::D32F;
 			break;
 		case Draw::Aspect::STENCIL_BIT:
@@ -385,14 +419,15 @@ bool ImGeReadbackViewer::Draw(GPUDebugInterface *gpuDebug, Draw::DrawContext *dr
 				}
 			}
 
+			Draw::DataFormat fmt = rbBpp == 1 ? Draw::DataFormat::R8_UNORM : Draw::DataFormat::R32_FLOAT;
 			Draw::TextureDesc desc{ Draw::TextureType::LINEAR2D,
-				rbBpp == 1 ? Draw::DataFormat::R8_UNORM : Draw::DataFormat::R32_FLOAT,
+				fmt,
 				(int)w,
 				(int)h,
 				1,
 				1,
 				false,
-				rbBpp == 1 ? Draw::TextureSwizzle::R8_AS_ALPHA : Draw::TextureSwizzle::DEFAULT,
+				Draw::DataFormatNumChannels(fmt) == 1 ? Draw::TextureSwizzle::R8_AS_GRAYSCALE: Draw::TextureSwizzle::DEFAULT,
 				"PixelViewer temp",
 				{ texData },
 				nullptr,
@@ -432,7 +467,9 @@ bool ImGeReadbackViewer::FormatValueAt(char *buf, size_t bufSize, int x, int y)
 	case Draw::DataFormat::D32F:
 	{
 		const float *read = (const float *)(data_ + offset);
-		snprintf(buf, bufSize, "%0.4f", *read);
+		float value = *read;
+		int ivalue = *read * 65535.0f;
+		snprintf(buf, bufSize, "%0.4f (raw: %04x / %d)", *read, ivalue, ivalue);
 		return true;
 	}
 	case Draw::DataFormat::S8:
@@ -1060,6 +1097,7 @@ void ImGeDebuggerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInterfa
 					DrawPreviewPrimitive(drawList, p0, previewPrim_, previewIndices_, previewVertices_, previewCount_, true, texW, texH);
 
 					drawList->PopClipRect();
+
 				} else {
 					ImGui::Text("(no valid texture bound)");
 					// In software mode, we should just decode the texture here.
diff --git a/ext/imgui/imgui_impl_thin3d.cpp b/ext/imgui/imgui_impl_thin3d.cpp
index bd302a55d51b..4c1ecba7a8e3 100644
--- a/ext/imgui/imgui_impl_thin3d.cpp
+++ b/ext/imgui/imgui_impl_thin3d.cpp
@@ -114,6 +114,10 @@ void ImGui_ImplThin3d_RenderDrawData(ImDrawData* draw_data, Draw::DrawContext *d
 				boundSampler = bd->fontSampler;
 			} else {
 				size_t index = (size_t)pcmd->TextureId - TEX_ID_OFFSET;
+				if (index >= bd->tempTextures.size()) {
+					WARN_LOG(Log::System, "Missing temp texture %d (out of %d)", index, (int)bd->tempTextures.size());
+					continue;
+				}
 				_dbg_assert_(index < bd->tempTextures.size());
 				switch (bd->tempTextures[index].type) {
 				case RegisteredTextureType::Framebuffer:

From b442183259f5ff8e5a872b5a466602238686574a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 19 Dec 2024 10:12:58 +0100
Subject: [PATCH 02/28] Add "Realtime" checkbox to pixel viewer

---
 UI/ImDebugger/ImDebugger.cpp | 1 +
 UI/ImDebugger/ImDebugger.h   | 1 +
 UI/ImDebugger/ImGe.cpp       | 5 +++++
 UI/ImDebugger/ImGe.h         | 2 --
 4 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/UI/ImDebugger/ImDebugger.cpp b/UI/ImDebugger/ImDebugger.cpp
index 5f14a8529c0d..9cb082cf68bf 100644
--- a/UI/ImDebugger/ImDebugger.cpp
+++ b/UI/ImDebugger/ImDebugger.cpp
@@ -1552,4 +1552,5 @@ void ImConfig::SyncConfig(IniFile *ini, bool save) {
 
 	sync.SetSection(ini->GetOrCreateSection("Settings"));
 	sync.Sync("displayLatched", &displayLatched, false);
+	sync.Sync("realtimePixelPreview", &realtimePixelPreview, false);
 }
diff --git a/UI/ImDebugger/ImDebugger.h b/UI/ImDebugger/ImDebugger.h
index 5a8f84cf2d88..c0fa9a037d56 100644
--- a/UI/ImDebugger/ImDebugger.h
+++ b/UI/ImDebugger/ImDebugger.h
@@ -153,6 +153,7 @@ struct ImConfig {
 	int selectedMemCheck = -1;
 	uint64_t selectedTexAddr = 0;
 
+	bool realtimePixelPreview = false;
 	int breakCount = 0;
 
 	bool displayLatched = false;
diff --git a/UI/ImDebugger/ImGe.cpp b/UI/ImDebugger/ImGe.cpp
index 9503c7a08e30..52783719a545 100644
--- a/UI/ImDebugger/ImGe.cpp
+++ b/UI/ImDebugger/ImGe.cpp
@@ -163,9 +163,14 @@ void ImGePixelViewerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInte
 			viewer_.width = viewer_.stride;
 			viewer_.Snapshot();
 		}
+		ImGui::Checkbox("Realtime", &cfg.realtimePixelPreview);
 	}
 	ImGui::EndChild();
 
+	if (cfg.realtimePixelPreview) {
+		viewer_.Snapshot();
+	}
+
 	ImGui::SameLine();
 	if (ImGui::BeginChild("right")) {
 		ImVec2 p0 = ImGui::GetCursorScreenPos();
diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h
index c642b620efb7..227b0f0c1e52 100644
--- a/UI/ImDebugger/ImGe.h
+++ b/UI/ImDebugger/ImGe.h
@@ -124,8 +124,6 @@ class ImGePixelViewerWindow {
 	}
 
 private:
-	void UpdateTexture(Draw::DrawContext *draw);
-
 	ImGePixelViewer viewer_;
 };
 

From c5ad81e3d513f47b1873f05b98d185c0a0e9710b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Tue, 17 Dec 2024 12:58:33 +0100
Subject: [PATCH 03/28] Add DepthRaster.cpp/h. Rasterize depth rectangles, some
 triangles

---
 CMakeLists.txt                      |   2 +
 Common/Math/CrossSIMD.h             |   1 -
 Core/Compatibility.cpp              |   1 +
 Core/Compatibility.h                |   1 +
 GPU/Common/DepthRaster.cpp          | 367 ++++++++++++++++++++++++++++
 GPU/Common/DepthRaster.h            |  12 +
 GPU/Common/DrawEngineCommon.h       |   5 +
 GPU/Common/VertexDecoderCommon.h    |   6 +-
 GPU/GPU.vcxproj                     |   2 +
 GPU/GPU.vcxproj.filters             |   6 +
 GPU/GPUCommonHW.cpp                 |   9 +
 UI/ImDebugger/ImGe.h                |   6 +-
 UWP/GPU_UWP/GPU_UWP.vcxproj         |   4 +-
 UWP/GPU_UWP/GPU_UWP.vcxproj.filters |   4 +-
 android/jni/Android.mk              |   1 +
 assets/compat.ini                   |   4 +-
 libretro/Makefile.common            |   1 +
 17 files changed, 422 insertions(+), 10 deletions(-)
 create mode 100644 GPU/Common/DepthRaster.cpp
 create mode 100644 GPU/Common/DepthRaster.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1860a2b5332..fcf74005e7f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1906,6 +1906,8 @@ set(GPU_SOURCES
 	GPU/Common/Draw2D.cpp
 	GPU/Common/Draw2D.h
 	GPU/Common/DepthBufferCommon.cpp
+	GPU/Common/DepthRaster.cpp
+	GPU/Common/DepthRaster.h
 	GPU/Common/TextureShaderCommon.cpp
 	GPU/Common/TextureShaderCommon.h
 	GPU/Common/DepalettizeShaderCommon.cpp
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 6ad03b832217..94b2d3933b52 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -5,4 +5,3 @@
 #pragma once
 
 #include "Common/Math/SIMDHeaders.h"
-
diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp
index 379042198011..cd2c50c48835 100644
--- a/Core/Compatibility.cpp
+++ b/Core/Compatibility.cpp
@@ -149,6 +149,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "DisableMemcpySlicing", &flags_.DisableMemcpySlicing);
 	CheckSetting(iniFile, gameID, "ForceEnableGPUReadback", &flags_.ForceEnableGPUReadback);
 	CheckSetting(iniFile, gameID, "UseFFMPEGFindStreamInfo", &flags_.UseFFMPEGFindStreamInfo);
+	CheckSetting(iniFile, gameID, "SoftwareRasterDepth", &flags_.SoftwareRasterDepth);
 }
 
 void Compatibility::CheckVRSettings(IniFile &iniFile, const std::string &gameID) {
diff --git a/Core/Compatibility.h b/Core/Compatibility.h
index 8a0e33af4d34..4688df37c055 100644
--- a/Core/Compatibility.h
+++ b/Core/Compatibility.h
@@ -112,6 +112,7 @@ struct CompatFlags {
 	bool DisableMemcpySlicing;
 	bool ForceEnableGPUReadback;
 	bool UseFFMPEGFindStreamInfo;
+	bool SoftwareRasterDepth;
 };
 
 struct VRCompat {
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
new file mode 100644
index 000000000000..20ac069f8248
--- /dev/null
+++ b/GPU/Common/DepthRaster.cpp
@@ -0,0 +1,367 @@
+#include <algorithm>
+
+#include "Common/Math/CrossSIMD.h"
+#include "GPU/Common/DepthRaster.h"
+#include "GPU/Math3D.h"
+#include "Common/Math/math_util.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+
+// TODO: Should respect the scissor rect.
+
+struct ScreenVert {
+	int x;
+	int y;
+	uint16_t z;
+	uint16_t behind;
+};
+
+void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
+	// Swap coordinates if needed, we don't back-face-cull rects.
+	// We also ignore the UV rotation here.
+	if (x1 > x2) {
+		std::swap(x1, x2);
+	}
+	if (y1 > y2) {
+		std::swap(y1, y2);
+	}
+	if (x1 == x2 || y1 == y2) {
+		return;
+	}
+
+	__m128i valueX8 = _mm_set1_epi16(depthValue);
+
+#if PPSSPP_ARCH(SSE2)
+	for (int y = y1; y < y2; y++) {
+		__m128i *ptr = (__m128i *)(dest + stride * y + x1);
+		int w = x2 - x1;
+
+		switch (depthCompare) {
+		case GE_COMP_ALWAYS:
+			while (w >= 8) {
+				_mm_storeu_si128(ptr, valueX8);
+				ptr++;
+				w -= 8;
+			}
+			break;
+			// TODO: Trailer
+		case GE_COMP_NEVER:
+			break;
+		default:
+			// TODO
+			break;
+		}
+	}
+#elif PPSSPP_ARCH(ARM64_NEON)
+
+#else
+	// Do nothing for now
+#endif
+}
+
+using namespace Math3D;
+struct int2 {
+	int x, y;
+	int2(float a, float b) {
+		x = (int)(a + 0.5f);
+		y = (int)(b + 0.5f);
+	}
+};
+
+// Adapted from Intel's depth rasterizer example.
+// Started with the scalar version, will SIMD-ify later.
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
+	int tileStartX = x1;
+	int tileEndX = x2;
+
+	int tileStartY = y1;
+	int tileEndY = y2;
+
+	// Convert to whole pixels for now. Later subpixel precision.
+	ScreenVert verts[3];
+	verts[0].x = vertsSub[0].x >> 4;
+	verts[0].y = vertsSub[0].y >> 4;
+	verts[0].z = vertsSub[0].z;
+	verts[1].x = vertsSub[2].x >> 4;
+	verts[1].y = vertsSub[2].y >> 4;
+	verts[1].z = vertsSub[2].z;
+	verts[2].x = vertsSub[1].x >> 4;
+	verts[2].y = vertsSub[1].y >> 4;
+	verts[2].z = vertsSub[1].z;
+
+	// use fixed-point only for X and Y.  Avoid work for Z and W.
+	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX) & int(0xFFFFFFFE);
+	int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX);
+
+	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY) & int(0xFFFFFFFE);
+	int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
+
+	if (endX == startX || endY == startY) {
+		// No pixels
+		return;
+	}
+
+	// Fab(x, y) =     Ax       +       By     +      C              = 0
+	// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
+	// Compute A = (ya - yb) for the 3 line segments that make up each triangle
+	int A0 = verts[1].y - verts[2].y;
+	int A1 = verts[2].y - verts[0].y;
+	int A2 = verts[0].y - verts[1].y;
+
+	// Compute B = (xb - xa) for the 3 line segments that make up each triangle
+	int B0 = verts[2].x - verts[1].x;
+	int B1 = verts[0].x - verts[2].x;
+	int B2 = verts[1].x - verts[0].x;
+
+	// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
+	int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y;
+	int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
+	int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
+
+	// Compute triangle area
+	int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
+	if (triArea <= 0) {
+		// Too small to rasterize or backface culled
+		// NOTE: Just disabling this check won't enable two-sided rendering.
+		// Since it's not that common, let's just queue the triangles with both windings.
+		return;
+	}
+
+	float oneOverTriArea = (1.0f / float(triArea));
+
+	float zz[3];
+	for (int vv = 0; vv < 3; vv++) {
+		zz[vv] = (float)verts[vv].z * oneOverTriArea;
+	}
+
+	int rowIdx = (startY * stride + startX);
+	int col = startX;
+	int row = startY;
+
+	// Calculate slopes at starting corner.
+	int alpha0 = (A0 * col) + (B0 * row) + C0;
+	int beta0 = (A1 * col) + (B1 * row) + C1;
+	int gama0 = (A2 * col) + (B2 * row) + C2;
+
+	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
+	for (int r = startY; r < endY; r++,
+		row++,
+		rowIdx += stride,
+		alpha0 += B0,
+		beta0 += B1,
+		gama0 += B2)
+	{
+		// Compute barycentric coordinates 
+		int idx = rowIdx;
+		int alpha = alpha0;
+		int beta = beta0;
+		int gama = gama0;
+
+		for (int c = startX; c < endX; c++,
+			idx++,
+			alpha += A0,
+			beta += A1,
+			gama += A2)
+		{
+			int mask = alpha >= 0 && beta >= 0 && gama >= 0;
+			// Early out if all of this quad's pixels are outside the triangle.
+			if (!mask) {
+				continue;
+			}
+			// Compute barycentric-interpolated depth
+			float depth = alpha * zz[0] + beta * zz[1] + gama * zz[2];
+			float previousDepthValue = (float)depthBuf[idx];
+
+			int depthMask;
+			switch (compareMode) {
+			case GE_COMP_EQUAL:  depthMask = depth == previousDepthValue; break;
+			case GE_COMP_LESS: depthMask = depth < previousDepthValue; break;
+			case GE_COMP_LEQUAL: depthMask = depth <= previousDepthValue; break;
+			case GE_COMP_GEQUAL: depthMask = depth >= previousDepthValue; break;
+			case GE_COMP_GREATER: depthMask = depth > previousDepthValue; break;
+			case GE_COMP_NOTEQUAL: depthMask = depth != previousDepthValue; break;
+			case GE_COMP_ALWAYS:
+			default:
+				depthMask = 1;
+				break;
+			}
+			int finalMask = mask & depthMask;
+			depth = finalMask == 1 ? depth : previousDepthValue;
+			depthBuf[idx] = (u16)depth;
+		} //for each column
+	} // for each row
+}
+
+// We ignore lots of primitive types for now.
+void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData,
+	const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) {
+
+	GEComparison compareMode = gstate.getDepthTestFunction();
+	if (gstate.isModeClear()) {
+		if (!gstate.isClearModeDepthMask()) {
+			return;
+		}
+		compareMode = GE_COMP_ALWAYS;
+	} else {
+		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
+			return;
+	}
+
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	// TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without
+	// running the full decoder.
+	if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) {
+		return;
+	}
+
+	bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
+
+	// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
+	float *verts = (float *)bufferData;
+	ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8);
+
+	// Simple, most common case.
+	int vertexStride = dec->VertexSize();
+	int offset = dec->posoff;
+	float factor = 1.0f;
+	switch (vertTypeID & GE_VTYPE_POS_MASK) {
+	case GE_VTYPE_POS_8BIT:
+		if (!isThroughMode) {
+			factor = 1.0f / 128.0f;
+		}
+		for (int i = 0; i < count; i++) {
+			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * factor;
+			}
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+		if (!isThroughMode) {
+			factor = 1.0f / 32768.0f;
+		}
+		for (int i = 0; i < count; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
+			for (int j = 0; j < 3; j++) {
+				verts[i * 3 + j] = data[j] * factor;
+			}
+		}
+		break;
+	case GE_VTYPE_POS_FLOAT:
+		for (int i = 0; i < count; i++)
+			memcpy(&verts[i * 3], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
+		break;
+	}
+
+	// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
+	if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
+		// TODO: This is very suboptimal. This should be one matrix multiplication per vertex.
+
+		float viewportX = gstate.getViewportXCenter();
+		float viewportY = gstate.getViewportYCenter();
+		float viewportZ = gstate.getViewportZCenter();
+		float viewportScaleX = gstate.getViewportXScale();
+		float viewportScaleY = gstate.getViewportYScale();
+		float viewportScaleZ = gstate.getViewportZScale();
+		
+		bool allBehind = true;
+
+		for (int i = 0; i < count; i++) {
+			float world[3];
+			float view[3];
+			float proj[4];
+			Vec3ByMatrix43(world, verts + i * 3, gstate.worldMatrix);
+			Vec3ByMatrix43(view, world, gstate.viewMatrix);
+			Vec3ByMatrix44(proj, view, gstate.projMatrix);  // TODO: Include adjustments to the proj matrix?
+
+			float w = proj[3];
+
+			bool inFront = w > 0.0f;
+			screenVerts[i].behind = !inFront;
+			if (inFront) {
+				allBehind = false;
+			}
+
+			// Clip to the w=0 plane.
+			proj[0] /= w;
+			proj[1] /= w;
+			proj[2] /= w;
+
+			// Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport
+			// and offset params.
+			float screen[3];
+			screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
+			screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
+			screen[2] = (proj[2] * viewportScaleZ + viewportZ);
+			if (screen[2] < 0.0f) {
+				screen[2] = 0.0f;
+			}
+			if (screen[2] >= 65535.0f) {
+				screen[2] = 65535.0f;
+			}
+			screenVerts[i].x = screen[0];
+			screenVerts[i].y = screen[1];
+			screenVerts[i].z = screen[2];
+		}
+		if (allBehind) {
+			// Cull the whole draw.
+			return;
+		}
+	} else {
+		for (int i = 0; i < count; i++) {
+			screenVerts[i].x = (int)verts[i * 3 + 0] << 4;
+			screenVerts[i].y = (int)verts[i * 3 + 1] << 4;
+			screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f);
+		}
+	}
+
+	// Then we need to stitch primitives from strips, etc etc...
+	// For now we'll just do it tri by tri. Later let's be more efficient.
+	
+	switch (prim) {
+	case GE_PRIM_RECTANGLES:
+		for (int i = 0; i < count / 2; i++) {
+			uint16_t z = screenVerts[i + 1].z;  // depth from second vertex
+			// We remove the subpixel information here.
+			DepthRasterRect(depth, depthStride, screenVerts[i].x >> 4, screenVerts[i].y >> 4, screenVerts[i + 1].x >> 4, screenVerts[i + 1].y >> 4,
+				z, compareMode);
+		}
+		break;
+	case GE_PRIM_TRIANGLES:
+		for (int i = 0; i < count / 3; i++) {
+			if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) {
+				continue;
+			}
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
+		}
+		break;
+	case GE_PRIM_TRIANGLE_STRIP:
+	{
+		int wind = 2;
+		for (int i = 0; i < count - 2; i++) {
+			int i0 = i;
+			int i1 = i + wind;
+			wind ^= 3;
+			int i2 = i + wind;
+			if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) {
+				continue;
+			}
+			ScreenVert v[3];
+			v[0] = screenVerts[i0];
+			v[1] = screenVerts[i1];
+			v[2] = screenVerts[i2];
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode);
+		}
+		break;
+	}
+	}
+}
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
new file mode 100644
index 000000000000..01fa60e257d1
--- /dev/null
+++ b/GPU/Common/DepthRaster.h
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "Common/CommonTypes.h"
+#include "GPU/ge_constants.h"
+
+// Specialized, very limited depth-only rasterizer.
+// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
+// for effects like lens flare.
+// So, we can be quite inaccurate without any issues, and skip a lot of functionality.
+
+class VertexDecoder;
+void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise);
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 0f8ab8a7515a..595ab929aab4 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -158,6 +158,11 @@ class DrawEngineCommon {
 		_dbg_assert_(numDrawVerts_ == 0 && numDrawInds_ == 0);
 	}
 
+	// temporary hack
+	uint8_t *GetTempSpace() {
+		return decoded_ + 12 * 65536;
+	}
+
 protected:
 	virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
 	void UpdatePlanes();
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 2793ad80eed5..4b905f03c956 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -122,7 +122,7 @@ class IndexConverter {
 // Reads decoded vertex formats in a convenient way. For software transform and debugging.
 class VertexReader {
 public:
-	VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
+	VertexReader(const u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
 
 	void ReadPos(float pos[3]) const {
 		// Only DEC_FLOAT_3 is supported.
@@ -297,8 +297,8 @@ class VertexReader {
 	}
 
 private:
-	u8 *base_;
-	u8 *data_;
+	const u8 *base_;
+	const u8 *data_;
 	DecVtxFormat decFmt_;
 	int vtype_;
 };
diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj
index 5cb3ea9e6238..c27d08354936 100644
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@@ -346,6 +346,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="..\ext\xbrz\xbrz.h" />
+    <ClInclude Include="Common\DepthRaster.h" />
     <ClInclude Include="Common\ReplacedTexture.h" />
     <ClInclude Include="Common\TextureReplacer.h" />
     <ClInclude Include="Common\TextureShaderCommon.h" />
@@ -468,6 +469,7 @@
   <ItemGroup>
     <ClCompile Include="..\ext\xbrz\xbrz.cpp" />
     <ClCompile Include="Common\DepthBufferCommon.cpp" />
+    <ClCompile Include="Common\DepthRaster.cpp" />
     <ClCompile Include="Common\ReplacedTexture.cpp" />
     <ClCompile Include="Common\TextureReplacer.cpp" />
     <ClCompile Include="Common\TextureShaderCommon.cpp" />
diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters
index 610ba94cbe33..1529b974c13f 100644
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@@ -279,6 +279,9 @@
     <ClInclude Include="Debugger\State.h">
       <Filter>Debugger</Filter>
     </ClInclude>
+    <ClInclude Include="Common\DepthRaster.h">
+      <Filter>Common</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="Math3D.cpp">
@@ -554,6 +557,9 @@
     <ClCompile Include="Debugger\State.cpp">
       <Filter>Debugger</Filter>
     </ClCompile>
+    <ClCompile Include="Common\DepthRaster.cpp">
+      <Filter>Common</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <FxCompile Include="..\assets\shaders\tex_4xbrz.csh">
diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp
index 9b5389c8750a..f5383cf6be59 100644
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@@ -13,6 +13,7 @@
 #include "GPU/Common/DrawEngineCommon.h"
 #include "GPU/Common/TextureCacheCommon.h"
 #include "GPU/Common/FramebufferManagerCommon.h"
+#include "GPU/Common/DepthRaster.h"
 
 struct CommonCommandTableEntry {
 	uint8_t cmd;
@@ -1039,6 +1040,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 	if (passCulling) {
 		if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) {
 			canExtend = false;
+		} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
+			DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+				gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
+				verts, inds, prim, count, decoder, vertTypeID, false);
 		}
 		onePassed = true;
 	} else {
@@ -1117,6 +1122,10 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 			if (passCulling) {
 				if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) {
 					canExtend = false;
+				} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
+					DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+						gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
+						verts, inds, newPrim, count, decoder, vertTypeID, clockwise);
 				}
 				// As soon as one passes, assume we don't need to check the rest of this batch.
 				onePassed = true;
diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h
index 227b0f0c1e52..41dc6a8b1f94 100644
--- a/UI/ImDebugger/ImGe.h
+++ b/UI/ImDebugger/ImGe.h
@@ -71,14 +71,14 @@ struct ImGePixelViewer : public PixelLookup {
 	}
 	bool FormatValueAt(char *buf, size_t bufSize, int x, int y) const override;
 
-	uint32_t addr = 0x04000000;
+	uint32_t addr = 0x04110000;
 	uint16_t stride = 512;
 	uint16_t width = 480;
 	uint16_t height = 272;
-	GEBufferFormat format = GE_FORMAT_565;
+	GEBufferFormat format = GE_FORMAT_DEPTH16;
 	bool useAlpha = false;
 	bool showAlpha = false;
-	float scale = 1.0f;
+	float scale = 20.0f;
 
 private:
 	void UpdateTexture(Draw::DrawContext *draw);
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj
index 7bb4b346bd8f..a7ba27a14097 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj
@@ -109,6 +109,7 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\GPU\Common\DepthRaster.h" />
     <ClInclude Include="..\..\GPU\Common\ReplacedTexture.h" />
     <ClInclude Include="..\..\GPU\Common\TextureReplacer.h" />
     <ClInclude Include="..\..\GPU\Common\TextureShaderCommon.h" />
@@ -177,6 +178,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\GPU\Common\DepthBufferCommon.cpp" />
+    <ClCompile Include="..\..\GPU\Common\DepthRaster.cpp" />
     <ClCompile Include="..\..\GPU\Common\ReplacedTexture.cpp" />
     <ClCompile Include="..\..\GPU\Common\TextureReplacer.cpp" />
     <ClCompile Include="..\..\GPU\Common\TextureShaderCommon.cpp" />
@@ -261,4 +263,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
index 84b4c5d39630..31d14b549feb 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
@@ -80,6 +80,7 @@
     <ClCompile Include="..\..\GPU\Debugger\GECommandTable.cpp">
       <Filter>Debugger</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\GPU\Common\DepthRaster.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\GPU\Common\DepalettizeShaderCommon.h" />
@@ -163,10 +164,11 @@
     <ClInclude Include="..\..\GPU\Debugger\GECommandTable.h">
       <Filter>Debugger</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\GPU\Common\DepthRaster.h" />
   </ItemGroup>
   <ItemGroup>
     <Filter Include="Debugger">
       <UniqueIdentifier>{49bcf7f6-518a-4ecd-af55-bda3a344efe7}</UniqueIdentifier>
     </Filter>
   </ItemGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index 10ab9a5f77a3..dbd88097886f 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -530,6 +530,7 @@ EXEC_AND_LIB_FILES := \
   $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \
   $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \
   $(SRC)/GPU/Common/DepthBufferCommon.cpp \
+  $(SRC)/GPU/Common/DepthRaster.cpp \
   $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \
   $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \
   $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \
diff --git a/assets/compat.ini b/assets/compat.ini
index 8a374753efb2..0452c5b85484 100644
--- a/assets/compat.ini
+++ b/assets/compat.ini
@@ -1228,8 +1228,10 @@ ULJS19067 = true
 ULAS42247 = true
 ULAS42318 = true
 
+[SoftwareRasterDepth]
+
 [DisableFirstFrameReadback]
-# Wipeout Pure: Temporary workaround for lens flare flicker. See #13344
+# Wipeout Pure
 UCUS98612 = true
 UCJS10007 = true
 UCES00001 = true
diff --git a/libretro/Makefile.common b/libretro/Makefile.common
index 804a1d72199f..c1cb5a454fed 100644
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@@ -543,6 +543,7 @@ SOURCES_CXX += \
 	$(GPUDIR)/Common/TextureScalerCommon.cpp \
 	$(GPUDIR)/Common/SoftwareTransformCommon.cpp \
 	$(GPUDIR)/Common/DepthBufferCommon.cpp \
+	$(GPUDIR)/Common/DepthRaster.cpp \
 	$(GPUDIR)/Common/StencilCommon.cpp \
 	$(GPUDIR)/Software/TransformUnit.cpp \
 	$(GPUDIR)/Software/SoftGpu.cpp \

From d27d8c9dae7497bec335fe6caf0a8f86475e4cc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 19 Dec 2024 09:36:50 +0100
Subject: [PATCH 04/28] Remove subpixel precision. Some sketching.

---
 GPU/Common/DepthRaster.cpp | 198 ++++++++++++++++++++++++++++++-------
 1 file changed, 161 insertions(+), 37 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 20ac069f8248..afeb4b64a107 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cstring>
 
 #include "Common/Math/CrossSIMD.h"
 #include "GPU/Common/DepthRaster.h"
@@ -6,7 +7,91 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
-// TODO: Should respect the scissor rect.
+#if PPSSPP_ARCH(SSE2)
+
+struct Vec4S32 {
+	__m128i v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ _mm_add_epi32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ _mm_sub_epi32(v, other.v) };
+	}
+	// This is really bad if we restrict ourselves to SSE2 only.
+	// If we have SSE4, we can do _mm_mullo_epi32.
+	// Let's avoid using it as much as possible.
+	// https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse
+	Vec4S32 operator *(Vec4S32 other) const {
+		__m128i a13 = _mm_shuffle_epi32(v, 0xF5);          // (-,a3,-,a1)
+		__m128i b13 = _mm_shuffle_epi32(other.v, 0xF5);          // (-,b3,-,b1)
+		__m128i prod02 = _mm_mul_epu32(v, other.v);                 // (-,a2*b2,-,a0*b0)
+		__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
+		__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);   // (-,-,a1*b1,a0*b0) 
+		__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);   // (-,-,a3*b3,a2*b2) 
+		return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) };   // (ab3,ab2,ab1,ab0)
+	}
+};
+
+struct Vec4F32 {
+	__m128 v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ _mm_add_ps(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ _mm_sub_ps(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ _mm_mul_ps(v, other.v) };
+	}
+};
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+struct Vec4S32 {
+	uint32x4_t v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ vaddq_s32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ vsubq_s32(v, other.v) };
+	}
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ vmulq_s32(v, other.v) };
+	}
+};
+
+struct Vec4F32 {
+	float32x4_t v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ vaddq_f32(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ vsubq_f32(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ vmulq_f32(v, other.v) };
+	}
+};
+
+#else
+
+struct Vec4S32 {
+	s32 v[4];
+};
+
+#endif
 
 struct ScreenVert {
 	int x;
@@ -28,19 +113,21 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 		return;
 	}
 
-	__m128i valueX8 = _mm_set1_epi16(depthValue);
-
 #if PPSSPP_ARCH(SSE2)
+	__m128i valueX8 = _mm_set1_epi16(depthValue);
 	for (int y = y1; y < y2; y++) {
 		__m128i *ptr = (__m128i *)(dest + stride * y + x1);
 		int w = x2 - x1;
-
 		switch (depthCompare) {
 		case GE_COMP_ALWAYS:
-			while (w >= 8) {
-				_mm_storeu_si128(ptr, valueX8);
-				ptr++;
-				w -= 8;
+			if (depthValue == 0) {
+				memset(ptr, 0, w * 2);
+			} else {
+				while (w >= 8) {
+					_mm_storeu_si128(ptr, valueX8);
+					ptr++;
+					w -= 8;
+				}
 			}
 			break;
 			// TODO: Trailer
@@ -51,8 +138,33 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 			break;
 		}
 	}
+
 #elif PPSSPP_ARCH(ARM64_NEON)
+	uint16x8_t valueX8 = vdupq_n_u16(depthValue);
+	for (int y = y1; y < y2; y++) {
+		uint16_t *ptr = (uint16_t *)(dest + stride * y + x1);
+		int w = x2 - x1;
 
+		switch (depthCompare) {
+		case GE_COMP_ALWAYS:
+			if (depthValue == 0) {
+				memset(ptr, 0, w * 2);
+			} else {
+				while (w >= 8) {
+					vst1q_u16(ptr, valueX8);
+					ptr += 8;
+					w -= 8;
+				}
+			}
+			break;
+			// TODO: Trailer
+		case GE_COMP_NEVER:
+			break;
+		default:
+			// TODO
+			break;
+		}
+	}
 #else
 	// Do nothing for now
 #endif
@@ -69,6 +181,7 @@ struct int2 {
 
 // Adapted from Intel's depth rasterizer example.
 // Started with the scalar version, will SIMD-ify later.
+// x1/y1 etc are the scissor rect.
 void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
 	int tileStartX = x1;
 	int tileEndX = x2;
@@ -76,29 +189,33 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	int tileStartY = y1;
 	int tileEndY = y2;
 
+	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
+	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
+	// are slow on SSE2.
+
 	// Convert to whole pixels for now. Later subpixel precision.
 	ScreenVert verts[3];
-	verts[0].x = vertsSub[0].x >> 4;
-	verts[0].y = vertsSub[0].y >> 4;
+	verts[0].x = vertsSub[0].x;
+	verts[0].y = vertsSub[0].y;
 	verts[0].z = vertsSub[0].z;
-	verts[1].x = vertsSub[2].x >> 4;
-	verts[1].y = vertsSub[2].y >> 4;
+	verts[1].x = vertsSub[2].x;
+	verts[1].y = vertsSub[2].y;
 	verts[1].z = vertsSub[2].z;
-	verts[2].x = vertsSub[1].x >> 4;
-	verts[2].y = vertsSub[1].y >> 4;
+	verts[2].x = vertsSub[1].x;
+	verts[2].y = vertsSub[1].y;
 	verts[2].z = vertsSub[1].z;
 
 	// use fixed-point only for X and Y.  Avoid work for Z and W.
-	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX) & int(0xFFFFFFFE);
+	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX);
 	int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX);
 
-	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY) & int(0xFFFFFFFE);
+	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY);
 	int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
-
 	if (endX == startX || endY == startY) {
 		// No pixels
 		return;
 	}
+	// TODO: Cull really small triangles here.
 
 	// Fab(x, y) =     Ax       +       By     +      C              = 0
 	// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
@@ -126,13 +243,6 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 		return;
 	}
 
-	float oneOverTriArea = (1.0f / float(triArea));
-
-	float zz[3];
-	for (int vv = 0; vv < 3; vv++) {
-		zz[vv] = (float)verts[vv].z * oneOverTriArea;
-	}
-
 	int rowIdx = (startY * stride + startX);
 	int col = startX;
 	int row = startY;
@@ -140,7 +250,15 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	// Calculate slopes at starting corner.
 	int alpha0 = (A0 * col) + (B0 * row) + C0;
 	int beta0 = (A1 * col) + (B1 * row) + C1;
-	int gama0 = (A2 * col) + (B2 * row) + C2;
+	int gamma0 = (A2 * col) + (B2 * row) + C2;
+
+	float oneOverTriArea = (1.0f / float(triArea));
+
+	// END triangle setup.
+	float zz[3];
+	for (int vv = 0; vv < 3; vv++) {
+		zz[vv] = (float)verts[vv].z * oneOverTriArea;
+	}
 
 	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
 	for (int r = startY; r < endY; r++,
@@ -148,27 +266,28 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 		rowIdx += stride,
 		alpha0 += B0,
 		beta0 += B1,
-		gama0 += B2)
+		gamma0 += B2)
 	{
-		// Compute barycentric coordinates 
 		int idx = rowIdx;
+
+		// Restore row steppers.
 		int alpha = alpha0;
 		int beta = beta0;
-		int gama = gama0;
+		int gamma = gamma0;
 
 		for (int c = startX; c < endX; c++,
 			idx++,
 			alpha += A0,
 			beta += A1,
-			gama += A2)
+			gamma += A2)
 		{
-			int mask = alpha >= 0 && beta >= 0 && gama >= 0;
+			int mask = alpha >= 0 && beta >= 0 && gamma >= 0;
 			// Early out if all of this quad's pixels are outside the triangle.
 			if (!mask) {
 				continue;
 			}
 			// Compute barycentric-interpolated depth
-			float depth = alpha * zz[0] + beta * zz[1] + gama * zz[2];
+			float depth = alpha * zz[0] + beta * zz[1] + gamma * zz[2];
 			float previousDepthValue = (float)depthBuf[idx];
 
 			int depthMask;
@@ -224,6 +343,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 	}
 
 	bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
+	bool cullEnabled = false;
+	bool cullCCW = false;
 
 	// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
 	float *verts = (float *)bufferData;
@@ -244,7 +365,7 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 				verts[i * 3 + j] = data[j] * factor;
 			}
 		}
-		break;
+		break; 
 	case GE_VTYPE_POS_16BIT:
 		if (!isThroughMode) {
 			factor = 1.0f / 32768.0f;
@@ -264,6 +385,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 
 	// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
 	if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
+		cullEnabled = gstate.isCullEnabled();
+
 		// TODO: This is very suboptimal. This should be one matrix multiplication per vertex.
 
 		float viewportX = gstate.getViewportXCenter();
@@ -308,8 +431,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 			if (screen[2] >= 65535.0f) {
 				screen[2] = 65535.0f;
 			}
-			screenVerts[i].x = screen[0];
-			screenVerts[i].y = screen[1];
+			screenVerts[i].x = screen[0] * (1.0f / 16.0f);  // We ditch the subpixel precision here.
+			screenVerts[i].y = screen[1] * (1.0f / 16.0f);
 			screenVerts[i].z = screen[2];
 		}
 		if (allBehind) {
@@ -318,8 +441,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 		}
 	} else {
 		for (int i = 0; i < count; i++) {
-			screenVerts[i].x = (int)verts[i * 3 + 0] << 4;
-			screenVerts[i].y = (int)verts[i * 3 + 1] << 4;
+			screenVerts[i].x = (int)verts[i * 3 + 0];
+			screenVerts[i].y = (int)verts[i * 3 + 1];
 			screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f);
 		}
 	}
@@ -331,8 +454,9 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 	case GE_PRIM_RECTANGLES:
 		for (int i = 0; i < count / 2; i++) {
 			uint16_t z = screenVerts[i + 1].z;  // depth from second vertex
+			// TODO: Should clip coordinates to the scissor rectangle.
 			// We remove the subpixel information here.
-			DepthRasterRect(depth, depthStride, screenVerts[i].x >> 4, screenVerts[i].y >> 4, screenVerts[i + 1].x >> 4, screenVerts[i + 1].y >> 4,
+			DepthRasterRect(depth, depthStride, screenVerts[i].x, screenVerts[i].y, screenVerts[i + 1].x, screenVerts[i + 1].y,
 				z, compareMode);
 		}
 		break;

From 09afe363ca963df852da031ae3dc0d845817903f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 19 Dec 2024 10:23:30 +0100
Subject: [PATCH 05/28] One less operation in the inner loop

---
 GPU/Common/DepthRaster.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index afeb4b64a107..af20c9c89a41 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -254,11 +254,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 
 	float oneOverTriArea = (1.0f / float(triArea));
 
-	// END triangle setup.
 	float zz[3];
-	for (int vv = 0; vv < 3; vv++) {
-		zz[vv] = (float)verts[vv].z * oneOverTriArea;
-	}
+	zz[0] = (float)verts[0].z;
+	zz[1] = (float)(verts[1].z - verts[0].z) * oneOverTriArea;
+	zz[2] = (float)(verts[2].z - verts[0].z) * oneOverTriArea;
+
+	// END triangle setup.
 
 	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
 	for (int r = startY; r < endY; r++,
@@ -287,7 +288,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 				continue;
 			}
 			// Compute barycentric-interpolated depth
-			float depth = alpha * zz[0] + beta * zz[1] + gamma * zz[2];
+			float depth = zz[0] + beta * zz[1] + gamma * zz[2];
 			float previousDepthValue = (float)depthBuf[idx];
 
 			int depthMask;

From 72c954d8c31176ca2502ba74af4c13d57f8a8c4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Thu, 19 Dec 2024 10:38:56 +0100
Subject: [PATCH 06/28] Add convenient wrappers

---
 GPU/Common/DepthRaster.cpp | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index af20c9c89a41..2bb83831bbb5 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <cstring>
+#include <cstdint>
 
 #include "Common/Math/CrossSIMD.h"
 #include "GPU/Common/DepthRaster.h"
@@ -18,18 +19,9 @@ struct Vec4S32 {
 	Vec4S32 operator -(Vec4S32 other) const {
 		return Vec4S32{ _mm_sub_epi32(v, other.v) };
 	}
-	// This is really bad if we restrict ourselves to SSE2 only.
-	// If we have SSE4, we can do _mm_mullo_epi32.
-	// Let's avoid using it as much as possible.
-	// https://stackoverflow.com/questions/17264399/fastest-way-to-multiply-two-vectors-of-32bit-integers-in-c-with-sse
+	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const {
-		__m128i a13 = _mm_shuffle_epi32(v, 0xF5);          // (-,a3,-,a1)
-		__m128i b13 = _mm_shuffle_epi32(other.v, 0xF5);          // (-,b3,-,b1)
-		__m128i prod02 = _mm_mul_epu32(v, other.v);                 // (-,a2*b2,-,a0*b0)
-		__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
-		__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);   // (-,-,a1*b1,a0*b0) 
-		__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);   // (-,-,a3*b3,a2*b2) 
-		return Vec4S32{ _mm_unpacklo_epi64(prod01, prod23) };   // (ab3,ab2,ab1,ab0)
+		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };   // (ab3,ab2,ab1,ab0)
 	}
 };
 
@@ -234,7 +226,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
 	int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
 
-	// Compute triangle area
+	// Compute triangle area.
+	// TODO: Cull really small triangles here - we can just raise the comparison value below.
 	int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
 	if (triArea <= 0) {
 		// Too small to rasterize or backface culled
@@ -287,7 +280,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 			if (!mask) {
 				continue;
 			}
-			// Compute barycentric-interpolated depth
+			// Compute barycentric-interpolated depth. Could also compute it incrementally.
 			float depth = zz[0] + beta * zz[1] + gamma * zz[2];
 			float previousDepthValue = (float)depthBuf[idx];
 

From c92b3b6521cc0a6976ec2a3c0bf918cab0f5e62f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 08:48:16 +0100
Subject: [PATCH 07/28] Move prototype cross simd wrapper structs to
 CrossSIMD.h

---
 Common/Data/Convert/ColorConv.cpp |  4 +-
 Common/Math/CrossSIMD.h           | 96 +++++++++++++++++++++++++++++++
 Common/Math/SIMDHeaders.h         | 22 +++++++
 Common/Math/fast/fast_matrix.c    |  2 -
 GPU/Common/DepthRaster.cpp        | 77 -------------------------
 5 files changed, 120 insertions(+), 81 deletions(-)

diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp
index 72fac52f2f01..5c4df7fca808 100644
--- a/Common/Data/Convert/ColorConv.cpp
+++ b/Common/Data/Convert/ColorConv.cpp
@@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 }
 
 #if PPSSPP_ARCH(SSE2)
-// fp64's improved version, see #19751
+// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
 static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
 		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c1 = _mm_load_si128(&srcp[i + 1]);
 
-		__m128i rb0 = _mm_and_si128(c0, maskRB);              // 00000000bbbbb00000000000rrrrr000
+		__m128i rb0 = _mm_and_si128(c0, maskRB);              // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
 		__m128i rb1 = _mm_and_si128(c1, maskRB);              // 00000000bbbbb00000000000rrrrr000
 		__m128i ga0 = _mm_and_si128(c0, maskGA);              // a000000000000000ggggg00000000000
 		__m128i ga1 = _mm_and_si128(c1, maskGA);              // a000000000000000ggggg00000000000
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 94b2d3933b52..11ed21702486 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -5,3 +5,99 @@
 #pragma once
 
 #include "Common/Math/SIMDHeaders.h"
+
+#if PPSSPP_ARCH(SSE2)
+
+struct Vec4S32 {
+	__m128i v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ _mm_add_epi32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ _mm_sub_epi32(v, other.v) };
+	}
+	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };   // (ab3,ab2,ab1,ab0)
+	}
+};
+
+struct Vec4F32 {
+	__m128 v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ _mm_add_ps(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ _mm_sub_ps(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ _mm_mul_ps(v, other.v) };
+	}
+};
+
+struct Vec4U16 {
+	__m128i v;  // we only use the lower 64 bits.
+	static Vec4U16 Load(void *mem) {
+		return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) };
+	}
+	void Store(void *mem) {
+		_mm_storel_epi64((__m128i *)mem, v);
+	}
+	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
+		return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
+	}
+	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
+		return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
+	}
+	Vec4U16 CompareLT(Vec4U16 other) {
+		return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
+	}
+};
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+struct Vec4S32 {
+	int32x4_t v;
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ vaddq_s32(v, other.v) };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ vsubq_s32(v, other.v) };
+	}
+	Vec4S32 operator *(Vec4S32 other) const {
+		return Vec4S32{ vmulq_s32(v, other.v) };
+	}
+};
+
+struct Vec4F32 {
+	float32x4_t v;
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ vcvtq_f32_s32(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const {
+		return Vec4F32{ vaddq_f32(v, other.v) };
+	}
+	Vec4F32 operator -(Vec4F32 other) const {
+		return Vec4F32{ vsubq_f32(v, other.v) };
+	}
+	Vec4F32 operator *(Vec4F32 other) const {
+		return Vec4F32{ vmulq_f32(v, other.v) };
+	}
+};
+
+#else
+
+struct Vec4S32 {
+	s32 v[4];
+};
+
+#endif
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 8e812a7819e6..3f8500dfb273 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -128,4 +128,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
 }
 
+// The below are not real SSE instructions in any generation, but should exist.
+
+// Return 0xFFFF where x <= y, else 0x0000.
+inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
+	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
+}
+
+// Return 0xFFFF where x >= y, else 0x0000.
+inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
+	return _mm_cmple_epu16(y, x);
+}
+
+// Return 0xFFFF where x > y, else 0x0000.
+inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
+	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
+}
+
+// Return 0xFFFF where x < y, else 0x0000.
+inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
+	return _mm_cmpgt_epu16(y, x);
+}
+
 #endif
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index 0402f366297e..d23ce3b0e0b2 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -6,8 +6,6 @@
 
 #if PPSSPP_ARCH(SSE2)
 
-#include "fast_matrix.h"
-
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 2bb83831bbb5..8068df066f29 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -8,83 +8,6 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
-#if PPSSPP_ARCH(SSE2)
-
-struct Vec4S32 {
-	__m128i v;
-
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ _mm_add_epi32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ _mm_sub_epi32(v, other.v) };
-	}
-	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };   // (ab3,ab2,ab1,ab0)
-	}
-};
-
-struct Vec4F32 {
-	__m128 v;
-
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
-
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ _mm_add_ps(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ _mm_sub_ps(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ _mm_mul_ps(v, other.v) };
-	}
-};
-
-#elif PPSSPP_ARCH(ARM_NEON)
-
-struct Vec4S32 {
-	uint32x4_t v;
-
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ vaddq_s32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ vsubq_s32(v, other.v) };
-	}
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ vmulq_s32(v, other.v) };
-	}
-};
-
-struct Vec4F32 {
-	float32x4_t v;
-
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
-
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ vaddq_f32(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ vsubq_f32(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ vmulq_f32(v, other.v) };
-	}
-};
-
-#else
-
-struct Vec4S32 {
-	s32 v[4];
-};
-
-#endif
-
 struct ScreenVert {
 	int x;
 	int y;

From c7f0eabc6509002d440c428d4069b254278542e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 08:58:37 +0100
Subject: [PATCH 08/28] DepthRaster: Premultiply world-view-proj matrices

---
 GPU/Common/DepthRaster.cpp | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 8068df066f29..da85632c38ed 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -85,15 +85,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 #endif
 }
 
-using namespace Math3D;
-struct int2 {
-	int x, y;
-	int2(float a, float b) {
-		x = (int)(a + 0.5f);
-		y = (int)(b + 0.5f);
-	}
-};
-
 // Adapted from Intel's depth rasterizer example.
 // Started with the scalar version, will SIMD-ify later.
 // x1/y1 etc are the scissor rect.
@@ -127,7 +118,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY);
 	int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
 	if (endX == startX || endY == startY) {
-		// No pixels
+		// No pixels, or outside screen.
 		return;
 	}
 	// TODO: Cull really small triangles here.
@@ -300,6 +291,15 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 		break;
 	}
 
+	float world[16];
+	float view[16];
+	float worldview[16];
+	float worldviewproj[16];
+	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+	Matrix4ByMatrix4(worldview, world, view);
+	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
+
 	// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
 	if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
 		cullEnabled = gstate.isCullEnabled();
@@ -316,12 +316,8 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 		bool allBehind = true;
 
 		for (int i = 0; i < count; i++) {
-			float world[3];
-			float view[3];
 			float proj[4];
-			Vec3ByMatrix43(world, verts + i * 3, gstate.worldMatrix);
-			Vec3ByMatrix43(view, world, gstate.viewMatrix);
-			Vec3ByMatrix44(proj, view, gstate.projMatrix);  // TODO: Include adjustments to the proj matrix?
+			Vec3ByMatrix44(proj, verts + i * 3, worldviewproj);  // TODO: Include adjustments to the proj matrix?
 
 			float w = proj[3];
 

From dd315182723a0848977ea12c71c89e2c63d00397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 09:09:18 +0100
Subject: [PATCH 09/28] DepthRaster: Merge the decode and transform steps

---
 GPU/Common/DepthRaster.cpp | 118 +++++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 45 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index da85632c38ed..7faefdb61114 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -255,56 +255,26 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 	bool cullCCW = false;
 
 	// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
-	float *verts = (float *)bufferData;
+	float *transformed = (float *)bufferData;
+
 	ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8);
 
 	// Simple, most common case.
 	int vertexStride = dec->VertexSize();
 	int offset = dec->posoff;
-	float factor = 1.0f;
-	switch (vertTypeID & GE_VTYPE_POS_MASK) {
-	case GE_VTYPE_POS_8BIT:
-		if (!isThroughMode) {
-			factor = 1.0f / 128.0f;
-		}
-		for (int i = 0; i < count; i++) {
-			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
-			for (int j = 0; j < 3; j++) {
-				verts[i * 3 + j] = data[j] * factor;
-			}
-		}
-		break; 
-	case GE_VTYPE_POS_16BIT:
-		if (!isThroughMode) {
-			factor = 1.0f / 32768.0f;
-		}
-		for (int i = 0; i < count; i++) {
-			const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
-			for (int j = 0; j < 3; j++) {
-				verts[i * 3 + j] = data[j] * factor;
-			}
-		}
-		break;
-	case GE_VTYPE_POS_FLOAT:
-		for (int i = 0; i < count; i++)
-			memcpy(&verts[i * 3], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
-		break;
-	}
-
-	float world[16];
-	float view[16];
-	float worldview[16];
-	float worldviewproj[16];
-	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
-	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-	Matrix4ByMatrix4(worldview, world, view);
-	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);
 
 	// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
 	if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
-		cullEnabled = gstate.isCullEnabled();
+		float world[16];
+		float view[16];
+		float worldview[16];
+		float worldviewproj[16];
+		ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+		ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+		Matrix4ByMatrix4(worldview, world, view);
+		Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);   // TODO: Include adjustments to the proj matrix?
 
-		// TODO: This is very suboptimal. This should be one matrix multiplication per vertex.
+		cullEnabled = gstate.isCullEnabled();
 
 		float viewportX = gstate.getViewportXCenter();
 		float viewportY = gstate.getViewportYCenter();
@@ -315,9 +285,39 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 		
 		bool allBehind = true;
 
+		float temp[3];
+		for (int i = 0; i < count; i++) {
+			switch (vertTypeID & GE_VTYPE_POS_MASK) {
+			case GE_VTYPE_POS_8BIT:
+				for (int i = 0; i < count; i++) {
+					const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+					for (int j = 0; j < 3; j++) {
+						temp[j] = data[j] * (1.0f / 128.0f);  // TODO: Can we bake this factor in somewhere?
+					}
+					Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
+				}
+				break;
+			case GE_VTYPE_POS_16BIT:
+				for (int i = 0; i < count; i++) {
+					const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
+					for (int j = 0; j < 3; j++) {
+						temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
+					}
+					Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
+				}
+				break;
+			case GE_VTYPE_POS_FLOAT:
+				for (int i = 0; i < count; i++) {
+					const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
+					Vec3ByMatrix44(transformed + i * 4, data, worldviewproj);
+				}
+				break;
+			}
+		}
+
 		for (int i = 0; i < count; i++) {
 			float proj[4];
-			Vec3ByMatrix44(proj, verts + i * 3, worldviewproj);  // TODO: Include adjustments to the proj matrix?
+			memcpy(proj, transformed + i * 4, 4 * sizeof(float));
 
 			float w = proj[3];
 
@@ -353,10 +353,38 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 			return;
 		}
 	} else {
+		float factor = 1.0f;
+		switch (vertTypeID & GE_VTYPE_POS_MASK) {
+		case GE_VTYPE_POS_8BIT:
+			for (int i = 0; i < count; i++) {
+				const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+				for (int j = 0; j < 3; j++) {
+					transformed[i * 4 + j] = data[j] * factor;
+				}
+				transformed[i * 4 + 3] = 1.0f;
+			}
+			break;
+		case GE_VTYPE_POS_16BIT:
+			for (int i = 0; i < count; i++) {
+				const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
+				for (int j = 0; j < 3; j++) {
+					transformed[i * 4 + j] = data[j] * factor;
+				}
+				transformed[i * 4 + 3] = 1.0f;
+			}
+			break;
+		case GE_VTYPE_POS_FLOAT:
+			for (int i = 0; i < count; i++) {
+				memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
+				transformed[i * 4 + 3] = 1.0f;
+			}
+			break;
+		}
+
 		for (int i = 0; i < count; i++) {
-			screenVerts[i].x = (int)verts[i * 3 + 0];
-			screenVerts[i].y = (int)verts[i * 3 + 1];
-			screenVerts[i].z = (u16)clamp_value(verts[i * 3 + 2], 0.0f, 65535.0f);
+			screenVerts[i].x = (int)transformed[i * 4 + 0];
+			screenVerts[i].y = (int)transformed[i * 4 + 1];
+			screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f);
 		}
 	}
 

From bdb5f3a91b64d5db894338bff574ba1cbba9c93f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 10:30:23 +0100
Subject: [PATCH 10/28] Reorganize the depth vertex pipeline for future
 optimizations

---
 GPU/Common/DepthRaster.cpp      | 267 +++++++++++---------------------
 GPU/Common/DepthRaster.h        |  13 +-
 GPU/Common/DrawEngineCommon.cpp |  74 ++++++++-
 GPU/Common/DrawEngineCommon.h   |  15 ++
 GPU/GPUCommonHW.cpp             |   9 --
 GPU/Vulkan/DrawEngineVulkan.cpp |   9 ++
 6 files changed, 201 insertions(+), 186 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 7faefdb61114..e99160f89bc3 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -8,13 +8,6 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
-struct ScreenVert {
-	int x;
-	int y;
-	uint16_t z;
-	uint16_t behind;
-};
-
 void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
 	// Swap coordinates if needed, we don't back-face-cull rects.
 	// We also ignore the UV rotation here.
@@ -88,7 +81,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 // Adapted from Intel's depth rasterizer example.
 // Started with the scalar version, will SIMD-ify later.
 // x1/y1 etc are the scissor rect.
-void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) {
 	int tileStartX = x1;
 	int tileEndX = x2;
 
@@ -100,7 +93,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	// are slow on SSE2.
 
 	// Convert to whole pixels for now. Later subpixel precision.
-	ScreenVert verts[3];
+	DepthScreenVertex verts[3];
 	verts[0].x = vertsSub[0].x;
 	verts[0].y = vertsSub[0].y;
 	verts[0].z = vertsSub[0].z;
@@ -218,179 +211,123 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	} // for each row
 }
 
-// We ignore lots of primitive types for now.
-void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData,
-	const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) {
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) {
+	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
+	_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
 
-	GEComparison compareMode = gstate.getDepthTestFunction();
-	if (gstate.isModeClear()) {
-		if (!gstate.isClearModeDepthMask()) {
-			return;
-		}
-		compareMode = GE_COMP_ALWAYS;
-	} else {
-		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
-			return;
-	}
+	int vertexStride = dec->VertexSize();
+	int offset = dec->posoff;
 
-	switch (prim) {
-	case GE_PRIM_INVALID:
-	case GE_PRIM_KEEP_PREVIOUS:
-	case GE_PRIM_LINES:
-	case GE_PRIM_LINE_STRIP:
-	case GE_PRIM_POINTS:
-		return;
-	default:
+	float temp[3];
+	switch (vertTypeID & GE_VTYPE_POS_MASK) {
+	case GE_VTYPE_POS_8BIT:
+		for (int i = 0; i < count; i++) {
+			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+			for (int j = 0; j < 3; j++) {
+				temp[j] = data[j] * (1.0f / 128.0f);  // TODO: Can we bake this factor in somewhere?
+			}
+			Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+		for (int i = 0; i < count; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
+			for (int j = 0; j < 3; j++) {
+				temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
+			}
+			Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
+		}
+		break;
+	case GE_VTYPE_POS_FLOAT:
+		for (int i = 0; i < count; i++) {
+			const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
+			Vec3ByMatrix44(dest + i * 4, data, worldviewproj);
+		}
 		break;
 	}
+}
 
-	// TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without
-	// running the full decoder.
-	if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) {
-		return;
+void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) {
+	for (int i = 0; i < count; i++) {
+		screenVerts[i].x = (int)transformed[i].pos[0];
+		screenVerts[i].y = (int)transformed[i].pos[1];
+		screenVerts[i].z = (u16)transformed[i].pos[2];
 	}
+}
 
-	bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
-	bool cullEnabled = false;
-	bool cullCCW = false;
-
-	// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
-	float *transformed = (float *)bufferData;
+int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) {
+	bool cullEnabled = gstate.isCullEnabled();
 
-	ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8);
+	const float viewportX = gstate.getViewportXCenter();
+	const float viewportY = gstate.getViewportYCenter();
+	const float viewportZ = gstate.getViewportZCenter();
+	const float viewportScaleX = gstate.getViewportXScale();
+	const float viewportScaleY = gstate.getViewportYScale();
+	const float viewportScaleZ = gstate.getViewportZScale();
 
-	// Simple, most common case.
-	int vertexStride = dec->VertexSize();
-	int offset = dec->posoff;
+	bool cullCCW = false;
 
 	// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
-	if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
-		float world[16];
-		float view[16];
-		float worldview[16];
-		float worldviewproj[16];
-		ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
-		ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
-		Matrix4ByMatrix4(worldview, world, view);
-		Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);   // TODO: Include adjustments to the proj matrix?
-
-		cullEnabled = gstate.isCullEnabled();
-
-		float viewportX = gstate.getViewportXCenter();
-		float viewportY = gstate.getViewportYCenter();
-		float viewportZ = gstate.getViewportZCenter();
-		float viewportScaleX = gstate.getViewportXScale();
-		float viewportScaleY = gstate.getViewportYScale();
-		float viewportScaleZ = gstate.getViewportZScale();
-		
-		bool allBehind = true;
-
-		float temp[3];
-		for (int i = 0; i < count; i++) {
-			switch (vertTypeID & GE_VTYPE_POS_MASK) {
-			case GE_VTYPE_POS_8BIT:
-				for (int i = 0; i < count; i++) {
-					const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
-					for (int j = 0; j < 3; j++) {
-						temp[j] = data[j] * (1.0f / 128.0f);  // TODO: Can we bake this factor in somewhere?
-					}
-					Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
-				}
-				break;
-			case GE_VTYPE_POS_16BIT:
-				for (int i = 0; i < count; i++) {
-					const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
-					for (int j = 0; j < 3; j++) {
-						temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
-					}
-					Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
-				}
-				break;
-			case GE_VTYPE_POS_FLOAT:
-				for (int i = 0; i < count; i++) {
-					const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
-					Vec3ByMatrix44(transformed + i * 4, data, worldviewproj);
-				}
-				break;
-			}
-		}
 
-		for (int i = 0; i < count; i++) {
-			float proj[4];
-			memcpy(proj, transformed + i * 4, 4 * sizeof(float));
+	int outCount = 0;
 
-			float w = proj[3];
+	for (int i = 0; i < count; i += 3) {
+		const float *verts[3] = {
+			transformed + indexBuffer[i] * 4,
+			transformed + indexBuffer[i + 1] * 4,
+			transformed + indexBuffer[i + 2] * 4,
+		};
 
-			bool inFront = w > 0.0f;
-			screenVerts[i].behind = !inFront;
-			if (inFront) {
-				allBehind = false;
-			}
+		// Check if any vertex is behind the 0 plane.
+		if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) {
+			// Ditch this triangle. Later we should clip here.
+			continue;
+		}
+
+		for (int c = 0; c < 3; c++) {
+			const float *src = verts[c];
+			float invW = 1.0f / src[3];
 
-			// Clip to the w=0 plane.
-			proj[0] /= w;
-			proj[1] /= w;
-			proj[2] /= w;
+			float x = src[0] * invW;
+			float y = src[1] * invW;
+			float z = src[2] * invW;
 
-			// Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport
-			// and offset params.
 			float screen[3];
-			screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
-			screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
-			screen[2] = (proj[2] * viewportScaleZ + viewportZ);
+			screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
+			screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
+			screen[2] = (z * viewportScaleZ + viewportZ);
 			if (screen[2] < 0.0f) {
 				screen[2] = 0.0f;
 			}
 			if (screen[2] >= 65535.0f) {
 				screen[2] = 65535.0f;
 			}
-			screenVerts[i].x = screen[0] * (1.0f / 16.0f);  // We ditch the subpixel precision here.
-			screenVerts[i].y = screen[1] * (1.0f / 16.0f);
-			screenVerts[i].z = screen[2];
+			screenVerts[outCount].x = screen[0] * (1.0f / 16.0f);  // We ditch the subpixel precision here.
+			screenVerts[outCount].y = screen[1] * (1.0f / 16.0f);
+			screenVerts[outCount].z = screen[2];
+
+			outCount++;
 		}
-		if (allBehind) {
-			// Cull the whole draw.
+	}
+	return outCount;
+}
+
+// Rasterizes screen-space vertices.
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) {
+	// Prim should now be either TRIANGLES or RECTs.
+	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
+
+	GEComparison compareMode = gstate.getDepthTestFunction();
+	if (gstate.isModeClear()) {
+		if (!gstate.isClearModeDepthMask()) {
 			return;
 		}
+		compareMode = GE_COMP_ALWAYS;
 	} else {
-		float factor = 1.0f;
-		switch (vertTypeID & GE_VTYPE_POS_MASK) {
-		case GE_VTYPE_POS_8BIT:
-			for (int i = 0; i < count; i++) {
-				const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
-				for (int j = 0; j < 3; j++) {
-					transformed[i * 4 + j] = data[j] * factor;
-				}
-				transformed[i * 4 + 3] = 1.0f;
-			}
-			break;
-		case GE_VTYPE_POS_16BIT:
-			for (int i = 0; i < count; i++) {
-				const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
-				for (int j = 0; j < 3; j++) {
-					transformed[i * 4 + j] = data[j] * factor;
-				}
-				transformed[i * 4 + 3] = 1.0f;
-			}
-			break;
-		case GE_VTYPE_POS_FLOAT:
-			for (int i = 0; i < count; i++) {
-				memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
-				transformed[i * 4 + 3] = 1.0f;
-			}
-			break;
-		}
-
-		for (int i = 0; i < count; i++) {
-			screenVerts[i].x = (int)transformed[i * 4 + 0];
-			screenVerts[i].y = (int)transformed[i * 4 + 1];
-			screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f);
-		}
+		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
+			return;
 	}
 
-	// Then we need to stitch primitives from strips, etc etc...
-	// For now we'll just do it tri by tri. Later let's be more efficient.
-	
 	switch (prim) {
 	case GE_PRIM_RECTANGLES:
 		for (int i = 0; i < count / 2; i++) {
@@ -403,30 +340,10 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
 		break;
 	case GE_PRIM_TRIANGLES:
 		for (int i = 0; i < count / 3; i++) {
-			if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) {
-				continue;
-			}
 			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
 		}
 		break;
-	case GE_PRIM_TRIANGLE_STRIP:
-	{
-		int wind = 2;
-		for (int i = 0; i < count - 2; i++) {
-			int i0 = i;
-			int i1 = i + wind;
-			wind ^= 3;
-			int i2 = i + wind;
-			if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) {
-				continue;
-			}
-			ScreenVert v[3];
-			v[0] = screenVerts[i0];
-			v[1] = screenVerts[i1];
-			v[2] = screenVerts[i2];
-			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode);
-		}
-		break;
-	}
+	default:
+		_dbg_assert_(false);
 	}
 }
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
index 01fa60e257d1..d639103aaaa2 100644
--- a/GPU/Common/DepthRaster.h
+++ b/GPU/Common/DepthRaster.h
@@ -3,10 +3,21 @@
 #include "Common/CommonTypes.h"
 #include "GPU/ge_constants.h"
 
+struct DepthScreenVertex {
+	int x;
+	int y;
+	uint16_t z;
+};
+
 // Specialized, very limited depth-only rasterizer.
 // Meant to run in parallel with hardware rendering, in games that read back the depth buffer
 // for effects like lens flare.
 // So, we can be quite inaccurate without any issues, and skip a lot of functionality.
 
 class VertexDecoder;
-void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise);
+struct TransformedVertex;
+
+int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count);
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID);
+void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count);
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 818021a79b3a..df80511b4691 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -23,9 +23,11 @@
 #include "Common/LogReporting.h"
 #include "Common/Math/SIMDHeaders.h"
 #include "Common/Math/lin/matrix4x4.h"
+#include "Core/System.h"
 #include "Core/Config.h"
 #include "GPU/Common/DrawEngineCommon.h"
 #include "GPU/Common/SplineCommon.h"
+#include "GPU/Common/DepthRaster.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/Common/SoftwareTransformCommon.h"
 #include "GPU/ge_constants.h"
@@ -34,7 +36,9 @@
 #define QUAD_INDICES_MAX 65536
 
 enum {
-	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex)
+	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex),
+	DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4,
+	DEPTH_SCREENVERTS_SIZE = VERTEX_BUFFER_MAX * sizeof(DepthScreenVertex),
 };
 
 DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
@@ -46,6 +50,12 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
 	decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	indexGen.Setup(decIndex_);
+
+	useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth;
+	if (useDepthRaster_) {
+		depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+		depthScreenVerts_ = (DepthScreenVertex *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+	}
 }
 
 DrawEngineCommon::~DrawEngineCommon() {
@@ -53,6 +63,10 @@ DrawEngineCommon::~DrawEngineCommon() {
 	FreeMemoryPages(decIndex_, DECODED_INDEX_BUFFER_SIZE);
 	FreeMemoryPages(transformed_, TRANSFORMED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(transformedExpanded_, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
+	if (depthTransformed_) {
+		FreeMemoryPages(depthTransformed_, DEPTH_TRANSFORMED_SIZE);
+		FreeMemoryPages(depthScreenVerts_, DEPTH_SCREENVERTS_SIZE);
+	}
 	delete decJitCache_;
 	decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) {
 		delete decoder;
@@ -886,3 +900,61 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 		return false;
 	}
 }
+
+void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID) {
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	if (vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {
+		return;
+	}
+
+	float world[16];
+	float view[16];
+	float worldview[16];
+	float worldviewproj[16];
+	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+	Matrix4ByMatrix4(worldview, world, view);
+	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);   // TODO: Include adjustments to the proj matrix?
+
+	// Decode.
+	int numDec = 0;
+	for (int i = 0; i < numDrawVerts_; i++) {
+		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, drawVerts_[i].verts, drawVerts_[i].vertexCount, dec, vertTypeID);
+		numDec += drawVerts_[i].vertexCount;
+	}
+
+	// Clip and triangulate using the index buffer.
+	int outVertCount = DepthRasterClipIndexedTriangles(depthScreenVerts_, depthTransformed_, decIndex_, numDec);
+
+	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+		GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
+		depthScreenVerts_, outVertCount);
+}
+
+void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	DepthRasterConvertTransformed(depthScreenVerts_, inVerts, count);
+	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+		prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
+		depthScreenVerts_, count);
+}
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 595ab929aab4..64e8478cd06e 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -27,6 +27,7 @@
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/Common/IndexGenerator.h"
 #include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/DepthRaster.h"
 
 class VertexDecoder;
 
@@ -174,6 +175,9 @@ class DrawEngineCommon {
 
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
+	void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID);
+	void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
+
 	static inline int IndexSize(u32 vtype) {
 		const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
 		if (indexType == GE_VTYPE_IDX_16BIT) {
@@ -228,6 +232,11 @@ class DrawEngineCommon {
 	}
 
 	inline bool CollectedPureDraw() const {
+		// TODO: Do something faster.
+		if (useDepthRaster_) {
+			return false;
+		}
+
 		switch (seenPrims_) {
 		case 1 << GE_PRIM_TRIANGLE_STRIP:
 			return !anyCCWOrIndexed_ && numDrawInds_ == 1;
@@ -343,4 +352,10 @@ class DrawEngineCommon {
 	bool offsetOutsideEdge_;
 
 	GPUCommon *gpuCommon_;
+
+	// Software depth raster
+	bool useDepthRaster_ = false;
+
+	float *depthTransformed_ = nullptr;
+	DepthScreenVertex *depthScreenVerts_ = nullptr;
 };
diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp
index f5383cf6be59..9b5389c8750a 100644
--- a/GPU/GPUCommonHW.cpp
+++ b/GPU/GPUCommonHW.cpp
@@ -13,7 +13,6 @@
 #include "GPU/Common/DrawEngineCommon.h"
 #include "GPU/Common/TextureCacheCommon.h"
 #include "GPU/Common/FramebufferManagerCommon.h"
-#include "GPU/Common/DepthRaster.h"
 
 struct CommonCommandTableEntry {
 	uint8_t cmd;
@@ -1040,10 +1039,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 	if (passCulling) {
 		if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) {
 			canExtend = false;
-		} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
-			DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
-				gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
-				verts, inds, prim, count, decoder, vertTypeID, false);
 		}
 		onePassed = true;
 	} else {
@@ -1122,10 +1117,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) {
 			if (passCulling) {
 				if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) {
 					canExtend = false;
-				} else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) {
-					DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
-						gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(),
-						verts, inds, newPrim, count, decoder, vertTypeID, clockwise);
 				}
 				// As soon as one passes, assume we don't need to check the rest of this batch.
 				onePassed = true;
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index f1279b855a69..e1775722ee3e 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -370,6 +370,9 @@ void DrawEngineVulkan::Flush() {
 		} else {
 			renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount);
 		}
+		if (useDepthRaster_) {
+			DepthRasterTransform(prim, dec_, dec_->VertexType());
+		}
 	} else {
 		PROFILE_THIS_SCOPE("soft");
 		VertexDecoder *swDec = dec_;
@@ -438,6 +441,12 @@ void DrawEngineVulkan::Flush() {
 		swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale);
 
 		swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result);
+
+		// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
+		if (useDepthRaster_) {
+			DepthRasterPretransformed(prim, transformed_, numDecodedVerts_);
+		}
+
 		// Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values.
 		// Games sometimes expect exact matches (see #12626, for example) for equal comparisons.
 		if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f)

From bdf4b692073479b52631472e118c5aa0e500b08b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 11:05:28 +0100
Subject: [PATCH 11/28] Warning fixes, minor cleanup

---
 GPU/Common/DepthRaster.cpp      | 18 ++++++++++--------
 GPU/Common/DepthRaster.h        |  2 +-
 GPU/Common/DrawEngineCommon.cpp |  4 +++-
 GPU/Common/IndexGenerator.h     |  2 +-
 unittest/UnitTest.cpp           |  2 +-
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index e99160f89bc3..4e5c99a12790 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -247,14 +247,6 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 	}
 }
 
-void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) {
-	for (int i = 0; i < count; i++) {
-		screenVerts[i].x = (int)transformed[i].pos[0];
-		screenVerts[i].y = (int)transformed[i].pos[1];
-		screenVerts[i].z = (u16)transformed[i].pos[2];
-	}
-}
-
 int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) {
 	bool cullEnabled = gstate.isCullEnabled();
 
@@ -312,6 +304,16 @@ int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float
 	return outCount;
 }
 
+void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
+	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
+
+	for (int i = 0; i < count; i++) {
+		screenVerts[i].x = (int)transformed[i].pos[0];
+		screenVerts[i].y = (int)transformed[i].pos[1];
+		screenVerts[i].z = (u16)transformed[i].pos[2];
+	}
+}
+
 // Rasterizes screen-space vertices.
 void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) {
 	// Prim should now be either TRIANGLES or RECTs.
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
index d639103aaaa2..50ef309d936c 100644
--- a/GPU/Common/DepthRaster.h
+++ b/GPU/Common/DepthRaster.h
@@ -19,5 +19,5 @@ struct TransformedVertex;
 
 int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count);
 void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID);
-void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count);
+void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
 void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index df80511b4691..60024326fbe5 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -953,7 +953,9 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
 		break;
 	}
 
-	DepthRasterConvertTransformed(depthScreenVerts_, inVerts, count);
+	_dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN);
+
+	DepthRasterConvertTransformed(depthScreenVerts_, prim, inVerts, count);
 	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
 		prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
 		depthScreenVerts_, count);
diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h
index 723f4caabd7c..48df11e97291 100644
--- a/GPU/Common/IndexGenerator.h
+++ b/GPU/Common/IndexGenerator.h
@@ -54,7 +54,7 @@ class IndexGenerator {
 	void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise);
 
 	// This is really the number of generated indices, or 3x the number of triangles.
-	int VertexCount() const { return inds_ - indsBase_; }
+	int VertexCount() const { return (int)(inds_ - indsBase_); }
 
 private:
 	// Points (why index these? code simplicity)
diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp
index 45c664a29af8..fe3fba0af4b9 100644
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@@ -1112,7 +1112,7 @@ bool TestSIMD() {
 	EXPECT_EQ_INT(testdata[1], 0);
 
 	__m128i a = _mm_set_epi16(0, 0x4444, 0, 0x3333, 0, 0x2222, 0, 0x1111);
-	__m128i b = _mm_set_epi16(0, 0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555);
+	__m128i b = _mm_set_epi16(0, (int16_t)0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555);
 	__m128i c = _mm_packu2_epi32_SSE2(a, b);
 	__m128i d = _mm_packus_epi32(a, b);
 

From de45960420c3455d68818dec4948ef220855a5fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 12:38:03 +0100
Subject: [PATCH 12/28] Reformat CrossSIMD.h for easier editing. Add some new
 methods.

---
 Common/Math/CrossSIMD.h | 102 +++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 54 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 11ed21702486..056cc5f0c7b5 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -11,53 +11,40 @@
 struct Vec4S32 {
 	__m128i v;
 
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ _mm_add_epi32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ _mm_sub_epi32(v, other.v) };
-	}
+	static Vec4S32 Load(int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; }
+	static Vec4S32 LoadAligned(int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; }
+	void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); }
+	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
+
+	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
+	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) };   // (ab3,ab2,ab1,ab0)
-	}
+	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
 };
 
 struct Vec4F32 {
 	__m128 v;
 
-	static Vec4F32 FromVec4S32(Vec4S32 other) {
-		return Vec4F32{ _mm_cvtepi32_ps(other.v) };
-	}
+	static Vec4F32 Load(float *src) { return Vec4F32{ _mm_loadu_ps(src) }; }
+	static Vec4F32 LoadAligned(float *src) { return Vec4F32{ _mm_load_ps(src) }; }
+	void Store(float *dst) { _mm_storeu_ps(dst, v); }
+	void StoreAligned (float *dst) { _mm_store_ps(dst, v); }
 
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ _mm_add_ps(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ _mm_sub_ps(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ _mm_mul_ps(v, other.v) };
-	}
+	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }
+
+	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
+	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
+	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
 };
 
 struct Vec4U16 {
 	__m128i v;  // we only use the lower 64 bits.
-	static Vec4U16 Load(void *mem) {
-		return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) };
-	}
-	void Store(void *mem) {
-		_mm_storel_epi64((__m128i *)mem, v);
-	}
-	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) {
-		return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
-	}
-	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) {
-		return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) };
-	}
-	Vec4U16 CompareLT(Vec4U16 other) {
-		return Vec4U16{ _mm_cmplt_epu16(v, other.v) };
-	}
+	static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
+	void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); }
+
+	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; }
+	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
 };
 
 #elif PPSSPP_ARCH(ARM_NEON)
@@ -65,33 +52,40 @@ struct Vec4U16 {
 struct Vec4S32 {
 	int32x4_t v;
 
-	Vec4S32 operator +(Vec4S32 other) const {
-		return Vec4S32{ vaddq_s32(v, other.v) };
-	}
-	Vec4S32 operator -(Vec4S32 other) const {
-		return Vec4S32{ vsubq_s32(v, other.v) };
-	}
-	Vec4S32 operator *(Vec4S32 other) const {
-		return Vec4S32{ vmulq_s32(v, other.v) };
-	}
+	static Vec4F32 Load(int *src) { return Vec4F32{ vld1q_s32(src) }; }
+	static Vec4F32 LoadAligned(int *src) { return Vec4F32{ vld1q_s32(src) }; }
+	void Store(int *dst) { vst1q_s32(dst, v); }
+	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
+
+	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
+	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
+	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
 };
 
 struct Vec4F32 {
 	float32x4_t v;
 
+	static Vec4F32 Load(float *src) { return Vec4F32{ vld1q_f32(src) }; }
+	static Vec4F32 LoadAligned(float *src) { return Vec4F32{ vld1q_f32(src) }; }
+	void Store(float *dst) { vst1q_f32(dst, v); }
+	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}
 
-	Vec4F32 operator +(Vec4F32 other) const {
-		return Vec4F32{ vaddq_f32(v, other.v) };
-	}
-	Vec4F32 operator -(Vec4F32 other) const {
-		return Vec4F32{ vsubq_f32(v, other.v) };
-	}
-	Vec4F32 operator *(Vec4F32 other) const {
-		return Vec4F32{ vmulq_f32(v, other.v) };
-	}
+	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
+	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
+	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
+};
+
+struct Vec4U16 {
+	uint16x4_t v;  // we only use the lower 64 bits.
+	static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
+	void Store(uint16_t *mem) { vst1_u16(mem, v); }
+	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmax_u16(a.v, b.v) }; }
+	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmin_u16(a.v, b.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
 };
 
 #else

From 03b9f9805568e1b400ac8b6f00ce1f0a31517b5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 13:19:20 +0100
Subject: [PATCH 13/28] Add more funcionality to CrossSIMD.h, like fast matrix
 mul and some conversion

---
 Common/Math/CrossSIMD.h | 137 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 127 insertions(+), 10 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 056cc5f0c7b5..b240678d17a4 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -8,11 +8,32 @@
 
 #if PPSSPP_ARCH(SSE2)
 
+struct Mat4F32 {
+	Mat4F32(const float *matrix) {
+		col0 = _mm_loadu_ps(matrix);
+		col1 = _mm_loadu_ps(matrix + 4);
+		col2 = _mm_loadu_ps(matrix + 8);
+		col3 = _mm_loadu_ps(matrix + 12);
+	}
+
+	void Transpose() {
+		_MM_TRANSPOSE4_PS(col0, col1, col2, col3);
+	}
+
+	__m128 col0;
+	__m128 col1;
+	__m128 col2;
+	__m128 col3;
+};
+
 struct Vec4S32 {
 	__m128i v;
 
-	static Vec4S32 Load(int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; }
-	static Vec4S32 LoadAligned(int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; }
+	static Vec4S32 Zero() { return Vec4S32{ _mm_setzero_si128() }; }
+	static Vec4S32 Splat(int lane) { return Vec4S32{ _mm_set1_epi32(lane) }; }
+
+	static Vec4S32 Load(const int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; }
+	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; }
 	void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); }
 	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
 
@@ -25,21 +46,56 @@ struct Vec4S32 {
 struct Vec4F32 {
 	__m128 v;
 
-	static Vec4F32 Load(float *src) { return Vec4F32{ _mm_loadu_ps(src) }; }
-	static Vec4F32 LoadAligned(float *src) { return Vec4F32{ _mm_load_ps(src) }; }
+	static Vec4F32 Zero() { return Vec4F32{ _mm_setzero_ps() }; }
+	static Vec4F32 Splat(float lane) { return Vec4F32{ _mm_set1_ps(lane) }; }
+
+	static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; }
+	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
 	void Store(float *dst) { _mm_storeu_ps(dst, v); }
 	void StoreAligned (float *dst) { _mm_store_ps(dst, v); }
 
+	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
+		__m128i value = _mm_loadl_epi64((const __m128i *)src);
+		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
+		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value, value), 16)) };
+	}
+
+	static Vec4F32 LoadConvertS8(const int8_t *src) {  // Note: will load 8 bytes
+		__m128i value = _mm_loadl_epi64((const __m128i *)src);
+		__m128i value16 = _mm_unpacklo_epi8(value, value);
+		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
+		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 16)) };
+	}
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }
 
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
+
+	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+
+	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
+		return Vec4F32{ _mm_add_ps(
+			_mm_add_ps(
+				_mm_mul_ps(m.col0, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))),
+				_mm_mul_ps(m.col1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)))
+			),
+			_mm_add_ps(
+				_mm_mul_ps(m.col2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))),
+				m.col3)
+			)
+		};
+	}
 };
 
 struct Vec4U16 {
 	__m128i v;  // we only use the lower 64 bits.
-	static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
+
+	static Vec4U16 Zero() { return Vec4U16{ _mm_setzero_si128() }; }
+	// static Vec4U16 AllOnes() { return Vec4U16{ _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()) }; }
+
+	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
 	void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); }
 
 	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; }
@@ -49,11 +105,37 @@ struct Vec4U16 {
 
 #elif PPSSPP_ARCH(ARM_NEON)
 
+struct Mat4F32 {
+	Mat4F32(const float *matrix) {
+		col0 = vld1q_f32(matrix);
+		col1 = vld1q_f32(matrix + 4);
+		col2 = vld1q_f32(matrix + 8);
+		col3 = vld1q_f32(matrix + 12);
+	}
+	void Transpose() {
+		float32x4_t temp0 = vzip1q_s32(col0, col2);
+		float32x4_t temp1 = vzip2q_s32(col0, col2);
+		float32x4_t temp2 = vzip1q_s32(col1, col3);
+		float32x4_t temp3 = vzip2q_s32(col1, col3);
+		col0 = vzip1q_s32(temp0, temp2);
+		col1 = vzip2q_s32(temp0, temp2);
+		col2 = vzip1q_s32(temp1, temp3);
+		col3 = vzip2q_s32(temp1, temp3);
+	}
+	float32x4_t col0;
+	float32x4_t col1;
+	float32x4_t col2;
+	float32x4_t col3;
+};
+
 struct Vec4S32 {
 	int32x4_t v;
 
-	static Vec4F32 Load(int *src) { return Vec4F32{ vld1q_s32(src) }; }
-	static Vec4F32 LoadAligned(int *src) { return Vec4F32{ vld1q_s32(src) }; }
+	static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; }
+	static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; }
+
+	static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
+	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
 	void Store(int *dst) { vst1q_s32(dst, v); }
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
 
@@ -65,11 +147,27 @@ struct Vec4S32 {
 struct Vec4F32 {
 	float32x4_t v;
 
-	static Vec4F32 Load(float *src) { return Vec4F32{ vld1q_f32(src) }; }
-	static Vec4F32 LoadAligned(float *src) { return Vec4F32{ vld1q_f32(src) }; }
+	static Vec4F32 Zero() { return Vec4F32{ vdupq_n_f32(0.0f) }; }
+	static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; }
+
+	static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
+	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
 	void Store(float *dst) { vst1q_f32(dst, v); }
 	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
 
+	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
+		int16x4_t value = vld1_s16(src);
+		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
+		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
+	}
+
+	static Vec4F32 LoadConvertS8(const int8_t *src) {  // Note: will load 8 bytes
+		int8x8_t value = vld1_s8(src);
+		int16x4_t value16 = vget_low_s16(vmovl_s8(value));
+		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
+		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
+	}
+
 	static Vec4F32 FromVec4S32(Vec4S32 other) {
 		return Vec4F32{ vcvtq_f32_s32(other.v) };
 	}
@@ -77,12 +175,31 @@ struct Vec4F32 {
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
+
+	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
+
+	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
+#if PPSSPP_ARCH(ARM64_NEON)
+		float32x4_t sum = vaddq_f32(
+			vaddq_f32(vmulq_laneq_f32(m.col0, v, 0), vmulq_laneq_f32(m.col1, v, 1)),
+			vaddq_f32(vmulq_laneq_f32(m.col2, v, 2), m.col3));
+#else
+		float32x4_t sum = vaddq_f32(
+			vaddq_f32(vmulq_lane_f32(m.col0, vget_low_f32(v), 0), vmulq_lane_f32(m.col1, vget_low_f32(v), 1)),
+			vaddq_f32(vmulq_lane_f32(m.col2, vget_high_f32(v), 0), m.col3));
+#endif
+		return Vec4F32{ sum };
+	}
 };
 
 struct Vec4U16 {
 	uint16x4_t v;  // we only use the lower 64 bits.
-	static Vec4U16 Load(uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
+
+	static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; }
+
+	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
 	void Store(uint16_t *mem) { vst1_u16(mem, v); }
+
 	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmax_u16(a.v, b.v) }; }
 	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmin_u16(a.v, b.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }

From 6a1010afb0e3b031c7c0c14eace5c7765c6b2900 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 13:24:41 +0100
Subject: [PATCH 14/28] Use CrossSIMD to optimize
 DecodeAndTransformForDepthRaster

Checked the output, the generated assembly is great!
---
 GPU/Common/DepthRaster.cpp | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 4e5c99a12790..23e506ff7be9 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -218,30 +218,25 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 	int vertexStride = dec->VertexSize();
 	int offset = dec->posoff;
 
-	float temp[3];
+	Mat4F32 mat(worldviewproj);
+
 	switch (vertTypeID & GE_VTYPE_POS_MASK) {
-	case GE_VTYPE_POS_8BIT:
+	case GE_VTYPE_POS_FLOAT:
 		for (int i = 0; i < count; i++) {
-			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
-			for (int j = 0; j < 3; j++) {
-				temp[j] = data[j] * (1.0f / 128.0f);  // TODO: Can we bake this factor in somewhere?
-			}
-			Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
+			const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
+			Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
 		}
 		break;
 	case GE_VTYPE_POS_16BIT:
 		for (int i = 0; i < count; i++) {
 			const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
-			for (int j = 0; j < 3; j++) {
-				temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
-			}
-			Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
+			Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
 		}
 		break;
-	case GE_VTYPE_POS_FLOAT:
+	case GE_VTYPE_POS_8BIT:
 		for (int i = 0; i < count; i++) {
-			const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
-			Vec3ByMatrix44(dest + i * 4, data, worldviewproj);
+			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+			Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
 		}
 		break;
 	}

From 0b009c10bec5fb9703bf986a3288efc67c23f178 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 15:42:52 +0100
Subject: [PATCH 15/28] CrossSIMD: Add reciprocal, clamp, swaplowerelements,
 etc

---
 Common/Math/CrossSIMD.h | 82 +++++++++++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 15 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index b240678d17a4..65fd02b3d3df 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -16,10 +16,6 @@ struct Mat4F32 {
 		col3 = _mm_loadu_ps(matrix + 12);
 	}
 
-	void Transpose() {
-		_MM_TRANSPOSE4_PS(col0, col1, col2, col3);
-	}
-
 	__m128 col0;
 	__m128 col1;
 	__m128 col2;
@@ -37,6 +33,13 @@ struct Vec4S32 {
 	void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); }
 	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
 
+	// Swaps the two lower elements. Useful for reversing triangles..
+	Vec4S32 SwapLowerElements() {
+		return Vec4S32{
+			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
+		};
+	}
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
@@ -64,7 +67,7 @@ struct Vec4F32 {
 		__m128i value = _mm_loadl_epi64((const __m128i *)src);
 		__m128i value16 = _mm_unpacklo_epi8(value, value);
 		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
-		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 16)) };
+		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
 	}
 
 	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }
@@ -72,8 +75,20 @@ struct Vec4F32 {
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
+	void operator +=(Vec4F32 other) { v = _mm_add_ps(v, other.v); }
+	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
+	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
+	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
+	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+	Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }
+
+	Vec4F32 Clamp(float lower, float higher) {
+		return Vec4F32{
+			_mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher))
+		};
+	}
 
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 		return Vec4F32{ _mm_add_ps(
@@ -87,8 +102,14 @@ struct Vec4F32 {
 			)
 		};
 	}
+
+	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
+	}
 };
 
+inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; }
+
 struct Vec4U16 {
 	__m128i v;  // we only use the lower 64 bits.
 
@@ -112,16 +133,6 @@ struct Mat4F32 {
 		col2 = vld1q_f32(matrix + 8);
 		col3 = vld1q_f32(matrix + 12);
 	}
-	void Transpose() {
-		float32x4_t temp0 = vzip1q_s32(col0, col2);
-		float32x4_t temp1 = vzip2q_s32(col0, col2);
-		float32x4_t temp2 = vzip1q_s32(col1, col3);
-		float32x4_t temp3 = vzip2q_s32(col1, col3);
-		col0 = vzip1q_s32(temp0, temp2);
-		col1 = vzip2q_s32(temp0, temp2);
-		col2 = vzip1q_s32(temp1, temp3);
-		col3 = vzip2q_s32(temp1, temp3);
-	}
 	float32x4_t col0;
 	float32x4_t col1;
 	float32x4_t col2;
@@ -139,6 +150,14 @@ struct Vec4S32 {
 	void Store(int *dst) { vst1q_s32(dst, v); }
 	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
 
+	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
+	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
+	Vec4S32 SwapLowerElements() {
+		float32x2_t upper = vget_high_s32(v);
+		float32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
+		return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
+	};
+
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
@@ -175,8 +194,39 @@ struct Vec4F32 {
 	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
 	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
 	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
+	void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); }
+	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
+	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
+	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
+	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
 
 	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
+	Vec4F32 Recip() {
+		float32x4_t recip = vrecpeq_f32(v);
+		// Use a couple Newton-Raphson steps to refine the estimate.
+		// May be able to get away with only one refinement, not sure!
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		return Vec4F32{ recip };
+	}
+
+	Vec4F32 Clamp(float lower, float higher) {
+		return Vec4F32{
+			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
+		};
+	}
+
+	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
+	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+		float32x4_t temp0 = vzip1q_s32(col0.v, col2.v);
+		float32x4_t temp1 = vzip2q_s32(col0.v, col2.v);
+		float32x4_t temp2 = vzip1q_s32(col1.v, col3.v);
+		float32x4_t temp3 = vzip2q_s32(col1.v, col3.v);
+		col0.v = vzip1q_s32(temp0, temp2);
+		col1.v = vzip2q_s32(temp0, temp2);
+		col2.v = vzip1q_s32(temp1, temp3);
+		col3.v = vzip2q_s32(temp1, temp3);
+	}
 
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
 #if PPSSPP_ARCH(ARM64_NEON)
@@ -192,6 +242,8 @@ struct Vec4F32 {
 	}
 };
 
+inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }
+
 struct Vec4U16 {
 	uint16x4_t v;  // we only use the lower 64 bits.
 

From 67078d439b3d66e4c6b4b9ed7986791d8068c1e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 15:28:56 +0100
Subject: [PATCH 16/28] Depth raster: Switch to a SoA data layout for the
 screen space verts

---
 GPU/Common/DepthRaster.cpp      | 51 ++++++++++++++++-----------------
 GPU/Common/DepthRaster.h        |  8 +++---
 GPU/Common/DrawEngineCommon.cpp | 22 ++++++++++----
 GPU/Common/DrawEngineCommon.h   |  2 +-
 4 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 23e506ff7be9..dbf3a2f06824 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -81,7 +81,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 // Adapted from Intel's depth rasterizer example.
 // Started with the scalar version, will SIMD-ify later.
 // x1/y1 etc are the scissor rect.
-void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) {
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, GEComparison compareMode) {
 	int tileStartX = x1;
 	int tileEndX = x2;
 
@@ -94,15 +94,15 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 
 	// Convert to whole pixels for now. Later subpixel precision.
 	DepthScreenVertex verts[3];
-	verts[0].x = vertsSub[0].x;
-	verts[0].y = vertsSub[0].y;
-	verts[0].z = vertsSub[0].z;
-	verts[1].x = vertsSub[2].x;
-	verts[1].y = vertsSub[2].y;
-	verts[1].z = vertsSub[2].z;
-	verts[2].x = vertsSub[1].x;
-	verts[2].y = vertsSub[1].y;
-	verts[2].z = vertsSub[1].z;
+	verts[0].x = tx[0];
+	verts[0].y = ty[0];
+	verts[0].z = tz[0];
+	verts[1].x = tx[2];
+	verts[1].y = ty[2];
+	verts[1].z = tz[2];
+	verts[2].x = tx[1];
+	verts[2].y = ty[1];
+	verts[2].z = tz[1];
 
 	// use fixed-point only for X and Y.  Avoid work for Z and W.
 	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX);
@@ -242,7 +242,7 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 	}
 }
 
-int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) {
+int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
 	bool cullEnabled = gstate.isCullEnabled();
 
 	const float viewportX = gstate.getViewportXCenter();
@@ -289,28 +289,28 @@ int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float
 			if (screen[2] >= 65535.0f) {
 				screen[2] = 65535.0f;
 			}
-			screenVerts[outCount].x = screen[0] * (1.0f / 16.0f);  // We ditch the subpixel precision here.
-			screenVerts[outCount].y = screen[1] * (1.0f / 16.0f);
-			screenVerts[outCount].z = screen[2];
-
+			tx[outCount] = screen[0] * (1.0f / 16.0f);  // We ditch the subpixel precision here.
+			ty[outCount] = screen[1] * (1.0f / 16.0f);
+			tz[outCount] = screen[2];
 			outCount++;
 		}
 	}
 	return outCount;
 }
 
-void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
 	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
 
+	// TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
 	for (int i = 0; i < count; i++) {
-		screenVerts[i].x = (int)transformed[i].pos[0];
-		screenVerts[i].y = (int)transformed[i].pos[1];
-		screenVerts[i].z = (u16)transformed[i].pos[2];
+		tx[i] = (int)transformed[i].pos[0];
+		ty[i] = (int)transformed[i].pos[1];
+		tz[i] = (u16)transformed[i].pos[2];
 	}
 }
 
 // Rasterizes screen-space vertices.
-void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) {
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count) {
 	// Prim should now be either TRIANGLES or RECTs.
 	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
 
@@ -327,17 +327,16 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 
 	switch (prim) {
 	case GE_PRIM_RECTANGLES:
-		for (int i = 0; i < count / 2; i++) {
-			uint16_t z = screenVerts[i + 1].z;  // depth from second vertex
+		for (int i = 0; i < count; i += 2) {
+			uint16_t z = tz[i + 1];  // depth from second vertex
 			// TODO: Should clip coordinates to the scissor rectangle.
 			// We remove the subpixel information here.
-			DepthRasterRect(depth, depthStride, screenVerts[i].x, screenVerts[i].y, screenVerts[i + 1].x, screenVerts[i + 1].y,
-				z, compareMode);
+			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, compareMode);
 		}
 		break;
 	case GE_PRIM_TRIANGLES:
-		for (int i = 0; i < count / 3; i++) {
-			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
+		for (int i = 0; i < count; i += 3) {
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], compareMode);
 		}
 		break;
 	default:
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
index 50ef309d936c..50e7a2577ddb 100644
--- a/GPU/Common/DepthRaster.h
+++ b/GPU/Common/DepthRaster.h
@@ -6,7 +6,7 @@
 struct DepthScreenVertex {
 	int x;
 	int y;
-	uint16_t z;
+	int z;
 };
 
 // Specialized, very limited depth-only rasterizer.
@@ -17,7 +17,7 @@ struct DepthScreenVertex {
 class VertexDecoder;
 struct TransformedVertex;
 
-int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count);
+int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
 void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID);
-void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
-void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count);
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 60024326fbe5..8f857f20acb9 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -38,7 +38,9 @@
 enum {
 	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex),
 	DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4,
-	DEPTH_SCREENVERTS_SIZE = VERTEX_BUFFER_MAX * sizeof(DepthScreenVertex),
+	DEPTH_SCREENVERTS_COMPONENT_COUNT = VERTEX_BUFFER_MAX,
+	DEPTH_SCREENVERTS_COMPONENT_SIZE = DEPTH_SCREENVERTS_COMPONENT_COUNT * sizeof(int) + 384,
+	DEPTH_SCREENVERTS_SIZE = DEPTH_SCREENVERTS_COMPONENT_SIZE * 3,
 };
 
 DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
@@ -54,7 +56,7 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
 	useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth;
 	if (useDepthRaster_) {
 		depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
-		depthScreenVerts_ = (DepthScreenVertex *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+		depthScreenVerts_ = (int *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	}
 }
 
@@ -933,12 +935,16 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 		numDec += drawVerts_[i].vertexCount;
 	}
 
+	int *tx = depthScreenVerts_;
+	int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
+
 	// Clip and triangulate using the index buffer.
-	int outVertCount = DepthRasterClipIndexedTriangles(depthScreenVerts_, depthTransformed_, decIndex_, numDec);
+	int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, numDec);
 
 	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
 		GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
-		depthScreenVerts_, outVertCount);
+		tx, ty, tz, outVertCount);
 }
 
 void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
@@ -955,8 +961,12 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
 
 	_dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN);
 
-	DepthRasterConvertTransformed(depthScreenVerts_, prim, inVerts, count);
+	int *tx = depthScreenVerts_;
+	int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
+
+	DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count);
 	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
 		prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
-		depthScreenVerts_, count);
+		tx, ty, tz, count);
 }
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 64e8478cd06e..2df31f2fba96 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -357,5 +357,5 @@ class DrawEngineCommon {
 	bool useDepthRaster_ = false;
 
 	float *depthTransformed_ = nullptr;
-	DepthScreenVertex *depthScreenVerts_ = nullptr;
+	int *depthScreenVerts_ = nullptr;
 };

From 820e7369b9b07f975fbfabca2ebc175b45d3bbb7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 16:26:51 +0100
Subject: [PATCH 17/28] Speed up DepthRasterClipIndexedTriangles with CrossSIMD

---
 GPU/Common/DepthRaster.cpp      | 66 +++++++++++++++++----------------
 GPU/Common/DrawEngineCommon.cpp | 16 ++++++++
 2 files changed, 50 insertions(+), 32 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index dbf3a2f06824..b0bbb44d2761 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -245,19 +245,21 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
 	bool cullEnabled = gstate.isCullEnabled();
 
-	const float viewportX = gstate.getViewportXCenter();
-	const float viewportY = gstate.getViewportYCenter();
-	const float viewportZ = gstate.getViewportZCenter();
-	const float viewportScaleX = gstate.getViewportXScale();
-	const float viewportScaleY = gstate.getViewportYScale();
-	const float viewportScaleZ = gstate.getViewportZScale();
+	// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
+	// However, hard to find a common abstraction.
+	const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
+	const Vec4F32 viewportY = Vec4F32::Splat(gstate.getViewportYCenter());
+	const Vec4F32 viewportZ = Vec4F32::Splat(gstate.getViewportZCenter());
+	const Vec4F32 viewportScaleX = Vec4F32::Splat(gstate.getViewportXScale());
+	const Vec4F32 viewportScaleY = Vec4F32::Splat(gstate.getViewportYScale());
+	const Vec4F32 viewportScaleZ = Vec4F32::Splat(gstate.getViewportZScale());
+
+	const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX());  // We remove the 16 scale here
+	const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
 
 	bool cullCCW = false;
 
-	// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
-
 	int outCount = 0;
-
 	for (int i = 0; i < count; i += 3) {
 		const float *verts[3] = {
 			transformed + indexBuffer[i] * 4,
@@ -271,29 +273,29 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 			continue;
 		}
 
-		for (int c = 0; c < 3; c++) {
-			const float *src = verts[c];
-			float invW = 1.0f / src[3];
-
-			float x = src[0] * invW;
-			float y = src[1] * invW;
-			float z = src[2] * invW;
-
-			float screen[3];
-			screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
-			screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
-			screen[2] = (z * viewportScaleZ + viewportZ);
-			if (screen[2] < 0.0f) {
-				screen[2] = 0.0f;
-			}
-			if (screen[2] >= 65535.0f) {
-				screen[2] = 65535.0f;
-			}
-			tx[outCount] = screen[0] * (1.0f / 16.0f);  // We ditch the subpixel precision here.
-			ty[outCount] = screen[1] * (1.0f / 16.0f);
-			tz[outCount] = screen[2];
-			outCount++;
-		}
+		// These names are wrong .. until we transpose.
+		Vec4F32 x = Vec4F32::Load(verts[0]);
+		Vec4F32 y = Vec4F32::Load(verts[1]);
+		Vec4F32 z = Vec4F32::Load(verts[2]);
+		Vec4F32 w = Vec4F32::Zero();
+		Vec4F32::Transpose(x, y, z, w);
+		// Now the names are accurate! Since we only have three vertices, the fourth member of each vector is zero
+		// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
+		Vec4F32 recipW = w.Recip();
+
+		x *= recipW;
+		y *= recipW;
+		z *= recipW;
+
+		Vec4F32 screen[3];
+		screen[0] = (x * viewportScaleX + viewportX) - offsetX;
+		screen[1] = (y * viewportScaleY + viewportY) - offsetY;
+		screen[2] = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f);
+
+		VecS32FromF32(screen[0]).Store(tx + outCount);
+		VecS32FromF32(screen[1]).Store(ty + outCount);
+		VecS32FromF32(screen[2]).Store(tz + outCount);
+		outCount += 3;
 	}
 	return outCount;
 }
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 8f857f20acb9..70da71a8fa09 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -941,6 +941,14 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 
 	// Clip and triangulate using the index buffer.
 	int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, numDec);
+	if (outVertCount & 15) {
+		// Zero padding
+		for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
+			tx[i] = 0;
+			ty[i] = 0;
+			tz[i] = 0;
+		}
+	}
 
 	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
 		GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
@@ -966,6 +974,14 @@ void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const Tra
 	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
 
 	DepthRasterConvertTransformed(tx, ty, tz, prim, inVerts, count);
+	if (count & 15) {
+		// Zero padding
+		for (int i = count; i < ((count + 16) & ~15); i++) {
+			tx[i] = 0;
+			ty[i] = 0;
+			tz[i] = 0;
+		}
+	}
 	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
 		prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
 		tx, ty, tz, count);

From 65692d036e89b5f85f1c9fd5444849b9b029360e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 01:18:55 +0100
Subject: [PATCH 18/28] CrossSIMD: possible buildfix?

---
 Common/Math/CrossSIMD.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 65fd02b3d3df..945ea166b15e 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -108,7 +108,9 @@ struct Vec4F32 {
 	}
 };
 
-inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; }
+inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
+inline Vec4S32 VecS32FromF32Round(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; }
+inline Vec4F32 VecF32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }
 
 struct Vec4U16 {
 	__m128i v;  // we only use the lower 64 bits.
@@ -218,14 +220,14 @@ struct Vec4F32 {
 
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
-		float32x4_t temp0 = vzip1q_s32(col0.v, col2.v);
-		float32x4_t temp1 = vzip2q_s32(col0.v, col2.v);
-		float32x4_t temp2 = vzip1q_s32(col1.v, col3.v);
-		float32x4_t temp3 = vzip2q_s32(col1.v, col3.v);
-		col0.v = vzip1q_s32(temp0, temp2);
-		col1.v = vzip2q_s32(temp0, temp2);
-		col2.v = vzip1q_s32(temp1, temp3);
-		col3.v = vzip2q_s32(temp1, temp3);
+		float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
+		float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
+		float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
+		float32x4_t temp3 = vzip2q_f32(col1.v, col3.v);
+		col0.v = vzip1q_f32(temp0, temp2);
+		col1.v = vzip2q_f32(temp0, temp2);
+		col2.v = vzip1q_f32(temp1, temp3);
+		col3.v = vzip2q_f32(temp1, temp3);
 	}
 
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {

From a344d0225f2c2cc7337c0d841e4e48bac2b95969 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 20:00:06 +0100
Subject: [PATCH 19/28] DepthRaster: Fix bug where we used the wrong vertex
 count.

---
 GPU/Common/DepthRaster.cpp      | 13 +++++++++----
 GPU/Common/DepthRaster.h        |  2 +-
 GPU/Common/DrawEngineCommon.cpp | 20 ++++++++++++++++----
 GPU/Common/DrawEngineCommon.h   |  2 +-
 GPU/Vulkan/DrawEngineVulkan.cpp |  2 +-
 5 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index b0bbb44d2761..0810f1778cde 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -161,6 +161,8 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 
 	// END triangle setup.
 
+	// Here we should draw four triangles in a sequence.
+
 	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
 	for (int r = startY; r < endY; r++,
 		row++,
@@ -211,7 +213,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	} // for each row
 }
 
-void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) {
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
 	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
 	_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
 
@@ -220,22 +222,25 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 
 	Mat4F32 mat(worldviewproj);
 
+	const u8 *startPtr = (const u8 *)vertexData + indexLowerBound * vertexStride;
+	int count = indexUpperBound - indexLowerBound + 1;
+
 	switch (vertTypeID & GE_VTYPE_POS_MASK) {
 	case GE_VTYPE_POS_FLOAT:
 		for (int i = 0; i < count; i++) {
-			const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
+			const float *data = (const float *)(startPtr + i * vertexStride + offset);
 			Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
 		}
 		break;
 	case GE_VTYPE_POS_16BIT:
 		for (int i = 0; i < count; i++) {
-			const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
+			const s16 *data = ((const s16 *)((const s8 *)startPtr + i * vertexStride + offset));
 			Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
 		}
 		break;
 	case GE_VTYPE_POS_8BIT:
 		for (int i = 0; i < count; i++) {
-			const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
+			const s8 *data = (const s8 *)startPtr + i * vertexStride + offset;
 			Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
 		}
 		break;
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
index 50e7a2577ddb..e92c1a1348ed 100644
--- a/GPU/Common/DepthRaster.h
+++ b/GPU/Common/DepthRaster.h
@@ -18,6 +18,6 @@ class VertexDecoder;
 struct TransformedVertex;
 
 int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
-void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID);
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
 void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
 void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 70da71a8fa09..bbe36fb479e7 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -903,7 +903,7 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 	}
 }
 
-void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID) {
+void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
 	switch (prim) {
 	case GE_PRIM_INVALID:
 	case GE_PRIM_KEEP_PREVIOUS:
@@ -931,8 +931,20 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 	// Decode.
 	int numDec = 0;
 	for (int i = 0; i < numDrawVerts_; i++) {
-		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, drawVerts_[i].verts, drawVerts_[i].vertexCount, dec, vertTypeID);
-		numDec += drawVerts_[i].vertexCount;
+		DeferredVerts &dv = drawVerts_[i];
+
+		int indexLowerBound = dv.indexLowerBound;
+		drawVertexOffsets_[i] = numDec - indexLowerBound;
+
+		int indexUpperBound = dv.indexUpperBound;
+		if (indexUpperBound + 1 - indexLowerBound + numDec >= VERTEX_BUFFER_MAX) {
+			// Hit our limit! Stop decoding in this draw.
+			break;
+		}
+
+		// Decode the verts (and at the same time apply morphing/skinning). Simple.
+		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
+		numDec += indexUpperBound - indexLowerBound + 1;
 	}
 
 	int *tx = depthScreenVerts_;
@@ -940,7 +952,7 @@ void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder
 	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
 
 	// Clip and triangulate using the index buffer.
-	int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, numDec);
+	int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
 	if (outVertCount & 15) {
 		// Zero padding
 		for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 2df31f2fba96..053c4c31f55b 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -175,7 +175,7 @@ class DrawEngineCommon {
 
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
-	void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID);
+	void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount);
 	void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
 
 	static inline int IndexSize(u32 vtype) {
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index e1775722ee3e..ebe3d022df2a 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -371,7 +371,7 @@ void DrawEngineVulkan::Flush() {
 			renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount);
 		}
 		if (useDepthRaster_) {
-			DepthRasterTransform(prim, dec_, dec_->VertexType());
+			DepthRasterTransform(prim, dec_, dec_->VertexType(), vertexCount);
 		}
 	} else {
 		PROFILE_THIS_SCOPE("soft");

From f886578c0e84cd28e482d3f3da9d4e7e80d543c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 20:08:33 +0100
Subject: [PATCH 20/28] DepthRaster: Fix backface culling

---
 GPU/Common/DepthRaster.cpp | 45 +++++++++++++++++++++++++-------------
 GPU/GPUState.h             |  2 +-
 GPU/ge_constants.h         |  5 +++++
 3 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 0810f1778cde..9532e7b3cad7 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -97,12 +97,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	verts[0].x = tx[0];
 	verts[0].y = ty[0];
 	verts[0].z = tz[0];
-	verts[1].x = tx[2];
-	verts[1].y = ty[2];
-	verts[1].z = tz[2];
-	verts[2].x = tx[1];
-	verts[2].y = ty[1];
-	verts[2].z = tz[1];
+	verts[1].x = tx[1];
+	verts[1].y = ty[1];
+	verts[1].z = tz[1];
+	verts[2].x = tx[2];
+	verts[2].y = ty[2];
+	verts[2].z = tz[2];
 
 	// use fixed-point only for X and Y.  Avoid work for Z and W.
 	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX);
@@ -249,6 +249,7 @@ void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const f
 
 int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
 	bool cullEnabled = gstate.isCullEnabled();
+	GECullMode cullMode = gstate.getCullMode();
 
 	// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
 	// However, hard to find a common abstraction.
@@ -265,11 +266,16 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 	bool cullCCW = false;
 
 	int outCount = 0;
+
+	int flipCull = 0;
+	if (cullEnabled && cullMode == GE_CULL_CW) {
+		flipCull = 3;
+	}
 	for (int i = 0; i < count; i += 3) {
 		const float *verts[3] = {
 			transformed + indexBuffer[i] * 4,
-			transformed + indexBuffer[i + 1] * 4,
-			transformed + indexBuffer[i + 2] * 4,
+			transformed + indexBuffer[i + (1 ^ flipCull)] * 4,
+			transformed + indexBuffer[i + (2 ^ flipCull)] * 4,
 		};
 
 		// Check if any vertex is behind the 0 plane.
@@ -292,15 +298,24 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 		y *= recipW;
 		z *= recipW;
 
-		Vec4F32 screen[3];
-		screen[0] = (x * viewportScaleX + viewportX) - offsetX;
-		screen[1] = (y * viewportScaleY + viewportY) - offsetY;
-		screen[2] = (z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f);
+		Vec4S32 screen[3];
+		screen[0] = VecS32FromF32((x * viewportScaleX + viewportX) - offsetX);
+		screen[1] = VecS32FromF32((y * viewportScaleY + viewportY) - offsetY);
+		screen[2] = VecS32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
 
-		VecS32FromF32(screen[0]).Store(tx + outCount);
-		VecS32FromF32(screen[1]).Store(ty + outCount);
-		VecS32FromF32(screen[2]).Store(tz + outCount);
+		screen[0].Store(tx + outCount);
+		screen[1].Store(ty + outCount);
+		screen[2].Store(tz + outCount);
 		outCount += 3;
+
+		if (!cullEnabled) {
+			// If culling is off, shuffle the three vectors to produce the opposite triangle, and store them after.
+			// Or on ARM we might be better off just storing individual elements...
+			screen[0].SwapLowerElements().Store(tx + outCount);
+			screen[1].SwapLowerElements().Store(ty + outCount);
+			screen[2].SwapLowerElements().Store(tz + outCount);
+			outCount += 3;
+		}
 	}
 	return outCount;
 }
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index b5318331750b..e2814d5a1ce7 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -227,7 +227,7 @@ struct GPUgstate {
 
 	// Cull
 	bool isCullEnabled() const { return cullfaceEnable & 1; }
-	int getCullMode()   const { return cullmode & 1; }
+	GECullMode getCullMode()   const { return (GECullMode)(cullmode & 1); }
 
 	// Color Mask
 	bool isClearModeColorMask() const { return (clearmode&0x100) != 0; }
diff --git a/GPU/ge_constants.h b/GPU/ge_constants.h
index 1d123e162093..f4a366f80250 100644
--- a/GPU/ge_constants.h
+++ b/GPU/ge_constants.h
@@ -623,6 +623,11 @@ enum GEPatchPrimType
 	GE_PATCHPRIM_UNKNOWN = 3,
 };
 
+enum GECullMode {
+	GE_CULL_CW = 0,
+	GE_CULL_CCW = 1,
+};
+
 inline GEPrimitiveType PatchPrimToPrim(GEPatchPrimType type) {
 	switch (type) {
 	case GE_PATCHPRIM_TRIANGLES: return GE_PRIM_TRIANGLES;

From ad1809875a8f8da561171209c94aac135f32868a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Fri, 20 Dec 2024 20:28:15 +0100
Subject: [PATCH 21/28] Minor sign check optimization

---
 GPU/Common/DepthRaster.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 9532e7b3cad7..ef7c4bc78f20 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -184,10 +184,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 			beta += A1,
 			gamma += A2)
 		{
-			int mask = alpha >= 0 && beta >= 0 && gamma >= 0;
+			int mask = alpha | beta | gamma;
 			// Early out if all of this quad's pixels are outside the triangle.
-			if (!mask) {
+			if (mask < 0) {
 				continue;
+			} else {
+				mask = 1;
 			}
 			// Compute barycentric-interpolated depth. Could also compute it incrementally.
 			float depth = zz[0] + beta * zz[1] + gamma * zz[2];

From d1b50ea543c4ad62a5c5b523d73653c37865252e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 01:19:10 +0100
Subject: [PATCH 22/28] Comment

---
 GPU/Common/DepthRaster.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index ef7c4bc78f20..1da6e6ac9a2b 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -184,12 +184,10 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 			beta += A1,
 			gamma += A2)
 		{
-			int mask = alpha | beta | gamma;
+			int mask = (alpha | beta | gamma) >= 0;
 			// Early out if all of this quad's pixels are outside the triangle.
-			if (mask < 0) {
+			if (!mask) {
 				continue;
-			} else {
-				mask = 1;
 			}
 			// Compute barycentric-interpolated depth. Could also compute it incrementally.
 			float depth = zz[0] + beta * zz[1] + gamma * zz[2];
@@ -312,7 +310,12 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 
 		if (!cullEnabled) {
 			// If culling is off, shuffle the three vectors to produce the opposite triangle, and store them after.
-			// Or on ARM we might be better off just storing individual elements...
+
+			// HOWEVER! I realized that this is not the optimal layout, after all.
+			// We should group 4 triangles at a time and interleave them (so we first have all X of vertex 0,
+			// then all X of vertex 1, and so on). This seems solvable with another transpose, if we can easily
+			// collect four triangles at a time...
+
 			screen[0].SwapLowerElements().Store(tx + outCount);
 			screen[1].SwapLowerElements().Store(ty + outCount);
 			screen[2].SwapLowerElements().Store(tz + outCount);

From 2051d55c90202570ae1170c114c3750745f45440 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 10:56:56 +0100
Subject: [PATCH 23/28] CrossSIMD: Add a bunch more functonality for use by the
 rasterizer

---
 Common/Math/CrossSIMD.h   | 100 ++++++++++++++++++++++++++++++++++----
 Common/Math/SIMDHeaders.h |  18 +++++++
 unittest/UnitTest.cpp     |   7 ++-
 3 files changed, 113 insertions(+), 12 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 945ea166b15e..f723ea57d6d6 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -39,13 +39,29 @@ struct Vec4S32 {
 			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
 		};
 	}
+	Vec4S32 SignBits32ToMask() {
+		return Vec4S32{
+			_mm_srai_epi32(v, 31)
+		};
+	}
 
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
+	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
+	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ _mm_and_si128(v, other.v) }; }
+	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ _mm_xor_si128(v, other.v) }; }
+	// TODO: andnot
+	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
+	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
+
 	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
 };
 
+inline bool AnyZeroSignBit(Vec4S32 value) {
+	return _mm_movemask_ps(_mm_castsi128_ps(value.v)) != 0xF;
+}
+
 struct Vec4F32 {
 	__m128 v;
 
@@ -108,9 +124,8 @@ struct Vec4F32 {
 	}
 };
 
-inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
-inline Vec4S32 VecS32FromF32Round(Vec4F32 f) { return Vec4S32{ _mm_cvtps_epi32(f.v) }; }
-inline Vec4F32 VecF32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }
+inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
+inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }
 
 struct Vec4U16 {
 	__m128i v;  // we only use the lower 64 bits.
@@ -121,11 +136,36 @@ struct Vec4U16 {
 	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
 	void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); }
 
-	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; }
-	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ _mm_max_epu16_SSE2(a.v, b.v) }; }
+	// NOTE: 16-bit signed saturation! Will work for a lot of things, but not all.
+	static Vec4U16 FromVec4S32(Vec4S32 v) {
+		return Vec4U16{ _mm_packu_epi32_SSE2(v.v)};
+	}
+	static Vec4U16 FromVec4F32(Vec4F32 v) {
+		return Vec4U16{ _mm_packu_epi32_SSE2(_mm_cvtps_epi32(v.v)) };
+	}
+
+	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ _mm_or_si128(v, other.v) }; }
+	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ _mm_and_si128(v, other.v) }; }
+	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ _mm_xor_si128(v, other.v) }; }
+
+	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
+	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
 };
 
+Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
+	__m128i temp = _mm_srai_epi32(v.v, 31);
+	return Vec4U16 {
+		_mm_packs_epi32(temp, temp)
+	};
+}
+
+Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
+	return Vec4U16{
+		_mm_andnot_si128(inverted.v, a.v)  // NOTE: with andnot, the first parameter is inverted, and then and is performed.
+	};
+}
+
 #elif PPSSPP_ARCH(ARM_NEON)
 
 struct Mat4F32 {
@@ -163,6 +203,12 @@ struct Vec4S32 {
 	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
 	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
 	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
+	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
+	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
+
+	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
+	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
 };
 
 struct Vec4F32 {
@@ -178,14 +224,12 @@ struct Vec4F32 {
 
 	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
 		int16x4_t value = vld1_s16(src);
-		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
 		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
 	}
 
 	static Vec4F32 LoadConvertS8(const int8_t *src) {  // Note: will load 8 bytes
 		int8x8_t value = vld1_s8(src);
 		int16x4_t value16 = vget_low_s16(vmovl_s8(value));
-		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
 		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
 	}
 
@@ -244,7 +288,15 @@ struct Vec4F32 {
 	}
 };
 
-inline Vec4S32 VecS32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }
+inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }
+inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ vcvtq_f32_s32(f.v) }; }
+
+inline bool AnyZeroSignBit(Vec4S32 value) {
+	// Very suboptimal, let's optimize later.
+	int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v));
+	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
+	return (mask & 0x80000000) != 0;
+}
 
 struct Vec4U16 {
 	uint16x4_t v;  // we only use the lower 64 bits.
@@ -254,15 +306,43 @@ struct Vec4U16 {
 	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
 	void Store(uint16_t *mem) { vst1_u16(mem, v); }
 
-	static Vec4U16 Max(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmax_u16(a.v, b.v) }; }
-	static Vec4U16 Min(Vec4U16 a, Vec4U16 b) { return Vec4U16{ vmin_u16(a.v, b.v) }; }
+	static Vec4U16 FromVec4S32(Vec4S32 v) {
+		return Vec4U16{ vmovn_u16(v.v) };
+	}
+	static Vec4U16 FromVec4F32(Vec4F32 v) {
+		return Vec4U16{ vmovn_u16(vcvtq_u32_f32(v.v)) };
+	}
+
+	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; }
+	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ vand_u16(v, other.v) }; }
+	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ veor_u16(v, other.v) }; }
+
+	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
+	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
 };
 
+Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
+	int32x4_t sign_mask = vshrq_n_s32(v.v, 31);
+	uint16x4_t result = vreinterpret_u16_s16(vmovn_s32(sign_mask));
+	return Vec4U16{ result };
+}
+
+Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
+	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
+}
+
 #else
 
 struct Vec4S32 {
 	s32 v[4];
+
+	Vec4S32 operator +(Vec4S32 other) const {
+		return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {
+		return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } };
+	}
 };
 
 #endif
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 3f8500dfb273..9705dca55434 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -118,6 +118,24 @@ inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
 	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
 }
 
+#ifdef __cplusplus
+
+alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
+alignas(16) static const uint32_t g_sign16[4] = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 };
+
+// Alternate solution to the above, not sure if faster or slower.
+// SSE2 replacement for half of _mm_packus_epi32 but without the saturation.
+// Not ideal! pshufb would make this faster but that's SSSE3.
+inline __m128i _mm_packu1_epi32_SSE2(const __m128i v0) {
+	// Toggle the sign bit, pack, then toggle back.
+	__m128i toggled = _mm_sub_epi32(v0, _mm_load_si128((const __m128i *)g_sign32));
+	__m128i temp = _mm_packs_epi32(toggled, toggled);
+	__m128i restored = _mm_add_epi16(temp, _mm_load_si128((const __m128i *)g_sign16));
+	return restored;
+}
+
+#endif
+
 // SSE2 replacement for the entire _mm_packus_epi32 but without the saturation.
 // Not ideal! pshufb would make this faster but that's SSSE3.
 inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp
index fe3fba0af4b9..a087d205b96b 100644
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@@ -1114,12 +1114,15 @@ bool TestSIMD() {
 	__m128i a = _mm_set_epi16(0, 0x4444, 0, 0x3333, 0, 0x2222, 0, 0x1111);
 	__m128i b = _mm_set_epi16(0, (int16_t)0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555);
 	__m128i c = _mm_packu2_epi32_SSE2(a, b);
-	__m128i d = _mm_packus_epi32(a, b);
+	__m128i d = _mm_packu1_epi32_SSE2(b);
 
-	uint64_t testdata2[2];
+	uint64_t testdata2[4];
 	_mm_store_si128((__m128i *)testdata2, c);
+	_mm_store_si128((__m128i *)testdata2 + 1, d);
 	EXPECT_EQ_INT(testdata2[0], 0x4444333322221111);
 	EXPECT_EQ_INT(testdata2[1], 0x8888777766665555);
+	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
+	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 #endif
 	return true;
 }

From 399570e4113b76f7b915389dc90209666e540da9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 13:21:20 +0100
Subject: [PATCH 24/28] CrossSIMD: make the transpose function compatible with
 ARM32

---
 Common/Math/CrossSIMD.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index f723ea57d6d6..982c859439c1 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -264,6 +264,8 @@ struct Vec4F32 {
 
 	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
 	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+#if PPSSPP_ARCH(ARM64_NEON)
+		// Only works on ARM64
 		float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
 		float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
 		float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
@@ -272,6 +274,14 @@ struct Vec4F32 {
 		col1.v = vzip2q_f32(temp0, temp2);
 		col2.v = vzip1q_f32(temp1, temp3);
 		col3.v = vzip2q_f32(temp1, temp3);
+#else
+   		float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v);
+        float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v);
+        col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0]));
+        col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1]));
+        col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0]));
+        col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1]));
+#endif
 	}
 
 	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {

From 73ae6da75793ca00b25f272542a8536a2e6abe2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 11:28:29 +0100
Subject: [PATCH 25/28] Reimplement the depth rasterizer with SIMD.

---
 GPU/Common/DepthRaster.cpp | 263 ++++++++++++++++++++-----------------
 1 file changed, 145 insertions(+), 118 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 1da6e6ac9a2b..7075ebd9f77f 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -8,7 +8,14 @@
 #include "Common/Math/math_util.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 
-void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
+// We only need to support these three modes.
+enum class ZCompareMode {
+	Greater,  // Most common
+	Less,  // Less common
+	Always,  // Fairly common
+};
+
+void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, ZCompareMode compareMode) {
 	// Swap coordinates if needed, we don't back-face-cull rects.
 	// We also ignore the UV rotation here.
 	if (x1 > x2) {
@@ -26,8 +33,8 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 	for (int y = y1; y < y2; y++) {
 		__m128i *ptr = (__m128i *)(dest + stride * y + x1);
 		int w = x2 - x1;
-		switch (depthCompare) {
-		case GE_COMP_ALWAYS:
+		switch (compareMode) {
+		case ZCompareMode::Always:
 			if (depthValue == 0) {
 				memset(ptr, 0, w * 2);
 			} else {
@@ -39,8 +46,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 			}
 			break;
 			// TODO: Trailer
-		case GE_COMP_NEVER:
-			break;
 		default:
 			// TODO
 			break;
@@ -53,8 +58,8 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 		uint16_t *ptr = (uint16_t *)(dest + stride * y + x1);
 		int w = x2 - x1;
 
-		switch (depthCompare) {
-		case GE_COMP_ALWAYS:
+		switch (compareMode) {
+		case ZCompareMode::Always:
 			if (depthValue == 0) {
 				memset(ptr, 0, w * 2);
 			} else {
@@ -66,8 +71,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 			}
 			break;
 			// TODO: Trailer
-		case GE_COMP_NEVER:
-			break;
 		default:
 			// TODO
 			break;
@@ -78,10 +81,39 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 #endif
 }
 
+alignas(16) static const int zero123[4]  = {0, 1, 2, 3};
+
+struct Edge {
+	// Dimensions of our pixel group
+	static const int stepXSize = 4;
+	static const int stepYSize = 1;
+
+	Vec4S32 oneStepX;
+	Vec4S32 oneStepY;
+
+	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {
+		// Edge setup
+		int A = v0y - v1y;
+		int B = v1x - v0x;
+		int C = v0x * v1y - v0y * v1x;
+
+		// Step deltas
+		oneStepX = Vec4S32::Splat(A * stepXSize);
+		oneStepY = Vec4S32::Splat(B * stepYSize);
+
+		// x/y values for initial pixel block. Add horizontal offsets.
+		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
+		Vec4S32 y = Vec4S32::Splat(p0y);
+
+		// Edge function values at origin
+		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
+	}
+};
+
 // Adapted from Intel's depth rasterizer example.
 // Started with the scalar version, will SIMD-ify later.
 // x1/y1 etc are the scissor rect.
-void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, GEComparison compareMode) {
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) {
 	int tileStartX = x1;
 	int tileEndX = x2;
 
@@ -93,124 +125,90 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	// are slow on SSE2.
 
 	// Convert to whole pixels for now. Later subpixel precision.
-	DepthScreenVertex verts[3];
-	verts[0].x = tx[0];
-	verts[0].y = ty[0];
-	verts[0].z = tz[0];
-	verts[1].x = tx[1];
-	verts[1].y = ty[1];
-	verts[1].z = tz[1];
-	verts[2].x = tx[2];
-	verts[2].y = ty[2];
-	verts[2].z = tz[2];
+	int v0x = tx[0];
+	int v0y = ty[0];
+	int v0z = tz[0];
+	int v1x = tx[1];
+	int v1y = ty[1];
+	int v1z = tz[1];
+	int v2x = tx[2];
+	int v2y = ty[2];
+	int v2z = tz[2];
 
 	// use fixed-point only for X and Y.  Avoid work for Z and W.
-	int startX = std::max(std::min(std::min(verts[0].x, verts[1].x), verts[2].x), tileStartX);
-	int endX = std::min(std::max(std::max(verts[0].x, verts[1].x), verts[2].x) + 1, tileEndX);
-
-	int startY = std::max(std::min(std::min(verts[0].y, verts[1].y), verts[2].y), tileStartY);
-	int endY = std::min(std::max(std::max(verts[0].y, verts[1].y), verts[2].y) + 1, tileEndY);
-	if (endX == startX || endY == startY) {
+	// We use 4x1 tiles for simplicity.
+	int minX = std::max(std::min(std::min(v0x, v1x), v2x), tileStartX) & ~3;
+	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, tileEndX) & ~3;
+	int minY = std::max(std::min(std::min(v0y, v1y), v2y), tileStartY);
+	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
+	if (maxX == minX || maxY == minY) {
 		// No pixels, or outside screen.
 		return;
 	}
+
 	// TODO: Cull really small triangles here.
 
-	// Fab(x, y) =     Ax       +       By     +      C              = 0
-	// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
-	// Compute A = (ya - yb) for the 3 line segments that make up each triangle
-	int A0 = verts[1].y - verts[2].y;
-	int A1 = verts[2].y - verts[0].y;
-	int A2 = verts[0].y - verts[1].y;
-
-	// Compute B = (xb - xa) for the 3 line segments that make up each triangle
-	int B0 = verts[2].x - verts[1].x;
-	int B1 = verts[0].x - verts[2].x;
-	int B2 = verts[1].x - verts[0].x;
-
-	// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
-	int C0 = verts[1].x * verts[2].y - verts[2].x * verts[1].y;
-	int C1 = verts[2].x * verts[0].y - verts[0].x * verts[2].y;
-	int C2 = verts[0].x * verts[1].y - verts[1].x * verts[0].y;
-
-	// Compute triangle area.
-	// TODO: Cull really small triangles here - we can just raise the comparison value below.
-	int triArea = A0 * verts[0].x + B0 * verts[0].y + C0;
+	Edge e01, e12, e20;
+
+	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
+	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
+	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+
+	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
 	if (triArea <= 0) {
-		// Too small to rasterize or backface culled
-		// NOTE: Just disabling this check won't enable two-sided rendering.
-		// Since it's not that common, let's just queue the triangles with both windings.
 		return;
 	}
-
-	int rowIdx = (startY * stride + startX);
-	int col = startX;
-	int row = startY;
-
-	// Calculate slopes at starting corner.
-	int alpha0 = (A0 * col) + (B0 * row) + C0;
-	int beta0 = (A1 * col) + (B1 * row) + C1;
-	int gamma0 = (A2 * col) + (B2 * row) + C2;
-
-	float oneOverTriArea = (1.0f / float(triArea));
-
-	float zz[3];
-	zz[0] = (float)verts[0].z;
-	zz[1] = (float)(verts[1].z - verts[0].z) * oneOverTriArea;
-	zz[2] = (float)(verts[2].z - verts[0].z) * oneOverTriArea;
-
-	// END triangle setup.
-
-	// Here we should draw four triangles in a sequence.
-
-	// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
-	for (int r = startY; r < endY; r++,
-		row++,
-		rowIdx += stride,
-		alpha0 += B0,
-		beta0 += B1,
-		gamma0 += B2)
-	{
-		int idx = rowIdx;
-
-		// Restore row steppers.
-		int alpha = alpha0;
-		int beta = beta0;
-		int gamma = gamma0;
-
-		for (int c = startX; c < endX; c++,
-			idx++,
-			alpha += A0,
-			beta += A1,
-			gamma += A2)
-		{
-			int mask = (alpha | beta | gamma) >= 0;
-			// Early out if all of this quad's pixels are outside the triangle.
-			if (!mask) {
+	float oneOverTriArea = 1.0f / (float)triArea;
+
+	// Prepare to interpolate Z
+	Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
+	Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
+	Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea);
+
+	// Rasterize
+	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) {
+		// Barycentric coordinates at start of row
+		Vec4S32 w0 = w0_row;
+		Vec4S32 w1 = w1_row;
+		Vec4S32 w2 = w2_row;
+
+		uint16_t *rowPtr = depthBuf + stride * y;
+
+		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) {
+			// If p is on or inside all edges for any pixels,
+			// render those pixels.
+			Vec4S32 signCalc = w0 | w1 | w2;
+			if (!AnyZeroSignBit(signCalc)) {
 				continue;
 			}
-			// Compute barycentric-interpolated depth. Could also compute it incrementally.
-			float depth = zz[0] + beta * zz[1] + gamma * zz[2];
-			float previousDepthValue = (float)depthBuf[idx];
 
-			int depthMask;
+			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+			// Compute the Z value for all four pixels.
+			// float depth = zz[0] + beta * zz[1] + gamma * zz[2];
+			Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2);
+
+			// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
 			switch (compareMode) {
-			case GE_COMP_EQUAL:  depthMask = depth == previousDepthValue; break;
-			case GE_COMP_LESS: depthMask = depth < previousDepthValue; break;
-			case GE_COMP_LEQUAL: depthMask = depth <= previousDepthValue; break;
-			case GE_COMP_GEQUAL: depthMask = depth >= previousDepthValue; break;
-			case GE_COMP_GREATER: depthMask = depth > previousDepthValue; break;
-			case GE_COMP_NOTEQUAL: depthMask = depth != previousDepthValue; break;
-			case GE_COMP_ALWAYS:
-			default:
-				depthMask = 1;
+			case ZCompareMode::Greater:
+				// To implement the greater/greater-than comparison, we can combine mask and max.
+				// It might be better to do the math in float space on x86 due to SSE2 deficiencies.
+				// We use AndNot to zero out Z results, before doing Max with the buffer.
+				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Less:  // UNTESTED
+				// This time, we OR the mask and use .Min.
+				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Always:  // UNTESTED
+				// This could be replaced with a vblend operation.
+				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
 				break;
 			}
-			int finalMask = mask & depthMask;
-			depth = finalMask == 1 ? depth : previousDepthValue;
-			depthBuf[idx] = (u16)depth;
-		} //for each column
-	} // for each row
+		}
+	}
 }
 
 void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
@@ -299,9 +297,9 @@ int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *tran
 		z *= recipW;
 
 		Vec4S32 screen[3];
-		screen[0] = VecS32FromF32((x * viewportScaleX + viewportX) - offsetX);
-		screen[1] = VecS32FromF32((y * viewportScaleY + viewportY) - offsetY);
-		screen[2] = VecS32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
+		screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
+		screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
+		screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
 
 		screen[0].Store(tx + outCount);
 		screen[1].Store(ty + outCount);
@@ -341,12 +339,41 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 	// Prim should now be either TRIANGLES or RECTs.
 	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
 
+	// Ignore draws where stencil operations are active?
+	if (gstate.isStencilTestEnabled()) {
+		// return;
+	}
+
 	GEComparison compareMode = gstate.getDepthTestFunction();
+
+	ZCompareMode comp;
+	// Ignore some useless compare modes.
+	switch (compareMode) {
+	case GE_COMP_NEVER:
+	case GE_COMP_EQUAL:
+		// These will never have a useful effect in Z-only raster.
+		return;
+	case GE_COMP_ALWAYS:
+		comp = ZCompareMode::Always;
+		break;
+	case GE_COMP_LEQUAL:
+	case GE_COMP_LESS:
+		comp = ZCompareMode::Less;
+		break;
+	case GE_COMP_GEQUAL:
+	case GE_COMP_GREATER:
+		comp = ZCompareMode::Greater;  // Most common
+		break;
+	case GE_COMP_NOTEQUAL:
+		// This is highly unusual, let's just ignore it.
+		return;
+	}
+
 	if (gstate.isModeClear()) {
 		if (!gstate.isClearModeDepthMask()) {
 			return;
 		}
-		compareMode = GE_COMP_ALWAYS;
+		comp = ZCompareMode::Always;
 	} else {
 		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
 			return;
@@ -358,12 +385,12 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 			uint16_t z = tz[i + 1];  // depth from second vertex
 			// TODO: Should clip coordinates to the scissor rectangle.
 			// We remove the subpixel information here.
-			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, compareMode);
+			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp);
 		}
 		break;
 	case GE_PRIM_TRIANGLES:
 		for (int i = 0; i < count; i += 3) {
-			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], compareMode);
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
 		}
 		break;
 	default:

From 5df88fc1aab1feb92e60a6b14eec01f3b56e85b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 11:33:37 +0100
Subject: [PATCH 26/28] Convert the rect implementation to CrossSIMD

---
 Common/Math/CrossSIMD.h    | 20 ++++++++++++++++++++
 GPU/Common/DepthRaster.cpp | 33 ++-------------------------------
 2 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 982c859439c1..83ac18add4ef 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -153,6 +153,16 @@ struct Vec4U16 {
 	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }
 };
 
+struct Vec8U16 {
+	__m128i v;
+
+	static Vec8U16 Zero() { return Vec8U16{ _mm_setzero_si128() }; }
+	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ _mm_set1_epi16((int16_t)value) }; }
+
+	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ _mm_loadu_si128((__m128i *)mem) }; }
+	void Store(uint16_t *mem) { _mm_storeu_si128((__m128i *)mem, v); }
+};
+
 Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
 	__m128i temp = _mm_srai_epi32(v.v, 31);
 	return Vec4U16 {
@@ -342,6 +352,16 @@ Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {
 	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
 }
 
+struct Vec8U16 {
+	uint16x8_t v;
+
+	static Vec8U16 Zero() { return Vec8U16{ vdupq_n_u16(0) }; }
+	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ vdupq_n_u16(value) }; }
+
+	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ vld1q_u16(mem) }; }
+	void Store(uint16_t *mem) { vst1q_u16(mem, v); }
+};
+
 #else
 
 struct Vec4S32 {
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 7075ebd9f77f..4443974059e3 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -28,43 +28,17 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 		return;
 	}
 
-#if PPSSPP_ARCH(SSE2)
-	__m128i valueX8 = _mm_set1_epi16(depthValue);
-	for (int y = y1; y < y2; y++) {
-		__m128i *ptr = (__m128i *)(dest + stride * y + x1);
-		int w = x2 - x1;
-		switch (compareMode) {
-		case ZCompareMode::Always:
-			if (depthValue == 0) {
-				memset(ptr, 0, w * 2);
-			} else {
-				while (w >= 8) {
-					_mm_storeu_si128(ptr, valueX8);
-					ptr++;
-					w -= 8;
-				}
-			}
-			break;
-			// TODO: Trailer
-		default:
-			// TODO
-			break;
-		}
-	}
-
-#elif PPSSPP_ARCH(ARM64_NEON)
-	uint16x8_t valueX8 = vdupq_n_u16(depthValue);
+	Vec8U16 valueX8 = Vec8U16::Splat(depthValue);
 	for (int y = y1; y < y2; y++) {
 		uint16_t *ptr = (uint16_t *)(dest + stride * y + x1);
 		int w = x2 - x1;
-
 		switch (compareMode) {
 		case ZCompareMode::Always:
 			if (depthValue == 0) {
 				memset(ptr, 0, w * 2);
 			} else {
 				while (w >= 8) {
-					vst1q_u16(ptr, valueX8);
+					valueX8.Store(ptr);
 					ptr += 8;
 					w -= 8;
 				}
@@ -76,9 +50,6 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
 			break;
 		}
 	}
-#else
-	// Do nothing for now
-#endif
 }
 
 alignas(16) static const int zero123[4]  = {0, 1, 2, 3};

From 8cd86b47b5c49f39fd9490029bf8358133161a8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 11:44:49 +0100
Subject: [PATCH 27/28] AnyZeroSignBit arm fix, more crosssimd fixes. Now works
 on ARM.

---
 Common/Math/CrossSIMD.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 83ac18add4ef..55e7b86fc7f5 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -309,19 +309,20 @@ struct Vec4F32 {
 };
 
 inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }
-inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ vcvtq_f32_s32(f.v) }; }
+inline Vec4F32 Vec4F32FromS32(Vec4S32 s) { return Vec4F32{ vcvtq_f32_s32(s.v) }; }
 
 inline bool AnyZeroSignBit(Vec4S32 value) {
 	// Very suboptimal, let's optimize later.
 	int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v));
 	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
-	return (mask & 0x80000000) != 0;
+	return (mask & 0x80000000) == 0;
 }
 
 struct Vec4U16 {
-	uint16x4_t v;  // we only use the lower 64 bits.
+	uint16x4_t v;  // 64 bits.
 
 	static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; }
+	static Vec4U16 Splat(uint16_t value) { return Vec4U16{ vdup_n_u16(value) }; }
 
 	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
 	void Store(uint16_t *mem) { vst1_u16(mem, v); }
@@ -330,7 +331,7 @@ struct Vec4U16 {
 		return Vec4U16{ vmovn_u16(v.v) };
 	}
 	static Vec4U16 FromVec4F32(Vec4F32 v) {
-		return Vec4U16{ vmovn_u16(vcvtq_u32_f32(v.v)) };
+		return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(v.v))) };
 	}
 
 	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; }

From 80cb57f8bb05d833b19ffac414cd71ddc7399109 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= <hrydgard@gmail.com>
Date: Sat, 21 Dec 2024 12:57:12 +0100
Subject: [PATCH 28/28] Cleanup

---
 GPU/Common/DepthRaster.cpp | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
index 4443974059e3..ca8f81cedfb3 100644
--- a/GPU/Common/DepthRaster.cpp
+++ b/GPU/Common/DepthRaster.cpp
@@ -118,6 +118,12 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	}
 
 	// TODO: Cull really small triangles here.
+	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
+	if (triArea <= 0) {
+		return;
+	}
+
+	float oneOverTriArea = 1.0f / (float)triArea;
 
 	Edge e01, e12, e20;
 
@@ -125,12 +131,6 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
 	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
 	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
 
-	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
-	if (triArea <= 0) {
-		return;
-	}
-	float oneOverTriArea = 1.0f / (float)triArea;
-
 	// Prepare to interpolate Z
 	Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
 	Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
@@ -320,10 +320,6 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 	ZCompareMode comp;
 	// Ignore some useless compare modes.
 	switch (compareMode) {
-	case GE_COMP_NEVER:
-	case GE_COMP_EQUAL:
-		// These will never have a useful effect in Z-only raster.
-		return;
 	case GE_COMP_ALWAYS:
 		comp = ZCompareMode::Always;
 		break;
@@ -335,8 +331,14 @@ void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType pr
 	case GE_COMP_GREATER:
 		comp = ZCompareMode::Greater;  // Most common
 		break;
+	case GE_COMP_NEVER:
+	case GE_COMP_EQUAL:
+		// These will never have a useful effect in Z-only raster.
+		[[fallthrough]];
 	case GE_COMP_NOTEQUAL:
 		// This is highly unusual, let's just ignore it.
+		[[fallthrough]];
+	default:
 		return;
 	}