diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1860a2b5332..fcf74005e7f9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1906,6 +1906,8 @@ set(GPU_SOURCES
 	GPU/Common/Draw2D.cpp
 	GPU/Common/Draw2D.h
 	GPU/Common/DepthBufferCommon.cpp
+	GPU/Common/DepthRaster.cpp
+	GPU/Common/DepthRaster.h
 	GPU/Common/TextureShaderCommon.cpp
 	GPU/Common/TextureShaderCommon.h
 	GPU/Common/DepalettizeShaderCommon.cpp
diff --git a/Common/Data/Convert/ColorConv.cpp b/Common/Data/Convert/ColorConv.cpp
index 72fac52f2f01..5c4df7fca808 100644
--- a/Common/Data/Convert/ColorConv.cpp
+++ b/Common/Data/Convert/ColorConv.cpp
@@ -65,7 +65,7 @@ void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
 }
 
 #if PPSSPP_ARCH(SSE2)
-// fp64's improved version, see #19751
+// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
 static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
 	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
 	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
@@ -76,7 +76,7 @@ static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp,
 		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
 		__m128i c1 = _mm_load_si128(&srcp[i + 1]);
 
-		__m128i rb0 = _mm_and_si128(c0, maskRB);              // 00000000bbbbb00000000000rrrrr000
+		__m128i rb0 = _mm_and_si128(c0, maskRB);              // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
 		__m128i rb1 = _mm_and_si128(c1, maskRB);              // 00000000bbbbb00000000000rrrrr000
 		__m128i ga0 = _mm_and_si128(c0, maskGA);              // a000000000000000ggggg00000000000
 		__m128i ga1 = _mm_and_si128(c1, maskGA);              // a000000000000000ggggg00000000000
diff --git a/Common/GPU/DataFormat.h b/Common/GPU/DataFormat.h
index 6f5cd2bc2eae..428146802a6c 100644
--- a/Common/GPU/DataFormat.h
+++ b/Common/GPU/DataFormat.h
@@ -74,6 +74,7 @@ bool DataFormatIsDepthStencil(DataFormat fmt);
 inline bool DataFormatIsColor(DataFormat fmt) {
 	return !DataFormatIsDepthStencil(fmt);
 }
+int DataFormatNumChannels(DataFormat fmt);
 bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize);
 
 // Limited format support for now.
diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
index e3d135e8c0b1..b744235b892f 100644
--- a/Common/GPU/Vulkan/thin3d_vulkan.cpp
+++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -803,9 +803,15 @@ bool VKTexture::Create(VkCommandBuffer cmd, VulkanBarrierBatch *postBarriers, Vu
 	}
 
 	VkComponentMapping r8AsAlpha[4] = { VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_ONE, VK_COMPONENT_SWIZZLE_R };
+	VkComponentMapping r8AsColor[4] = { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_ONE };
 
+	VkComponentMapping *swizzle = nullptr;
+	switch (desc.swizzle) {
+	case TextureSwizzle::R8_AS_ALPHA: swizzle = r8AsAlpha; break;
+	case TextureSwizzle::R8_AS_GRAYSCALE: swizzle = r8AsColor; break;
+	}
 	VulkanBarrierBatch barrier;
-	if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, desc.swizzle == TextureSwizzle::R8_AS_ALPHA ? r8AsAlpha : nullptr)) {
+	if (!vkTex_->CreateDirect(width_, height_, 1, mipLevels_, vulkanFormat, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, usageBits, &barrier, swizzle)) {
 		ERROR_LOG(Log::G3D,  "Failed to create VulkanTexture: %dx%dx%d fmt %d, %d levels", width_, height_, depth_, (int)vulkanFormat, mipLevels_);
 		return false;
 	}
diff --git a/Common/GPU/thin3d.cpp b/Common/GPU/thin3d.cpp
index de8db83fb331..72fbe873e86a 100644
--- a/Common/GPU/thin3d.cpp
+++ b/Common/GPU/thin3d.cpp
@@ -118,6 +118,25 @@ bool DataFormatIsBlockCompressed(DataFormat fmt, int *blockSize) {
 	}
 }
 
+int DataFormatNumChannels(DataFormat fmt) {  // Returns the channel count of fmt, or 0 for any format not listed below.
+	switch (fmt) {
+	case DataFormat::D16:
+	case DataFormat::D32F:
+	case DataFormat::R8_UNORM:
+	case DataFormat::R16_UNORM:
+	case DataFormat::R16_FLOAT:
+	case DataFormat::R32_FLOAT:
+		return 1;
+	case DataFormat::R8G8B8A8_UNORM:
+	case DataFormat::R8G8B8A8_UNORM_SRGB:
+	case DataFormat::B8G8R8A8_UNORM:
+	case DataFormat::B8G8R8A8_UNORM_SRGB:
+		return 4;
+	default:
+		return 0;  // Not handled here; callers must treat 0 as "unknown".
+	}
+}
+
 RefCountedObject::~RefCountedObject() {
 	const int rc = refcount_.load();
 	_dbg_assert_msg_(rc == 0xDEDEDE, "Unexpected refcount %d in object of type '%s'", rc, name_);
diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
index ce62382718e2..f7a23795b017 100644
--- a/Common/GPU/thin3d.h
+++ b/Common/GPU/thin3d.h
@@ -643,6 +643,7 @@ typedef std::function<bool(uint8_t *data, const uint8_t *initData, uint32_t w, u
 enum class TextureSwizzle {
 	DEFAULT,
 	R8_AS_ALPHA,
+	R8_AS_GRAYSCALE,  // The Vulkan backend maps this to the component swizzle (R, R, R, ONE).
 };
 
 struct TextureDesc {
diff --git a/Common/Math/CrossSIMD.h b/Common/Math/CrossSIMD.h
index 6ad03b832217..55e7b86fc7f5 100644
--- a/Common/Math/CrossSIMD.h
+++ b/Common/Math/CrossSIMD.h
@@ -6,3 +6,374 @@
 
 #include "Common/Math/SIMDHeaders.h"
 
+#if PPSSPP_ARCH(SSE2)
+
+struct Mat4F32 {  // 4x4 float matrix kept as four SSE registers, one per group of four consecutive floats.
+	Mat4F32(const float *matrix) {  // Reads 16 floats using unaligned loads.
+		col0 = _mm_loadu_ps(matrix);
+		col1 = _mm_loadu_ps(matrix + 4);
+		col2 = _mm_loadu_ps(matrix + 8);
+		col3 = _mm_loadu_ps(matrix + 12);
+	}
+
+	__m128 col0;
+	__m128 col1;
+	__m128 col2;
+	__m128 col3;
+};
+
+struct Vec4S32 {  // Four signed 32-bit integer lanes (SSE2 backend).
+	__m128i v;
+
+	static Vec4S32 Zero() { return Vec4S32{ _mm_setzero_si128() }; }
+	static Vec4S32 Splat(int lane) { return Vec4S32{ _mm_set1_epi32(lane) }; }
+
+	static Vec4S32 Load(const int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; }
+	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; }
+	void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); }
+	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v);}
+
+	// Swaps the two lower elements and leaves the two upper ones in place. Useful for reversing triangles.
+	Vec4S32 SwapLowerElements() {
+		return Vec4S32{
+			_mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 0, 1))
+		};
+	}
+	Vec4S32 SignBits32ToMask() {  // Each lane becomes all ones if it was negative, otherwise zero.
+		return Vec4S32{
+			_mm_srai_epi32(v, 31)
+		};
+	}
+
+	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
+	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
+	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
+	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ _mm_and_si128(v, other.v) }; }
+	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ _mm_xor_si128(v, other.v) }; }
+	// TODO: andnot
+	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
+	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
+
+	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
+	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; }  // (ab3,ab2,ab1,ab0)
+};
+
+inline bool AnyZeroSignBit(Vec4S32 value) {  // True if at least one of the four lanes has its sign bit clear.
+	return _mm_movemask_ps(_mm_castsi128_ps(value.v)) != 0xF;  // movemask gathers the four sign bits into bits 0..3.
+}
+
+struct Vec4F32 {  // Four 32-bit float lanes (SSE2 backend).
+	__m128 v;
+
+	static Vec4F32 Zero() { return Vec4F32{ _mm_setzero_ps() }; }
+	static Vec4F32 Splat(float lane) { return Vec4F32{ _mm_set1_ps(lane) }; }
+
+	static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; }
+	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
+	void Store(float *dst) { _mm_storeu_ps(dst, v); }
+	void StoreAligned (float *dst) { _mm_store_ps(dst, v); }
+
+	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
+		__m128i value = _mm_loadl_epi64((const __m128i *)src);
+		// 16-bit to 32-bit: duplicate each word into both halves of its lane, then an arithmetic shift right by 16 sign extends.
+		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value, value), 16)) };
+	}
+
+	static Vec4F32 LoadConvertS8(const int8_t *src) {  // Note: will load 8 bytes, but only the first 4 are converted
+		__m128i value = _mm_loadl_epi64((const __m128i *)src);
+		__m128i value16 = _mm_unpacklo_epi8(value, value);
+		// 8-bit to 32-bit: each byte now fills its whole 32-bit lane, so an arithmetic shift right by 24 sign extends.
+		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
+	}
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }
+
+	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
+	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
+	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
+	void operator +=(Vec4F32 other) { v = _mm_add_ps(v, other.v); }
+	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
+	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
+	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
+	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+
+	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
+	Vec4F32 Recip() { return Vec4F32{ _mm_rcp_ps(v) }; }  // Approximate reciprocal (rcpps), not a full-precision divide.
+
+	Vec4F32 Clamp(float lower, float higher) {
+		return Vec4F32{
+			_mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher))
+		};
+	}
+
+	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {  // Computes col0*x + col1*y + col2*z + col3; lane 3 of this vector is ignored.
+		return Vec4F32{ _mm_add_ps(
+			_mm_add_ps(
+				_mm_mul_ps(m.col0, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))),
+				_mm_mul_ps(m.col1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)))
+			),
+			_mm_add_ps(
+				_mm_mul_ps(m.col2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))),
+				m.col3)
+			)
+		};
+	}
+
+	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {  // In-place 4x4 transpose of the four vectors.
+		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
+	}
+};
+
+inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }  // Truncates toward zero.
+inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }  // Per-lane int to float conversion.
+
+struct Vec4U16 {  // Four unsigned 16-bit lanes (SSE2 backend).
+	__m128i v;  // we only use the lower 64 bits.
+
+	static Vec4U16 Zero() { return Vec4U16{ _mm_setzero_si128() }; }
+	// static Vec4U16 AllOnes() { return Vec4U16{ _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()) }; }
+
+	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
+	void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); }
+
+	// NOTE: 16-bit signed saturation! Will work for a lot of things, but not all.
+	static Vec4U16 FromVec4S32(Vec4S32 v) {
+		return Vec4U16{ _mm_packu_epi32_SSE2(v.v)};
+	}
+	static Vec4U16 FromVec4F32(Vec4F32 v) {  // Uses _mm_cvtps_epi32, i.e. the current rounding mode rather than truncation.
+		return Vec4U16{ _mm_packu_epi32_SSE2(_mm_cvtps_epi32(v.v)) };
+	}
+
+	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ _mm_or_si128(v, other.v) }; }
+	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ _mm_and_si128(v, other.v) }; }
+	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ _mm_xor_si128(v, other.v) }; }
+
+	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
+	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_sub_epi16(v, _mm_subs_epu16(v, other.v)) }; }  // Unsigned min in plain SSE2: a - saturate(a - b).
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }  // 0xFFFF per lane where this < other (unsigned).
+};
+
+struct Vec8U16 {  // Eight unsigned 16-bit lanes (SSE2 backend); currently only splat/load/store.
+	__m128i v;
+
+	static Vec8U16 Zero() { return Vec8U16{ _mm_setzero_si128() }; }
+	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ _mm_set1_epi16((int16_t)value) }; }
+
+	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ _mm_loadu_si128((__m128i *)mem) }; }  // Unaligned load of 16 bytes.
+	void Store(uint16_t *mem) { _mm_storeu_si128((__m128i *)mem, v); }  // Unaligned store of 16 bytes.
+};
+
+inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {  // inline: defined in a header. Returns 0xFFFF per lane where v is negative, else 0.
+	__m128i temp = _mm_srai_epi32(v.v, 31);  // 0 or -1 per 32-bit lane.
+	return Vec4U16 {
+		_mm_packs_epi32(temp, temp)
+	};
+}
+
+inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {  // inline: defined in a header. Computes a & ~inverted.
+	return Vec4U16{
+		_mm_andnot_si128(inverted.v, a.v)  // NOTE: with andnot, the first parameter is inverted, and then and is performed.
+	};
+}
+
+#elif PPSSPP_ARCH(ARM_NEON)
+
+struct Mat4F32 {  // 4x4 float matrix kept as four NEON registers, one per group of four consecutive floats.
+	Mat4F32(const float *matrix) {  // Reads 16 floats.
+		col0 = vld1q_f32(matrix);
+		col1 = vld1q_f32(matrix + 4);
+		col2 = vld1q_f32(matrix + 8);
+		col3 = vld1q_f32(matrix + 12);
+	}
+	float32x4_t col0;
+	float32x4_t col1;
+	float32x4_t col2;
+	float32x4_t col3;
+};
+
+struct Vec4S32 {  // Four signed 32-bit integer lanes (NEON backend).
+	int32x4_t v;
+
+	static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; }
+	static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; }
+
+	static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
+	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
+	void Store(int *dst) { vst1q_s32(dst, v); }
+	void StoreAligned(int *dst) { vst1q_s32(dst, v); }
+
+	// Swaps the two lower elements, but NOT the two upper ones. Useful for reversing triangles..
+	// This is quite awkward on ARM64 :/ Maybe there's a better solution?
+	Vec4S32 SwapLowerElements() {
+		int32x2_t upper = vget_high_s32(v);  // The *_s32 intrinsics yield int32x2_t, not float32x2_t.
+		int32x2_t lowerSwapped = vrev64_s32(vget_low_s32(v));
+		return Vec4S32{ vcombine_s32(lowerSwapped, upper) };
+	}
+
+	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
+	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
+	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
+	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
+	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
+	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
+
+	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
+	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }
+};
+
+struct Vec4F32 {  // Four 32-bit float lanes (NEON backend).
+	float32x4_t v;
+
+	static Vec4F32 Zero() { return Vec4F32{ vdupq_n_f32(0.0f) }; }
+	static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; }
+
+	static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
+	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
+	void Store(float *dst) { vst1q_f32(dst, v); }
+	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
+
+	static Vec4F32 LoadConvertS16(const int16_t *src) {  // Note: will load 8 bytes
+		int16x4_t value = vld1_s16(src);
+		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
+	}
+
+	static Vec4F32 LoadConvertS8(const int8_t *src) {  // Note: will load 8 bytes, but only the first 4 are converted
+		int8x8_t value = vld1_s8(src);
+		int16x4_t value16 = vget_low_s16(vmovl_s8(value));
+		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value16)) };  // Widen the 16-bit intermediate, not the raw bytes.
+	}
+
+	static Vec4F32 FromVec4S32(Vec4S32 other) {
+		return Vec4F32{ vcvtq_f32_s32(other.v) };
+	}
+
+	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
+	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
+	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
+	void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); }
+	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
+	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
+	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }  // Division as multiply by the refined reciprocal estimate.
+	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
+
+	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }
+	Vec4F32 Recip() {
+		float32x4_t recip = vrecpeq_f32(v);
+		// Use a couple Newton-Raphson steps to refine the estimate.
+		// May be able to get away with only one refinement, not sure!
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
+		return Vec4F32{ recip };
+	}
+
+	Vec4F32 Clamp(float lower, float higher) {
+		return Vec4F32{
+			vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
+		};
+	}
+
+	// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
+	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
+#if PPSSPP_ARCH(ARM64_NEON)
+		// Only works on ARM64
+		float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
+		float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
+		float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
+		float32x4_t temp3 = vzip2q_f32(col1.v, col3.v);
+		col0.v = vzip1q_f32(temp0, temp2);
+		col1.v = vzip2q_f32(temp0, temp2);
+		col2.v = vzip1q_f32(temp1, temp3);
+		col3.v = vzip2q_f32(temp1, temp3);
+#else
+		float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v);
+		float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v);
+		col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0]));
+		col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1]));
+		col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0]));
+		col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1]));
+#endif
+	}
+
+	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {  // Computes col0*x + col1*y + col2*z + col3; lane 3 of this vector is ignored.
+#if PPSSPP_ARCH(ARM64_NEON)
+		float32x4_t sum = vaddq_f32(
+			vaddq_f32(vmulq_laneq_f32(m.col0, v, 0), vmulq_laneq_f32(m.col1, v, 1)),
+			vaddq_f32(vmulq_laneq_f32(m.col2, v, 2), m.col3));
+#else
+		float32x4_t sum = vaddq_f32(
+			vaddq_f32(vmulq_lane_f32(m.col0, vget_low_f32(v), 0), vmulq_lane_f32(m.col1, vget_low_f32(v), 1)),
+			vaddq_f32(vmulq_lane_f32(m.col2, vget_high_f32(v), 0), m.col3));
+#endif
+		return Vec4F32{ sum };
+	}
+};
+
+inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }  // Truncates toward zero.
+inline Vec4F32 Vec4F32FromS32(Vec4S32 s) { return Vec4F32{ vcvtq_f32_s32(s.v) }; }  // Per-lane int to float conversion.
+
+inline bool AnyZeroSignBit(Vec4S32 value) {  // True if at least one of the four lanes has its sign bit clear.
+	// Very suboptimal, let's optimize later.
+	int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v));  // AND of lanes (0,2) and (1,3).
+	int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);  // AND of all four lanes.
+	return (mask & 0x80000000) == 0;  // The ANDed sign bit is clear iff some lane had it clear.
+}
+
+struct Vec4U16 {  // Four unsigned 16-bit lanes (NEON backend).
+	uint16x4_t v;  // 64 bits.
+
+	static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; }
+	static Vec4U16 Splat(uint16_t value) { return Vec4U16{ vdup_n_u16(value) }; }
+
+	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
+	void Store(uint16_t *mem) { vst1_u16(mem, v); }
+
+	static Vec4U16 FromVec4S32(Vec4S32 v) {  // Keeps the low 16 bits of each 32-bit lane (no saturation).
+		return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(v.v)) };
+	}
+	static Vec4U16 FromVec4F32(Vec4F32 v) {  // Truncating float to int conversion, then keeps the low 16 bits.
+		return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(v.v))) };
+	}
+
+	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; }
+	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ vand_u16(v, other.v) }; }
+	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ veor_u16(v, other.v) }; }
+
+	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
+	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
+	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }  // 0xFFFF per lane where this < other (unsigned).
+};
+
+inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {  // inline: defined in a header. Returns 0xFFFF per lane where v is negative, else 0.
+	int32x4_t sign_mask = vshrq_n_s32(v.v, 31);  // 0 or -1 per 32-bit lane.
+	uint16x4_t result = vreinterpret_u16_s16(vmovn_s32(sign_mask));
+	return Vec4U16{ result };
+}
+
+inline Vec4U16 AndNot(Vec4U16 a, Vec4U16 inverted) {  // inline: defined in a header. Computes a & ~inverted.
+	return Vec4U16{ vand_u16(a.v, vmvn_u16(inverted.v)) };
+}
+
+struct Vec8U16 {  // Eight unsigned 16-bit lanes (NEON backend); currently only splat/load/store.
+	uint16x8_t v;
+
+	static Vec8U16 Zero() { return Vec8U16{ vdupq_n_u16(0) }; }
+	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ vdupq_n_u16(value) }; }
+
+	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ vld1q_u16(mem) }; }  // Loads 8 values (16 bytes).
+	void Store(uint16_t *mem) { vst1q_u16(mem, v); }  // Stores 8 values (16 bytes).
+};
+
+#else
+
+struct Vec4S32 {  // Scalar fallback when neither SSE2 nor NEON is available; only + and - are provided here.
+	s32 v[4];
+
+	Vec4S32 operator +(Vec4S32 other) const {  // Lane-wise addition.
+		return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } };
+	}
+	Vec4S32 operator -(Vec4S32 other) const {  // Lane-wise subtraction.
+		return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } };
+	}
+};
+
+#endif
diff --git a/Common/Math/SIMDHeaders.h b/Common/Math/SIMDHeaders.h
index 8e812a7819e6..9705dca55434 100644
--- a/Common/Math/SIMDHeaders.h
+++ b/Common/Math/SIMDHeaders.h
@@ -118,6 +118,24 @@ inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
 	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
 }
 
+#ifdef __cplusplus
+
+alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };  // 0x8000 bias per 32-bit lane, subtracted before packing.
+alignas(16) static const uint32_t g_sign16[4] = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 };  // 0x8000 bias per 16-bit lane, added back after packing.
+
+// Alternate solution to the above, not sure if faster or slower.
+// SSE2 replacement for half of _mm_packus_epi32 but without the saturation.
+// Not ideal! pshufb would make this faster but that's SSSE3.
+inline __m128i _mm_packu1_epi32_SSE2(const __m128i v0) {
+	// Toggle the sign bit, pack, then toggle back.
+	__m128i toggled = _mm_sub_epi32(v0, _mm_load_si128((const __m128i *)g_sign32));  // Moves 0..65535 into the signed 16-bit range.
+	__m128i temp = _mm_packs_epi32(toggled, toggled);  // Same input twice, so the result repeats in both 64-bit halves.
+	__m128i restored = _mm_add_epi16(temp, _mm_load_si128((const __m128i *)g_sign16));
+	return restored;
+}
+
+#endif
+
 // SSE2 replacement for the entire _mm_packus_epi32 but without the saturation.
 // Not ideal! pshufb would make this faster but that's SSSE3.
 inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
@@ -128,4 +146,26 @@ inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
 	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
 }
 
+// The below are not real SSE instructions in any generation, but should exist.
+
+// Return 0xFFFF where x <= y, else 0x0000. Lanes are compared as unsigned 16-bit values.
+inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
+	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());  // The saturating x - y is zero exactly when x <= y.
+}
+
+// Return 0xFFFF where x >= y, else 0x0000. Lanes are compared as unsigned 16-bit values.
+inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
+	return _mm_cmple_epu16(y, x);  // x >= y is the same as y <= x.
+}
+
+// Return 0xFFFF where x > y, else 0x0000. Lanes are compared as unsigned 16-bit values.
+inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
+	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));  // (y <= x) and not (x == y).
+}
+
+// Return 0xFFFF where x < y, else 0x0000. Lanes are compared as unsigned 16-bit values.
+inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
+	return _mm_cmpgt_epu16(y, x);  // x < y is the same as y > x.
+}
+
 #endif
diff --git a/Common/Math/fast/fast_matrix.c b/Common/Math/fast/fast_matrix.c
index 0402f366297e..d23ce3b0e0b2 100644
--- a/Common/Math/fast/fast_matrix.c
+++ b/Common/Math/fast/fast_matrix.c
@@ -6,8 +6,6 @@
 
 #if PPSSPP_ARCH(SSE2)
 
-#include "fast_matrix.h"
-
 void fast_matrix_mul_4x4_sse(float *dest, const float *a, const float *b) {
 	int i;
 	__m128 a_col_1 = _mm_loadu_ps(a);
diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp
index 379042198011..cd2c50c48835 100644
--- a/Core/Compatibility.cpp
+++ b/Core/Compatibility.cpp
@@ -149,6 +149,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "DisableMemcpySlicing", &flags_.DisableMemcpySlicing);
 	CheckSetting(iniFile, gameID, "ForceEnableGPUReadback", &flags_.ForceEnableGPUReadback);
 	CheckSetting(iniFile, gameID, "UseFFMPEGFindStreamInfo", &flags_.UseFFMPEGFindStreamInfo);
+	CheckSetting(iniFile, gameID, "SoftwareRasterDepth", &flags_.SoftwareRasterDepth);
 }
 
 void Compatibility::CheckVRSettings(IniFile &iniFile, const std::string &gameID) {
diff --git a/Core/Compatibility.h b/Core/Compatibility.h
index 8a0e33af4d34..4688df37c055 100644
--- a/Core/Compatibility.h
+++ b/Core/Compatibility.h
@@ -112,6 +112,7 @@ struct CompatFlags {
 	bool DisableMemcpySlicing;
 	bool ForceEnableGPUReadback;
 	bool UseFFMPEGFindStreamInfo;
+	bool SoftwareRasterDepth;
 };
 
 struct VRCompat {
diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp
new file mode 100644
index 000000000000..ca8f81cedfb3
--- /dev/null
+++ b/GPU/Common/DepthRaster.cpp
@@ -0,0 +1,372 @@
+#include <algorithm>
+#include <cstring>
+#include <cstdint>
+
+#include "Common/Math/CrossSIMD.h"
+#include "GPU/Common/DepthRaster.h"
+#include "GPU/Math3D.h"
+#include "Common/Math/math_util.h"
+#include "GPU/Common/VertexDecoderCommon.h"
+
+// Depth comparison modes handled by the depth rasterizer in this file. We only need to support these three modes.
+enum class ZCompareMode {
+	Greater,  // Most common
+	Less,  // Less common
+	Always,  // Fairly common
+};
+
+void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, ZCompareMode compareMode) {
+	// Fills the rect (x1,y1)-(x2,y2), end-exclusive, in dest (stride in pixels) with depthValue. Only Always is implemented so far.
+	// Coordinates are swapped if needed since we don't back-face-cull rects. We also ignore the UV rotation here.
+	if (x1 > x2) {
+		std::swap(x1, x2);
+	}
+	if (y1 > y2) {
+		std::swap(y1, y2);
+	}
+	if (x1 == x2 || y1 == y2) {
+		return;
+	}
+
+	Vec8U16 valueX8 = Vec8U16::Splat(depthValue);
+	for (int y = y1; y < y2; y++) {
+		uint16_t *ptr = (uint16_t *)(dest + stride * y + x1);
+		int w = x2 - x1;
+		switch (compareMode) {
+		case ZCompareMode::Always:
+			if (depthValue == 0) {
+				memset(ptr, 0, w * 2);
+				break;
+			}
+			for (; w >= 8; ptr += 8, w -= 8) {
+				valueX8.Store(ptr);
+			}
+			for (int i = 0; i < w; i++) {  // Scalar trailer: the last (w % 8) pixels of the row.
+				ptr[i] = (uint16_t)depthValue;
+			}
+			break;
+		default:
+			// TODO
+			break;
+		}
+	}
+}
+
+alignas(16) static const int zero123[4]  = {0, 1, 2, 3};
+
+struct Edge {  // One triangle edge function, evaluated for a 4x1 group of pixels at a time.
+	// Dimensions of our pixel group
+	static const int stepXSize = 4;
+	static const int stepYSize = 1;
+
+	Vec4S32 oneStepX;  // Added to the edge values to move one pixel group to the right.
+	Vec4S32 oneStepY;  // Added to the edge values to move one row down.
+
+	Vec4S32 init(int v0x, int v0y, int v1x, int v1y, int p0x, int p0y) {  // Returns the edge values at pixels (p0x..p0x+3, p0y).
+		// Edge setup
+		int A = v0y - v1y;
+		int B = v1x - v0x;
+		int C = v0x * v1y - v0y * v1x;
+
+		// Step deltas
+		oneStepX = Vec4S32::Splat(A * stepXSize);
+		oneStepY = Vec4S32::Splat(B * stepYSize);
+
+		// x/y values for initial pixel block. Add horizontal offsets.
+		Vec4S32 x = Vec4S32::Splat(p0x) + Vec4S32::LoadAligned(zero123);
+		Vec4S32 y = Vec4S32::Splat(p0y);
+
+		// Edge function values at origin
+		return Vec4S32::Splat(A) * x + Vec4S32::Splat(B) * y + Vec4S32::Splat(C);
+	}
+};
+
+// Adapted from Intel's depth rasterizer example.
+// Started with the scalar version, will SIMD-ify later.
+// x1/y1/x2/y2 are the scissor rect. tx/ty/tz each hold three values: the vertices' x, y and depth.
+void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, ZCompareMode compareMode) {
+	int tileStartX = x1;
+	int tileEndX = x2;
+
+	int tileStartY = y1;
+	int tileEndY = y2;
+
+	// BEGIN triangle setup. This should be done SIMD, four triangles at a time.
+	// Due to the many multiplications, we might want to do it in floating point as 32-bit integer muls
+	// are slow on SSE2.
+
+	// Convert to whole pixels for now. Later subpixel precision.
+	int v0x = tx[0];
+	int v0y = ty[0];
+	int v0z = tz[0];
+	int v1x = tx[1];
+	int v1y = ty[1];
+	int v1z = tz[1];
+	int v2x = tx[2];
+	int v2y = ty[2];
+	int v2z = tz[2];
+
+	// use fixed-point only for X and Y.  Avoid work for Z and W.
+	// We use 4x1 tiles for simplicity, so the X bounds are rounded down to multiples of 4.
+	int minX = std::max(std::min(std::min(v0x, v1x), v2x), tileStartX) & ~3;
+	int maxX = std::min(std::max(std::max(v0x, v1x), v2x) + 3, tileEndX) & ~3;
+	int minY = std::max(std::min(std::min(v0y, v1y), v2y), tileStartY);
+	int maxY = std::min(std::max(std::max(v0y, v1y), v2y), tileEndY);
+	if (maxX == minX || maxY == minY) {
+		// No pixels, or outside screen.
+		return;
+	}
+
+	// TODO: Cull really small triangles here.
+	int triArea = (v1y - v2y) * v0x + (v2x - v1x) * v0y + (v1x * v2y - v2x * v1y);
+	if (triArea <= 0) {  // Zero area or opposite winding: nothing to draw.
+		return;
+	}
+
+	float oneOverTriArea = 1.0f / (float)triArea;
+
+	Edge e01, e12, e20;
+
+	Vec4S32 w0_row = e12.init(v1x, v1y, v2x, v2y, minX, minY);
+	Vec4S32 w1_row = e20.init(v2x, v2y, v0x, v0y, minX, minY);
+	Vec4S32 w2_row = e01.init(v0x, v0y, v1x, v1y, minX, minY);
+
+	// Prepare to interpolate Z. The two deltas are pre-divided by the area, so the raw edge values can be used as weights.
+	Vec4F32 zz0 = Vec4F32::Splat((float)v0z);
+	Vec4F32 zz1 = Vec4F32::Splat((float)(v1z - v0z) * oneOverTriArea);
+	Vec4F32 zz2 = Vec4F32::Splat((float)(v2z - v0z) * oneOverTriArea);
+
+	// Rasterize. Both loops include their upper bound (maxY / maxX).
+	for (int y = minY; y <= maxY; y += Edge::stepYSize, w0_row += e12.oneStepY, w1_row += e20.oneStepY, w2_row += e01.oneStepY) {
+		// Barycentric coordinates at start of row
+		Vec4S32 w0 = w0_row;
+		Vec4S32 w1 = w1_row;
+		Vec4S32 w2 = w2_row;
+
+		uint16_t *rowPtr = depthBuf + stride * y;
+
+		for (int x = minX; x <= maxX; x += Edge::stepXSize, w0 += e12.oneStepX, w1 += e20.oneStepX, w2 += e01.oneStepX) {
+			// If p is on or inside all edges for any pixels,
+			// render those pixels.
+			Vec4S32 signCalc = w0 | w1 | w2;  // A lane's sign bit is set if any of its three edge values is negative.
+			if (!AnyZeroSignBit(signCalc)) {
+				continue;
+			}
+
+			Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
+			Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
+			// Now, the mask has 1111111 where we should preserve the contents of the depth buffer.
+
+			// Compute the Z value for all four pixels.
+			// float depth = zz[0] + beta * zz[1] + gamma * zz[2];
+			Vec4U16 shortZ = Vec4U16::FromVec4F32(zz0 + Vec4F32FromS32(w1) * zz1 + Vec4F32FromS32(w2) * zz2);
+
+			// TODO: Lift this switch out of the inner loop, or even out of the function with templating.
+			switch (compareMode) {
+			case ZCompareMode::Greater:
+				// To implement the greater/greater-or-equal comparison, we can combine mask and max.
+				// It might be better to do the math in float space on x86 due to SSE2 deficiencies.
+				// We use AndNot to zero out Z results, before doing Max with the buffer.
+				AndNot(shortZ, shortMaskInv).Max(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Less:  // UNTESTED
+				// This time, we OR the mask (forcing masked-out lanes to 0xFFFF) and use .Min.
+				(shortZ | shortMaskInv).Min(bufferValues).Store(rowPtr + x);
+				break;
+			case ZCompareMode::Always:  // UNTESTED
+				// This could be replaced with a vblend operation.
+				((bufferValues & shortMaskInv) | AndNot(shortZ, shortMaskInv)).Store(rowPtr + x);
+				break;
+			}
+		}
+	}
+}
+
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID) {
+	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
+	_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);
+
+	int vertexStride = dec->VertexSize();
+	int offset = dec->posoff;  // Byte offset of the position within each vertex.
+
+	Mat4F32 mat(worldviewproj);
+
+	const u8 *startPtr = (const u8 *)vertexData + indexLowerBound * vertexStride;
+	int count = indexUpperBound - indexLowerBound + 1;  // Both index bounds are inclusive.
+
+	switch (vertTypeID & GE_VTYPE_POS_MASK) {  // Each case writes four floats per vertex to dest; prim is not used here.
+	case GE_VTYPE_POS_FLOAT:
+		for (int i = 0; i < count; i++) {
+			const float *data = (const float *)(startPtr + i * vertexStride + offset);
+			Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);  // Loads four floats; only x/y/z feed the transform.
+		}
+		break;
+	case GE_VTYPE_POS_16BIT:
+		for (int i = 0; i < count; i++) {
+			const s16 *data = ((const s16 *)((const s8 *)startPtr + i * vertexStride + offset));
+			Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4);  // Scale s16 to [-1, 1).
+		}
+		break;
+	case GE_VTYPE_POS_8BIT:
+		for (int i = 0; i < count; i++) {
+			const s8 *data = (const s8 *)startPtr + i * vertexStride + offset;
+			Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4);  // Scale s8 to [-1, 1).
+		}
+		break;
+	}
+}
+
+// Converts indexed triangles from `transformed` (4 floats per vertex: x, y, z, w) into integer screen-space tx/ty/tz. Returns the number of vertices written.
+int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count) {
+	bool cullEnabled = gstate.isCullEnabled();
+	GECullMode cullMode = gstate.getCullMode();
+
+	// TODO: On ARM we can do better by keeping these in lanes instead of splatting.
+	// However, hard to find a common abstraction.
+	const Vec4F32 viewportX = Vec4F32::Splat(gstate.getViewportXCenter());
+	const Vec4F32 viewportY = Vec4F32::Splat(gstate.getViewportYCenter());
+	const Vec4F32 viewportZ = Vec4F32::Splat(gstate.getViewportZCenter());
+	const Vec4F32 viewportScaleX = Vec4F32::Splat(gstate.getViewportXScale());
+	const Vec4F32 viewportScaleY = Vec4F32::Splat(gstate.getViewportYScale());
+	const Vec4F32 viewportScaleZ = Vec4F32::Splat(gstate.getViewportZScale());
+
+	const Vec4F32 offsetX = Vec4F32::Splat(gstate.getOffsetX());  // We remove the 16 scale here
+	const Vec4F32 offsetY = Vec4F32::Splat(gstate.getOffsetY());
+
+	int outCount = 0;
+
+	// flipCull == 3 turns the (1 ^ flipCull) / (2 ^ flipCull) offsets below into 2 / 1, swapping the last two vertices.
+	int flipCull = 0;
+	if (cullEnabled && cullMode == GE_CULL_CW) {
+		flipCull = 3;
+	}
+	for (int i = 0; i + 2 < count; i += 3) {  // i + 2 < count: never index past the end on a trailing partial triangle.
+		const float *verts[3] = {
+			transformed + indexBuffer[i] * 4,
+			transformed + indexBuffer[i + (1 ^ flipCull)] * 4,
+			transformed + indexBuffer[i + (2 ^ flipCull)] * 4,
+		};
+
+		// Check if any vertex is behind the 0 plane.
+		if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) {
+			// Ditch this triangle. Later we should clip here.
+			continue;
+		}
+
+		// These names are wrong .. until we transpose.
+		Vec4F32 x = Vec4F32::Load(verts[0]);
+		Vec4F32 y = Vec4F32::Load(verts[1]);
+		Vec4F32 z = Vec4F32::Load(verts[2]);
+		Vec4F32 w = Vec4F32::Zero();
+		Vec4F32::Transpose(x, y, z, w);
+		// Now the names are accurate! Since we only have three vertices, the fourth member of each vector is zero
+		// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
+		Vec4F32 recipW = w.Recip();
+
+		x *= recipW;
+		y *= recipW;
+		z *= recipW;
+
+		Vec4S32 screen[3];
+		screen[0] = Vec4S32FromF32((x * viewportScaleX + viewportX) - offsetX);
+		screen[1] = Vec4S32FromF32((y * viewportScaleY + viewportY) - offsetY);
+		screen[2] = Vec4S32FromF32((z * viewportScaleZ + viewportZ).Clamp(0.0f, 65535.0f));
+
+		screen[0].Store(tx + outCount);
+		screen[1].Store(ty + outCount);
+		screen[2].Store(tz + outCount);
+		outCount += 3;
+
+		if (!cullEnabled) {
+			// If culling is off, shuffle the three vectors to produce the opposite triangle, and store them after.
+
+			// HOWEVER! I realized that this is not the optimal layout, after all.
+			// We should group 4 triangles at a time and interleave them (so we first have all X of vertex 0,
+			// then all X of vertex 1, and so on). This seems solvable with another transpose, if we can easily
+			// collect four triangles at a time...
+
+			screen[0].SwapLowerElements().Store(tx + outCount);
+			screen[1].SwapLowerElements().Store(ty + outCount);
+			screen[2].SwapLowerElements().Store(tz + outCount);
+			outCount += 3;
+		}
+	}
+	return outCount;
+}
+
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count) {
+	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
+	// Copies pos[0..2] of `count` vertices into tx/ty/tz as integers. TODO: This is basically a transpose, or AoS->SoA conversion. There may be fast ways.
+	for (int i = 0; i < count; i++) {
+		const float z = transformed[i].pos[2];  // Clamped below: converting an out-of-range float straight to u16 is undefined behavior.
+		tx[i] = (int)transformed[i].pos[0];
+		ty[i] = (int)transformed[i].pos[1];
+		tz[i] = z >= 65535.0f ? 65535 : (z > 0.0f ? (int)(u16)z : 0);  // NaN and negative depths end up as 0.
+	}
+}
+
+// Rasterizes screen-space vertices (tx/ty/tz hold `count` entries) into the 16-bit `depth` buffer, according to the current GE depth state.
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count) {
+	// Prim should now be either TRIANGLES or RECTs.
+	_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);
+
+	// Nothing to write to if the caller could not resolve the depth buffer. TODO: Also ignore draws where stencil operations are active?
+	if (!depth) {
+		return;
+	}
+
+	ZCompareMode comp;
+	if (gstate.isModeClear()) {
+		if (!gstate.isClearModeDepthMask()) {
+			return;
+		}
+		// Clear mode always writes, so resolve it before looking at the depth function:
+		// a leftover EQUAL/NEVER/NOTEQUAL setting must not make us drop a depth clear.
+		comp = ZCompareMode::Always;
+	} else {
+		if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
+			return;
+
+		// Ignore some useless compare modes.
+		switch (gstate.getDepthTestFunction()) {
+		case GE_COMP_ALWAYS:
+			comp = ZCompareMode::Always;
+			break;
+		case GE_COMP_LEQUAL:
+		case GE_COMP_LESS:
+			comp = ZCompareMode::Less;
+			break;
+		case GE_COMP_GEQUAL:
+		case GE_COMP_GREATER:
+			comp = ZCompareMode::Greater;  // Most common
+			break;
+		case GE_COMP_NEVER:
+		case GE_COMP_EQUAL:
+			// These will never have a useful effect in Z-only raster.
+			[[fallthrough]];
+		case GE_COMP_NOTEQUAL:
+			// This is highly unusual, let's just ignore it.
+			[[fallthrough]];
+		default:
+			return;
+		}
+	}
+
+	switch (prim) {
+	case GE_PRIM_RECTANGLES:
+		for (int i = 0; i < count; i += 2) {
+			uint16_t z = tz[i + 1];  // depth from second vertex
+			// TODO: Should clip coordinates to the scissor rectangle.
+			// We remove the subpixel information here.
+			DepthRasterRect(depth, depthStride, tx[i], ty[i], tx[i + 1], ty[i + 1], z, comp);
+		}
+		break;
+	case GE_PRIM_TRIANGLES:
+		for (int i = 0; i < count; i += 3) {
+			DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, &tx[i], &ty[i], &tz[i], comp);
+		}
+		break;
+	default:
+		_dbg_assert_(false);
+	}
+}
diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h
new file mode 100644
index 000000000000..e92c1a1348ed
--- /dev/null
+++ b/GPU/Common/DepthRaster.h
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "Common/CommonTypes.h"
+#include "GPU/ge_constants.h"
+
+struct DepthScreenVertex {  // Integer x/y/z triple. NOTE(review): the functions declared in this header take separate tx/ty/tz arrays instead - confirm this is still needed.
+	int x;
+	int y;
+	int z;
+};
+
+// Specialized, very limited depth-only rasterizer.
+// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
+// for effects like lens flare.
+// So, we can be quite inaccurate without any issues, and skip a lot of functionality.
+
+class VertexDecoder;
+struct TransformedVertex;
+
+int DepthRasterClipIndexedTriangles(int *tx, int *ty, int *tz, const float *transformed, const uint16_t *indexBuffer, int count);
+void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, VertexDecoder *dec, u32 vertTypeID);
+void DepthRasterConvertTransformed(int *tx, int *ty, int *tz, GEPrimitiveType prim, const TransformedVertex *transformed, int count);
+void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const int *tx, const int *ty, const int *tz, int count);
diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp
index 818021a79b3a..bbe36fb479e7 100644
--- a/GPU/Common/DrawEngineCommon.cpp
+++ b/GPU/Common/DrawEngineCommon.cpp
@@ -23,9 +23,11 @@
 #include "Common/LogReporting.h"
 #include "Common/Math/SIMDHeaders.h"
 #include "Common/Math/lin/matrix4x4.h"
+#include "Core/System.h"
 #include "Core/Config.h"
 #include "GPU/Common/DrawEngineCommon.h"
 #include "GPU/Common/SplineCommon.h"
+#include "GPU/Common/DepthRaster.h"
 #include "GPU/Common/VertexDecoderCommon.h"
 #include "GPU/Common/SoftwareTransformCommon.h"
 #include "GPU/ge_constants.h"
@@ -34,7 +36,11 @@
 #define QUAD_INDICES_MAX 65536
 
 enum {
-	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex)
+	TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex),
+	DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4 * sizeof(float),  // Bytes: four floats (x, y, z, w) are stored per vertex.
+	DEPTH_SCREENVERTS_COMPONENT_COUNT = VERTEX_BUFFER_MAX,  // Number of ints between the starts of the tx, ty and tz arrays.
+	DEPTH_SCREENVERTS_COMPONENT_SIZE = DEPTH_SCREENVERTS_COMPONENT_COUNT * sizeof(int) + 384,  // Bytes per component, plus 384 bytes of slack.
+	DEPTH_SCREENVERTS_SIZE = DEPTH_SCREENVERTS_COMPONENT_SIZE * 3,  // Bytes allocated for all three components.
 };
 
 DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
@@ -46,6 +52,12 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) {
 	decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
 	indexGen.Setup(decIndex_);
+
+	useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth;
+	if (useDepthRaster_) {
+		depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+		depthScreenVerts_ = (int *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE);
+	}
 }
 
 DrawEngineCommon::~DrawEngineCommon() {
@@ -53,6 +65,10 @@ DrawEngineCommon::~DrawEngineCommon() {
 	FreeMemoryPages(decIndex_, DECODED_INDEX_BUFFER_SIZE);
 	FreeMemoryPages(transformed_, TRANSFORMED_VERTEX_BUFFER_SIZE);
 	FreeMemoryPages(transformedExpanded_, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE);
+	if (depthTransformed_) {
+		FreeMemoryPages(depthTransformed_, DEPTH_TRANSFORMED_SIZE);
+		FreeMemoryPages(depthScreenVerts_, DEPTH_SCREENVERTS_SIZE);
+	}
 	delete decJitCache_;
 	decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) {
 		delete decoder;
@@ -886,3 +902,99 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const {
 		return false;
 	}
 }
+
+// Software depth raster for untransformed draws: transforms the queued vertices by world * view * proj, then rasterizes the indexed triangles into the depth buffer.
+void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount) {
+	switch (prim) {
+	case GE_PRIM_INVALID:
+	case GE_PRIM_KEEP_PREVIOUS:
+	case GE_PRIM_LINES:
+	case GE_PRIM_LINE_STRIP:
+	case GE_PRIM_POINTS:
+		return;
+	default:
+		break;
+	}
+
+	if (vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) {  // Vertex formats with weights or morph targets are skipped entirely.
+		return;
+	}
+
+	float world[16];
+	float view[16];
+	float worldview[16];
+	float worldviewproj[16];
+	ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
+	ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
+	Matrix4ByMatrix4(worldview, world, view);
+	Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix);   // TODO: Include adjustments to the proj matrix?
+
+	// Decode. numDec counts the vertices written to depthTransformed_ so far (4 floats each).
+	int numDec = 0;
+	for (int i = 0; i < numDrawVerts_; i++) {
+		DeferredVerts &dv = drawVerts_[i];
+		int indexLowerBound = dv.indexLowerBound;
+		drawVertexOffsets_[i] = numDec - indexLowerBound;  // Note: written even if we break out on the limit check below.
+
+		int indexUpperBound = dv.indexUpperBound;
+		if (indexUpperBound + 1 - indexLowerBound + numDec >= VERTEX_BUFFER_MAX) {
+			// Hit our limit! Stop decoding in this draw.
+			break;
+		}
+
+		// Read the positions and transform them by worldviewproj. No morphing/skinning happens here, those formats were rejected above.
+		DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, dv.verts, indexLowerBound, indexUpperBound, dec, vertTypeID);
+		numDec += indexUpperBound - indexLowerBound + 1;
+	}
+
+	int *tx = depthScreenVerts_;
+	int *ty = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+	int *tz = depthScreenVerts_ + DEPTH_SCREENVERTS_COMPONENT_COUNT * 2;
+
+	// Clip and triangulate using the index buffer.
+	int outVertCount = DepthRasterClipIndexedTriangles(tx, ty, tz, depthTransformed_, decIndex_, vertexCount);
+	if (outVertCount & 15) {
+		// Zero padding, up to the next multiple of 16 entries.
+		for (int i = outVertCount; i < ((outVertCount + 16) & ~15); i++) {
+			tx[i] = 0;
+			ty[i] = 0;
+			tz[i] = 0;
+		}
+	}
+
+	DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(),
+		GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
+		tx, ty, tz, outVertCount);
+}
+
+void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) {
+	// Points, lines and the invalid/keep-previous primitive types are not depth-rasterized.
+	const bool unsupported = prim == GE_PRIM_INVALID || prim == GE_PRIM_KEEP_PREVIOUS ||
+		prim == GE_PRIM_LINES || prim == GE_PRIM_LINE_STRIP || prim == GE_PRIM_POINTS;
+	if (unsupported) {
+		return;
+	}
+
+	// Strips and fans must not reach this point.
+	_dbg_assert_(prim != GE_PRIM_TRIANGLE_STRIP && prim != GE_PRIM_TRIANGLE_FAN);
+
+	// The screen vertex buffer holds three component arrays back to back: X, then Y, then Z.
+	int *const xs = depthScreenVerts_;
+	int *const ys = xs + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+	int *const zs = ys + DEPTH_SCREENVERTS_COMPONENT_COUNT;
+
+	DepthRasterConvertTransformed(xs, ys, zs, prim, inVerts, count);
+
+	// Zero padding up to the next multiple of 16 entries (nothing to do when count is already a multiple).
+	const int paddedCount = (count + 15) & ~15;
+	for (int pad = count; pad < paddedCount; pad++) {
+		xs[pad] = 0;
+		ys[pad] = 0;
+		zs[pad] = 0;
+	}
+
+	uint16_t *depthBuf = (uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000);
+	DepthRasterScreenVerts(depthBuf, gstate.DepthBufStride(), prim,
+		gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(),
+		xs, ys, zs, count);
+}
diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h
index 0f8ab8a7515a..053c4c31f55b 100644
--- a/GPU/Common/DrawEngineCommon.h
+++ b/GPU/Common/DrawEngineCommon.h
@@ -27,6 +27,7 @@
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/Common/IndexGenerator.h"
 #include "GPU/Common/VertexDecoderCommon.h"
+#include "GPU/Common/DepthRaster.h"
 
 class VertexDecoder;
 
@@ -158,6 +159,11 @@ class DrawEngineCommon {
 		_dbg_assert_(numDrawVerts_ == 0 && numDrawInds_ == 0);
 	}
 
+	// Temporary hack: hands out the decoded vertex buffer from byte offset 12 * 65536 onwards. NOTE(review): assumes that tail is unused - confirm.
+	uint8_t *GetTempSpace() {
+		return &decoded_[12 * 65536];
+	}
+
 protected:
 	virtual bool UpdateUseHWTessellation(bool enabled) const { return enabled; }
 	void UpdatePlanes();
@@ -169,6 +175,9 @@ class DrawEngineCommon {
 
 	void ApplyFramebufferRead(FBOTexState *fboTexState);
 
+	void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID, int vertexCount);
+	void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count);
+
 	static inline int IndexSize(u32 vtype) {
 		const u32 indexType = (vtype & GE_VTYPE_IDX_MASK);
 		if (indexType == GE_VTYPE_IDX_16BIT) {
@@ -223,6 +232,11 @@ class DrawEngineCommon {
 	}
 
 	inline bool CollectedPureDraw() const {
+		// TODO: Do something faster.
+		if (useDepthRaster_) {
+			return false;
+		}
+
 		switch (seenPrims_) {
 		case 1 << GE_PRIM_TRIANGLE_STRIP:
 			return !anyCCWOrIndexed_ && numDrawInds_ == 1;
@@ -338,4 +352,10 @@ class DrawEngineCommon {
 	bool offsetOutsideEdge_;
 
 	GPUCommon *gpuCommon_;
+
+	// Software depth raster
+	bool useDepthRaster_ = false;
+
+	float *depthTransformed_ = nullptr;
+	int *depthScreenVerts_ = nullptr;
 };
diff --git a/GPU/Common/IndexGenerator.h b/GPU/Common/IndexGenerator.h
index 723f4caabd7c..48df11e97291 100644
--- a/GPU/Common/IndexGenerator.h
+++ b/GPU/Common/IndexGenerator.h
@@ -54,7 +54,7 @@ class IndexGenerator {
 	void TranslatePrim(int prim, int numInds, const u32_le *inds, int indexOffset, bool clockwise);
 
 	// This is really the number of generated indices, or 3x the number of triangles.
-	int VertexCount() const { return inds_ - indsBase_; }
+	int VertexCount() const { return (int)(inds_ - indsBase_); }
 
 private:
 	// Points (why index these? code simplicity)
diff --git a/GPU/Common/VertexDecoderCommon.h b/GPU/Common/VertexDecoderCommon.h
index 2793ad80eed5..4b905f03c956 100644
--- a/GPU/Common/VertexDecoderCommon.h
+++ b/GPU/Common/VertexDecoderCommon.h
@@ -122,7 +122,7 @@ class IndexConverter {
 // Reads decoded vertex formats in a convenient way. For software transform and debugging.
 class VertexReader {
 public:
-	VertexReader(u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
+	VertexReader(const u8 *base, const DecVtxFormat &decFmt, int vtype) : base_(base), data_(base), decFmt_(decFmt), vtype_(vtype) {}
 
 	void ReadPos(float pos[3]) const {
 		// Only DEC_FLOAT_3 is supported.
@@ -297,8 +297,8 @@ class VertexReader {
 	}
 
 private:
-	u8 *base_;
-	u8 *data_;
+	const u8 *base_;
+	const u8 *data_;
 	DecVtxFormat decFmt_;
 	int vtype_;
 };
diff --git a/GPU/GPU.vcxproj b/GPU/GPU.vcxproj
index 5cb3ea9e6238..c27d08354936 100644
--- a/GPU/GPU.vcxproj
+++ b/GPU/GPU.vcxproj
@@ -346,6 +346,7 @@
   </ItemDefinitionGroup>
   <ItemGroup>
     <ClInclude Include="..\ext\xbrz\xbrz.h" />
+    <ClInclude Include="Common\DepthRaster.h" />
     <ClInclude Include="Common\ReplacedTexture.h" />
     <ClInclude Include="Common\TextureReplacer.h" />
     <ClInclude Include="Common\TextureShaderCommon.h" />
@@ -468,6 +469,7 @@
   <ItemGroup>
     <ClCompile Include="..\ext\xbrz\xbrz.cpp" />
     <ClCompile Include="Common\DepthBufferCommon.cpp" />
+    <ClCompile Include="Common\DepthRaster.cpp" />
     <ClCompile Include="Common\ReplacedTexture.cpp" />
     <ClCompile Include="Common\TextureReplacer.cpp" />
     <ClCompile Include="Common\TextureShaderCommon.cpp" />
diff --git a/GPU/GPU.vcxproj.filters b/GPU/GPU.vcxproj.filters
index 610ba94cbe33..1529b974c13f 100644
--- a/GPU/GPU.vcxproj.filters
+++ b/GPU/GPU.vcxproj.filters
@@ -279,6 +279,9 @@
     <ClInclude Include="Debugger\State.h">
       <Filter>Debugger</Filter>
     </ClInclude>
+    <ClInclude Include="Common\DepthRaster.h">
+      <Filter>Common</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="Math3D.cpp">
@@ -554,6 +557,9 @@
     <ClCompile Include="Debugger\State.cpp">
       <Filter>Debugger</Filter>
     </ClCompile>
+    <ClCompile Include="Common\DepthRaster.cpp">
+      <Filter>Common</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <FxCompile Include="..\assets\shaders\tex_4xbrz.csh">
diff --git a/GPU/GPUState.h b/GPU/GPUState.h
index b5318331750b..e2814d5a1ce7 100644
--- a/GPU/GPUState.h
+++ b/GPU/GPUState.h
@@ -227,7 +227,7 @@ struct GPUgstate {
 
 	// Cull
 	bool isCullEnabled() const { return cullfaceEnable & 1; }
-	int getCullMode()   const { return cullmode & 1; }
+	GECullMode getCullMode()   const { return (GECullMode)(cullmode & 1); }
 
 	// Color Mask
 	bool isClearModeColorMask() const { return (clearmode&0x100) != 0; }
diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp
index f1279b855a69..ebe3d022df2a 100644
--- a/GPU/Vulkan/DrawEngineVulkan.cpp
+++ b/GPU/Vulkan/DrawEngineVulkan.cpp
@@ -370,6 +370,9 @@ void DrawEngineVulkan::Flush() {
 		} else {
 			renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount);
 		}
+		if (useDepthRaster_) {
+			DepthRasterTransform(prim, dec_, dec_->VertexType(), vertexCount);
+		}
 	} else {
 		PROFILE_THIS_SCOPE("soft");
 		VertexDecoder *swDec = dec_;
@@ -438,6 +441,12 @@ void DrawEngineVulkan::Flush() {
 		swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale);
 
 		swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result);
+
+		// At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster.
+		if (useDepthRaster_) {
+			DepthRasterPretransformed(prim, transformed_, numDecodedVerts_);
+		}
+
 		// Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values.
 		// Games sometimes expect exact matches (see #12626, for example) for equal comparisons.
 		if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f)
diff --git a/GPU/ge_constants.h b/GPU/ge_constants.h
index 1d123e162093..f4a366f80250 100644
--- a/GPU/ge_constants.h
+++ b/GPU/ge_constants.h
@@ -623,6 +623,11 @@ enum GEPatchPrimType
 	GE_PATCHPRIM_UNKNOWN = 3,
 };
 
+enum GECullMode {  // Values correspond to the low bit of the cullmode register, which GPUgstate::getCullMode() casts to this type.
+	GE_CULL_CW = 0,
+	GE_CULL_CCW = 1,
+};
+
 inline GEPrimitiveType PatchPrimToPrim(GEPatchPrimType type) {
 	switch (type) {
 	case GE_PATCHPRIM_TRIANGLES: return GE_PRIM_TRIANGLES;
diff --git a/UI/ImDebugger/ImDebugger.cpp b/UI/ImDebugger/ImDebugger.cpp
index 5f14a8529c0d..9cb082cf68bf 100644
--- a/UI/ImDebugger/ImDebugger.cpp
+++ b/UI/ImDebugger/ImDebugger.cpp
@@ -1552,4 +1552,5 @@ void ImConfig::SyncConfig(IniFile *ini, bool save) {
 
 	sync.SetSection(ini->GetOrCreateSection("Settings"));
 	sync.Sync("displayLatched", &displayLatched, false);
+	sync.Sync("realtimePixelPreview", &realtimePixelPreview, false);
 }
diff --git a/UI/ImDebugger/ImDebugger.h b/UI/ImDebugger/ImDebugger.h
index f1363d6a0f5f..c0fa9a037d56 100644
--- a/UI/ImDebugger/ImDebugger.h
+++ b/UI/ImDebugger/ImDebugger.h
@@ -153,6 +153,7 @@ struct ImConfig {
 	int selectedMemCheck = -1;
 	uint64_t selectedTexAddr = 0;
 
+	bool realtimePixelPreview = false;
 	int breakCount = 0;
 
 	bool displayLatched = false;
@@ -170,12 +171,14 @@ enum class ImCmd {
 	SHOW_IN_CPU_DISASM,
 	SHOW_IN_GE_DISASM,
 	SHOW_IN_MEMORY_VIEWER,  // param is address, param2 is viewer index
+	SHOW_IN_PIXEL_VIEWER,  // param is address, param2 is stride, |0x80000000 if depth, param3 is w/h
 };
 
 struct ImCommand {
 	ImCmd cmd;
 	uint32_t param;
 	uint32_t param2;
+	uint32_t param3;
 };
 
 struct ImControl {
diff --git a/UI/ImDebugger/ImGe.cpp b/UI/ImDebugger/ImGe.cpp
index 415cd0a7e133..52783719a545 100644
--- a/UI/ImDebugger/ImGe.cpp
+++ b/UI/ImDebugger/ImGe.cpp
@@ -149,12 +149,44 @@ void ImGePixelViewerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInte
 		if (ImGui::Button("Refresh")) {
 			viewer_.Snapshot();
 		}
+		if (ImGui::Button("Show cur depth")) {
+			viewer_.addr = gstate.getDepthBufRawAddress() | 0x04000000;
+			viewer_.format = GE_FORMAT_DEPTH16;
+			viewer_.stride = gstate.DepthBufStride();
+			viewer_.width = viewer_.stride;
+			viewer_.Snapshot();
+		}
+		if (ImGui::Button("Show cur color")) {
+			viewer_.addr = gstate.getFrameBufAddress();
+			viewer_.format = gstate.FrameBufFormat();
+			viewer_.stride = gstate.FrameBufStride();
+			viewer_.width = viewer_.stride;
+			viewer_.Snapshot();
+		}
+		ImGui::Checkbox("Realtime", &cfg.realtimePixelPreview);
 	}
 	ImGui::EndChild();
 
+	if (cfg.realtimePixelPreview) {
+		viewer_.Snapshot();
+	}
+
 	ImGui::SameLine();
 	if (ImGui::BeginChild("right")) {
+		ImVec2 p0 = ImGui::GetCursorScreenPos();
 		viewer_.Draw(gpuDebug, draw);
+		if (ImGui::IsItemHovered()) {
+			int x = (int)(ImGui::GetMousePos().x - p0.x);
+			int y = (int)(ImGui::GetMousePos().y - p0.y);
+			char temp[128];
+			if (viewer_.FormatValueAt(temp, sizeof(temp), x, y)) {
+				ImGui::Text("(%d, %d): %s", x, y, temp);
+			} else {
+				ImGui::Text("%d, %d: N/A", x, y);
+			}
+		} else {
+			ImGui::TextUnformatted("(no pixel hovered)");
+		}
 	}
 	ImGui::EndChild();
 	ImGui::End();
@@ -211,6 +243,12 @@ bool ImGePixelViewer::FormatValueAt(char *buf, size_t bufSize, int x, int y) con
 		snprintf(buf, bufSize, "%08x (raw: %04x)", RGBA5551ToRGBA8888(raw), raw);
 		break;
 	}
+	case GE_FORMAT_DEPTH16:
+	{
+		u16 raw = Memory::Read_U16(pixelAddr);
+		snprintf(buf, bufSize, "%0.4f (raw: %04x / %d)", (float)raw / 65535.0f, raw, raw);
+		break;
+	}
 	default:
 		snprintf(buf, bufSize, "N/A");
 		return false;
@@ -356,6 +394,7 @@ bool ImGeReadbackViewer::Draw(GPUDebugInterface *gpuDebug, Draw::DrawContext *dr
 			readbackFmt_ = Draw::DataFormat::R8G8B8A8_UNORM;
 			break;
 		case Draw::Aspect::DEPTH_BIT:
+			// TODO: Add fallback
 			readbackFmt_ = Draw::DataFormat::D32F;
 			break;
 		case Draw::Aspect::STENCIL_BIT:
@@ -385,14 +424,15 @@ bool ImGeReadbackViewer::Draw(GPUDebugInterface *gpuDebug, Draw::DrawContext *dr
 				}
 			}
 
+			Draw::DataFormat fmt = rbBpp == 1 ? Draw::DataFormat::R8_UNORM : Draw::DataFormat::R32_FLOAT;
 			Draw::TextureDesc desc{ Draw::TextureType::LINEAR2D,
-				rbBpp == 1 ? Draw::DataFormat::R8_UNORM : Draw::DataFormat::R32_FLOAT,
+				fmt,
 				(int)w,
 				(int)h,
 				1,
 				1,
 				false,
-				rbBpp == 1 ? Draw::TextureSwizzle::R8_AS_ALPHA : Draw::TextureSwizzle::DEFAULT,
+				Draw::DataFormatNumChannels(fmt) == 1 ? Draw::TextureSwizzle::R8_AS_GRAYSCALE: Draw::TextureSwizzle::DEFAULT,
 				"PixelViewer temp",
 				{ texData },
 				nullptr,
@@ -432,7 +472,9 @@ bool ImGeReadbackViewer::FormatValueAt(char *buf, size_t bufSize, int x, int y)
 	case Draw::DataFormat::D32F:
 	{
 		const float *read = (const float *)(data_ + offset);
-		snprintf(buf, bufSize, "%0.4f", *read);
+		float value = *read;
+		int ivalue = *read * 65535.0f;
+		snprintf(buf, bufSize, "%0.4f (raw: %04x / %d)", *read, ivalue, ivalue);
 		return true;
 	}
 	case Draw::DataFormat::S8:
@@ -1060,6 +1102,7 @@ void ImGeDebuggerWindow::Draw(ImConfig &cfg, ImControl &control, GPUDebugInterfa
 					DrawPreviewPrimitive(drawList, p0, previewPrim_, previewIndices_, previewVertices_, previewCount_, true, texW, texH);
 
 					drawList->PopClipRect();
+
 				} else {
 					ImGui::Text("(no valid texture bound)");
 					// In software mode, we should just decode the texture here.
diff --git a/UI/ImDebugger/ImGe.h b/UI/ImDebugger/ImGe.h
index c642b620efb7..41dc6a8b1f94 100644
--- a/UI/ImDebugger/ImGe.h
+++ b/UI/ImDebugger/ImGe.h
@@ -71,14 +71,14 @@ struct ImGePixelViewer : public PixelLookup {
 	}
 	bool FormatValueAt(char *buf, size_t bufSize, int x, int y) const override;
 
-	uint32_t addr = 0x04000000;
+	uint32_t addr = 0x04110000;
 	uint16_t stride = 512;
 	uint16_t width = 480;
 	uint16_t height = 272;
-	GEBufferFormat format = GE_FORMAT_565;
+	GEBufferFormat format = GE_FORMAT_DEPTH16;
 	bool useAlpha = false;
 	bool showAlpha = false;
-	float scale = 1.0f;
+	float scale = 20.0f;
 
 private:
 	void UpdateTexture(Draw::DrawContext *draw);
@@ -124,8 +124,6 @@ class ImGePixelViewerWindow {
 	}
 
 private:
-	void UpdateTexture(Draw::DrawContext *draw);
-
 	ImGePixelViewer viewer_;
 };
 
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj b/UWP/GPU_UWP/GPU_UWP.vcxproj
index 7bb4b346bd8f..a7ba27a14097 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj
@@ -109,6 +109,7 @@
     </Link>
   </ItemDefinitionGroup>
   <ItemGroup>
+    <ClInclude Include="..\..\GPU\Common\DepthRaster.h" />
     <ClInclude Include="..\..\GPU\Common\ReplacedTexture.h" />
     <ClInclude Include="..\..\GPU\Common\TextureReplacer.h" />
     <ClInclude Include="..\..\GPU\Common\TextureShaderCommon.h" />
@@ -177,6 +178,7 @@
   </ItemGroup>
   <ItemGroup>
     <ClCompile Include="..\..\GPU\Common\DepthBufferCommon.cpp" />
+    <ClCompile Include="..\..\GPU\Common\DepthRaster.cpp" />
     <ClCompile Include="..\..\GPU\Common\ReplacedTexture.cpp" />
     <ClCompile Include="..\..\GPU\Common\TextureReplacer.cpp" />
     <ClCompile Include="..\..\GPU\Common\TextureShaderCommon.cpp" />
@@ -261,4 +263,4 @@
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
   <ImportGroup Label="ExtensionTargets">
   </ImportGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
index 84b4c5d39630..31d14b549feb 100644
--- a/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
+++ b/UWP/GPU_UWP/GPU_UWP.vcxproj.filters
@@ -80,6 +80,7 @@
     <ClCompile Include="..\..\GPU\Debugger\GECommandTable.cpp">
       <Filter>Debugger</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\GPU\Common\DepthRaster.cpp" />
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\GPU\Common\DepalettizeShaderCommon.h" />
@@ -163,10 +164,11 @@
     <ClInclude Include="..\..\GPU\Debugger\GECommandTable.h">
       <Filter>Debugger</Filter>
     </ClInclude>
+    <ClInclude Include="..\..\GPU\Common\DepthRaster.h" />
   </ItemGroup>
   <ItemGroup>
     <Filter Include="Debugger">
       <UniqueIdentifier>{49bcf7f6-518a-4ecd-af55-bda3a344efe7}</UniqueIdentifier>
     </Filter>
   </ItemGroup>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/android/jni/Android.mk b/android/jni/Android.mk
index 10ab9a5f77a3..dbd88097886f 100644
--- a/android/jni/Android.mk
+++ b/android/jni/Android.mk
@@ -530,6 +530,7 @@ EXEC_AND_LIB_FILES := \
   $(SRC)/GPU/Common/SoftwareTransformCommon.cpp.arm \
   $(SRC)/GPU/Common/ReinterpretFramebuffer.cpp \
   $(SRC)/GPU/Common/DepthBufferCommon.cpp \
+  $(SRC)/GPU/Common/DepthRaster.cpp \
   $(SRC)/GPU/Common/VertexDecoderCommon.cpp.arm \
   $(SRC)/GPU/Common/VertexDecoderHandwritten.cpp.arm \
   $(SRC)/GPU/Common/TextureCacheCommon.cpp.arm \
diff --git a/assets/compat.ini b/assets/compat.ini
index 8a374753efb2..0452c5b85484 100644
--- a/assets/compat.ini
+++ b/assets/compat.ini
@@ -1228,8 +1228,10 @@ ULJS19067 = true
 ULAS42247 = true
 ULAS42318 = true
 
+[SoftwareRasterDepth]
+
 [DisableFirstFrameReadback]
-# Wipeout Pure: Temporary workaround for lens flare flicker. See #13344
+# Wipeout Pure
 UCUS98612 = true
 UCJS10007 = true
 UCES00001 = true
diff --git a/ext/imgui/imgui_impl_thin3d.cpp b/ext/imgui/imgui_impl_thin3d.cpp
index bd302a55d51b..4c1ecba7a8e3 100644
--- a/ext/imgui/imgui_impl_thin3d.cpp
+++ b/ext/imgui/imgui_impl_thin3d.cpp
@@ -114,6 +114,10 @@ void ImGui_ImplThin3d_RenderDrawData(ImDrawData* draw_data, Draw::DrawContext *d
 				boundSampler = bd->fontSampler;
 			} else {
 				size_t index = (size_t)pcmd->TextureId - TEX_ID_OFFSET;
+				if (index >= bd->tempTextures.size()) {
+					WARN_LOG(Log::System, "Missing temp texture %d (out of %d)", index, (int)bd->tempTextures.size());
+					continue;
+				}
 				_dbg_assert_(index < bd->tempTextures.size());
 				switch (bd->tempTextures[index].type) {
 				case RegisteredTextureType::Framebuffer:
diff --git a/libretro/Makefile.common b/libretro/Makefile.common
index 804a1d72199f..c1cb5a454fed 100644
--- a/libretro/Makefile.common
+++ b/libretro/Makefile.common
@@ -543,6 +543,7 @@ SOURCES_CXX += \
 	$(GPUDIR)/Common/TextureScalerCommon.cpp \
 	$(GPUDIR)/Common/SoftwareTransformCommon.cpp \
 	$(GPUDIR)/Common/DepthBufferCommon.cpp \
+	$(GPUDIR)/Common/DepthRaster.cpp \
 	$(GPUDIR)/Common/StencilCommon.cpp \
 	$(GPUDIR)/Software/TransformUnit.cpp \
 	$(GPUDIR)/Software/SoftGpu.cpp \
diff --git a/unittest/UnitTest.cpp b/unittest/UnitTest.cpp
index 45c664a29af8..a087d205b96b 100644
--- a/unittest/UnitTest.cpp
+++ b/unittest/UnitTest.cpp
@@ -1112,14 +1112,17 @@ bool TestSIMD() {
 	EXPECT_EQ_INT(testdata[1], 0);
 
 	__m128i a = _mm_set_epi16(0, 0x4444, 0, 0x3333, 0, 0x2222, 0, 0x1111);
-	__m128i b = _mm_set_epi16(0, 0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555);
+	__m128i b = _mm_set_epi16(0, (int16_t)0x8888, 0, 0x7777, 0, 0x6666, 0, 0x5555);
 	__m128i c = _mm_packu2_epi32_SSE2(a, b);
-	__m128i d = _mm_packus_epi32(a, b);
+	__m128i d = _mm_packu1_epi32_SSE2(b);
 
-	uint64_t testdata2[2];
+	uint64_t testdata2[4];
 	_mm_store_si128((__m128i *)testdata2, c);
+	_mm_store_si128((__m128i *)testdata2 + 1, d);
 	EXPECT_EQ_INT(testdata2[0], 0x4444333322221111);
 	EXPECT_EQ_INT(testdata2[1], 0x8888777766665555);
+	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
+	EXPECT_EQ_INT(testdata2[2], 0x8888777766665555);
 #endif
 	return true;
 }