From 44ba31fbc67198086c3a65cb3b6ef88872ee5c92 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 16 Sep 2018 23:57:20 -0700
Subject: [PATCH] Vulkan: Implement vertex range culling.

Also D3D11, since they are very similar.
---
 GPU/Common/ShaderUniforms.cpp              | 38 ++++++++++++++++++++++
 GPU/Common/ShaderUniforms.h                |  8 ++++-
 GPU/Directx9/VertexShaderGeneratorDX9.cpp  |  2 +-
 GPU/Vulkan/VertexShaderGeneratorVulkan.cpp | 24 +++++++++++---
 4 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp
index e2dee45c2826..5489df59b6e8 100644
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@@ -192,6 +192,44 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
 		ub->depthRange[3] = viewZInvScale;
 	}
 
+	if (dirtyUniforms & DIRTY_CULLRANGE) {
+		// Account for the projection viewport adjustment when viewport is too large.
+		auto reverseViewportX = [](float x) {  // Undoes the PSP viewport transform on X, then divides by vpWidthScale.
+			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
+			return pspViewport * (1.0f / gstate_c.vpWidthScale);
+		};
+		auto reverseViewportY = [flipViewport](float y) {
+			// Undoes the PSP viewport transform on Y, then divides by the height scale.
+			// When flipViewport is set (for D3D11), the height scale is negated.
+			const float heightScale = flipViewport ? -gstate_c.vpHeightScale : gstate_c.vpHeightScale;
+			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
+			// Divide by the sign-adjusted heightScale: using gstate_c.vpHeightScale here
+			// would leave heightScale unused and silently ignore flipViewport.
+			return pspViewport * (1.0f / heightScale);
+		};
+		auto reverseViewportZ = [](float z) {  // Undoes the PSP viewport transform on Z, then removes vpZOffset and vpDepthScale.
+			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
+			// Differs from GLES: depth is 0 to 1, not -1 to 1 (hence the final * 0.5f + 0.5f).
+			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale) * 0.5f + 0.5f;
+		};
+		auto sortPair = [](float a, float b) {  // Packs the two values into a pair, swapping them when a > b.
+			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
+		};
+
+		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
+		// Any vertex outside this range (unless depth clamp enabled) is discarded.
+		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
+		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
+		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
+		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
+		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
+
+		float minValues[4]{ x.first, y.first, z.first, clampEnable };
+		memcpy(ub->cullRangeMin, minValues, sizeof(ub->cullRangeMin));
+		float maxValues[4]{ x.second, y.second, z.second, NAN };
+		memcpy(ub->cullRangeMax, maxValues, sizeof(ub->cullRangeMax));
+	}
+
 	if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
 		ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v);
 	}
diff --git a/GPU/Common/ShaderUniforms.h b/GPU/Common/ShaderUniforms.h
index dd1e659604b4..5c5c29f16904 100644
--- a/GPU/Common/ShaderUniforms.h
+++ b/GPU/Common/ShaderUniforms.h
@@ -18,7 +18,7 @@ enum : uint64_t {
 };
 
 // TODO: Split into two structs, one for software transform and one for hardware transform, to save space.
-// 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
+// Currently 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
 // Every line here is a 4-float.
 struct UB_VS_FS_Base {
 	float proj[16];
@@ -32,6 +32,8 @@ struct UB_VS_FS_Base {
 	float matAmbient[4];
 	uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt;  // 4 params packed into one.
 	int pad2; int pad3;
+	float cullRangeMin[4];
+	float cullRangeMax[4];
 	// Fragment data
 	float fogColor[4];
 	float texEnvColor[4];
@@ -58,6 +60,8 @@ R"(  mat4 proj_mtx;
   uint depal_mask_shift_off_fmt;
   int pad2;
   int pad3;
+  vec4 cullRangeMin;
+  vec4 cullRangeMax;
   vec3 fogcolor;
   vec3 texenv;
   ivec4 alphacolorref;
@@ -84,6 +88,8 @@ R"(  float4x4 u_proj;
   uint u_depal_mask_shift_off_fmt;
   int pad2;
   int pad3;
+  float4 u_cullRangeMin;
+  float4 u_cullRangeMax;
   float3 u_fogcolor;
   float3 u_texenv;
   uint4 u_alphacolorref;
diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
index 8cebd2dfdc15..1cd0f383c4fa 100644
--- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
@@ -815,7 +815,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		}
 	}
 
-	if (lang == HLSL_DX9 && !isModeThrough) {
+	if (!isModeThrough) {
 		WRITE(p, "  float3 projPos = outPos.xyz / outPos.w;\n");
 		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
 		WRITE(p, "  if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
diff --git a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
index e901c4390043..c180631e9e27 100644
--- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
+++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
@@ -317,13 +317,13 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 			WRITE(p, "  v_fogdepth = position.w;\n");
 		}
 		if (isModeThrough) {
-			WRITE(p, "  gl_Position = base.proj_through_mtx * vec4(position.xyz, 1.0);\n");
+			WRITE(p, "  vec4 outPos = base.proj_through_mtx * vec4(position.xyz, 1.0);\n");
 		} else {
 			// The viewport is used in this case, so need to compensate for that.
 			if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-				WRITE(p, "  gl_Position = depthRoundZVP(base.proj_mtx * vec4(position.xyz, 1.0));\n");
+				WRITE(p, "  vec4 outPos = depthRoundZVP(base.proj_mtx * vec4(position.xyz, 1.0));\n");
 			} else {
-				WRITE(p, "  gl_Position = base.proj_mtx * vec4(position.xyz, 1.0);\n");
+				WRITE(p, "  vec4 outPos = base.proj_mtx * vec4(position.xyz, 1.0);\n");
 			}
 		}
 	} else {
@@ -472,9 +472,9 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 
 		// Final view and projection transforms.
 		if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-			WRITE(p, "  gl_Position = depthRoundZVP(base.proj_mtx * viewPos);\n");
+			WRITE(p, "  vec4 outPos = depthRoundZVP(base.proj_mtx * viewPos);\n");
 		} else {
-			WRITE(p, "  gl_Position = base.proj_mtx * viewPos;\n");
+			WRITE(p, "  vec4 outPos = base.proj_mtx * viewPos;\n");
 		}
 
 		// TODO: Declare variables for dots for shade mapping if needed.
@@ -694,6 +694,20 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		if (enableFog)
 			WRITE(p, "  v_fogdepth = (viewPos.z + base.fogcoef.x) * base.fogcoef.y;\n");
 	}
+
+	if (!isModeThrough) {
+		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
+		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
+		WRITE(p, "  if (base.cullRangeMin.w <= 0.0f || (projPos.z >= base.cullRangeMin.z && projPos.z <= base.cullRangeMax.z)) {\n");
+		const char *outMin = "projPos.x < base.cullRangeMin.x || projPos.y < base.cullRangeMin.y || projPos.z < base.cullRangeMin.z";
+		const char *outMax = "projPos.x > base.cullRangeMax.x || projPos.y > base.cullRangeMax.y || projPos.z > base.cullRangeMax.z";
+		WRITE(p, "    if (%s || %s) {\n", outMin, outMax);
+		WRITE(p, "      outPos.w = base.cullRangeMax.w;\n");
+		WRITE(p, "    }\n");
+		WRITE(p, "  }\n");
+	}
+	WRITE(p, "  gl_Position = outPos;\n");
+
 	WRITE(p, "}\n");
 	return true;
 }