Skip to content

Commit

Permalink
Merge pull request #11393 from unknownbrackets/cullrange
Browse files Browse the repository at this point in the history
Implement vertex range culling
  • Loading branch information
hrydgard authored Sep 18, 2018
2 parents 3f34c7a + 52baec2 commit acfd688
Show file tree
Hide file tree
Showing 12 changed files with 179 additions and 40 deletions.
5 changes: 3 additions & 2 deletions GPU/Common/ShaderCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,15 @@ enum : uint64_t {

DIRTY_BEZIERSPLINE = 1ULL << 32,
DIRTY_TEXCLAMP = 1ULL << 33,
DIRTY_CULLRANGE = 1ULL << 34,

DIRTY_DEPAL = 1ULL << 34,
DIRTY_DEPAL = 1ULL << 35,

// space for 4 more uniform dirty flags. Remember to update DIRTY_ALL_UNIFORMS.

DIRTY_BONE_UNIFORMS = 0xFF000000ULL,

DIRTY_ALL_UNIFORMS = 0x7FFFFFFFFULL,
DIRTY_ALL_UNIFORMS = 0xFFFFFFFFFULL,
DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3,

// Other dirty elements that aren't uniforms!
Expand Down
47 changes: 47 additions & 0 deletions GPU/Common/ShaderUniforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,49 @@ static void ConvertProjMatrixToD3D11(Matrix4x4 &in) {
in.translateAndScale(trans, scale);
}

// Computes the per-axis bounds used by the vertex shaders for vertex range culling.
//
// minValues / maxValues: outputs. Elements 0..2 receive the sorted lower / upper bound for
//   x, y and z. minValues[3] is 1.0f when gstate.isDepthClampEnabled() is true, else 0.0f.
//   maxValues[3] is NAN, the value the shaders write into w to "discard" a vertex.
// flipViewport: negates the Y height scale (D3D11 and GLES non-buffered).
// hasNegZ: when true, Z is returned as-is; when false it is remapped with * 0.5 + 0.5.
void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ) {
	// Account for the projection viewport adjustment when viewport is too large.
	auto reverseViewportX = [](float x) {
		float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
		return pspViewport * (1.0f / gstate_c.vpWidthScale);
	};
	auto reverseViewportY = [flipViewport](float y) {
		float heightScale = gstate_c.vpHeightScale;
		if (flipViewport) {
			// For D3D11 and GLES non-buffered.
			heightScale = -heightScale;
		}
		float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
		// Must divide by the (possibly negated) heightScale. Dividing by gstate_c.vpHeightScale
		// directly would silently ignore flipViewport.
		return pspViewport * (1.0f / heightScale);
	};
	auto reverseViewportZ = [hasNegZ](float z) {
		float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
		// Differs from GLES: depth is 0 to 1, not -1 to 1.
		float realViewport = (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale);
		return hasNegZ ? realViewport : (realViewport * 0.5f + 0.5f);
	};
	// The scales above may be negative, so the two endpoints can come out in either order.
	auto sortPair = [](float a, float b) {
		return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
	};

	// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
	// Any vertex outside this range (unless depth clamp enabled) is discarded.
	auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
	auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
	auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
	// Since we have space in w, use it to pass the depth clamp flag. We also pass NAN for w "discard".
	float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;

	minValues[0] = x.first;
	minValues[1] = y.first;
	minValues[2] = z.first;
	minValues[3] = clampEnable;
	maxValues[0] = x.second;
	maxValues[1] = y.second;
	maxValues[2] = z.second;
	maxValues[3] = NAN;
}

void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport) {
if (dirtyUniforms & DIRTY_TEXENV) {
Uint8x3ToFloat4(ub->texEnvColor, gstate.texenvcolor);
Expand Down Expand Up @@ -192,6 +235,10 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
ub->depthRange[3] = viewZInvScale;
}

if (dirtyUniforms & DIRTY_CULLRANGE) {
CalcCullRange(ub->cullRangeMin, ub->cullRangeMax, flipViewport, false);
}

if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v);
}
Expand Down
10 changes: 9 additions & 1 deletion GPU/Common/ShaderUniforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ enum : uint64_t {
};

// TODO: Split into two structs, one for software transform and one for hardware transform, to save space.
// 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
// Currently 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
// Every line here is a 4-float.
struct UB_VS_FS_Base {
float proj[16];
Expand All @@ -32,6 +32,8 @@ struct UB_VS_FS_Base {
float matAmbient[4];
uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt; // 4 params packed into one.
int pad2; int pad3;
float cullRangeMin[4];
float cullRangeMax[4];
// Fragment data
float fogColor[4];
float texEnvColor[4];
Expand All @@ -58,6 +60,8 @@ R"( mat4 proj_mtx;
uint depal_mask_shift_off_fmt;
int pad2;
int pad3;
vec4 cullRangeMin;
vec4 cullRangeMax;
vec3 fogcolor;
vec3 texenv;
ivec4 alphacolorref;
Expand All @@ -84,6 +88,8 @@ R"( float4x4 u_proj;
uint u_depal_mask_shift_off_fmt;
int pad2;
int pad3;
float4 u_cullRangeMin;
float4 u_cullRangeMax;
float3 u_fogcolor;
float3 u_texenv;
uint4 u_alphacolorref;
Expand Down Expand Up @@ -175,6 +181,8 @@ static const char *cb_vs_bonesStr =
R"( float4x3 u_bone[8];
)";

void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ);

void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport);
void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms);
void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms);
Expand Down
12 changes: 10 additions & 2 deletions GPU/Directx9/ShaderManagerDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/ShaderUniforms.h"
#include "GPU/Directx9/ShaderManagerDX9.h"
#include "GPU/Directx9/DrawEngineDX9.h"
#include "GPU/Directx9/FramebufferDX9.h"
Expand Down Expand Up @@ -314,7 +315,7 @@ void ShaderManagerDX9::PSUpdateUniforms(u64 dirtyUniforms) {
}

const uint64_t vsUniforms = DIRTY_PROJMATRIX | DIRTY_PROJTHROUGHMATRIX | DIRTY_WORLDMATRIX | DIRTY_VIEWMATRIX | DIRTY_TEXMATRIX |
DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE |
DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE | DIRTY_CULLRANGE |
DIRTY_AMBIENT | DIRTY_MATAMBIENTALPHA | DIRTY_MATSPECULAR | DIRTY_MATDIFFUSE | DIRTY_MATEMISSIVE | DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3;

void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
Expand Down Expand Up @@ -425,7 +426,7 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
VSSetFloatArray(CONST_VS_UVSCALEOFFSET, uvscaleoff, 4);
}

if (dirtyUniforms & DIRTY_DEPTHRANGE) {
if (dirtyUniforms & DIRTY_DEPTHRANGE) {
// Depth is [0, 1] mapping to [minz, maxz], not too hard.
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();
Expand All @@ -447,6 +448,13 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
VSSetFloatUniform4(CONST_VS_DEPTHRANGE, data);
}
if (dirtyUniforms & DIRTY_CULLRANGE) {
float minValues[4], maxValues[4];
CalcCullRange(minValues, maxValues, false, false);
VSSetFloatUniform4(CONST_VS_CULLRANGEMIN, minValues);
VSSetFloatUniform4(CONST_VS_CULLRANGEMAX, maxValues);
}

// Lighting
if (dirtyUniforms & DIRTY_AMBIENT) {
VSSetColorUniform3Alpha(CONST_VS_AMBIENT, gstate.ambientcolor, gstate.getAmbientA());
Expand Down
37 changes: 27 additions & 10 deletions GPU/Directx9/VertexShaderGeneratorDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
if (!isModeThrough && gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, "float4 u_depthRange : register(c%i);\n", CONST_VS_DEPTHRANGE);
}
if (!isModeThrough) {
WRITE(p, "float4 u_cullRangeMin : register(c%i);\n", CONST_VS_CULLRANGEMIN);
WRITE(p, "float4 u_cullRangeMax : register(c%i);\n", CONST_VS_CULLRANGEMAX);
}
} else {
WRITE(p, "cbuffer base : register(b0) {\n%s};\n", cb_baseStr);
WRITE(p, "cbuffer lights: register(b1) {\n%s};\n", cb_vs_lightsStr);
Expand Down Expand Up @@ -370,22 +374,22 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
}
if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
if (isModeThrough) {
WRITE(p, " Out.gl_Position = mul(u_proj_through, float4(In.position.xyz, 1.0));\n");
WRITE(p, " float4 outPos = mul(u_proj_through, float4(In.position.xyz, 1.0));\n");
} else {
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(u_proj, float4(In.position.xyz, 1.0)));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(u_proj, float4(In.position.xyz, 1.0)));\n");
} else {
WRITE(p, " Out.gl_Position = mul(u_proj, float4(In.position.xyz, 1.0));\n");
WRITE(p, " float4 outPos = mul(u_proj, float4(In.position.xyz, 1.0));\n");
}
}
} else {
if (isModeThrough) {
WRITE(p, " Out.gl_Position = mul(float4(In.position.xyz, 1.0), u_proj_through);\n");
WRITE(p, " float4 outPos = mul(float4(In.position.xyz, 1.0), u_proj_through);\n");
} else {
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(float4(In.position.xyz, 1.0), u_proj));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(float4(In.position.xyz, 1.0), u_proj));\n");
} else {
WRITE(p, " Out.gl_Position = mul(float4(In.position.xyz, 1.0), u_proj);\n");
WRITE(p, " float4 outPos = mul(float4(In.position.xyz, 1.0), u_proj);\n");
}
}
}
Expand Down Expand Up @@ -577,16 +581,16 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
// Final view and projection transforms.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(u_proj, viewPos));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(u_proj, viewPos));\n");
} else {
WRITE(p, " Out.gl_Position = mul(u_proj, viewPos);\n");
WRITE(p, " float4 outPos = mul(u_proj, viewPos);\n");
}
} else {
// Final view and projection transforms.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(viewPos, u_proj));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(viewPos, u_proj));\n");
} else {
WRITE(p, " Out.gl_Position = mul(viewPos, u_proj);\n");
WRITE(p, " float4 outPos = mul(viewPos, u_proj);\n");
}
}

Expand Down Expand Up @@ -811,6 +815,19 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
}
}

if (!isModeThrough) {
WRITE(p, " float3 projPos = outPos.xyz / outPos.w;\n");
// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
WRITE(p, " if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
WRITE(p, " if (%s || %s) {\n", outMin, outMax);
WRITE(p, " outPos.w = u_cullRangeMax.w;\n");
WRITE(p, " }\n");
WRITE(p, " }\n");
}
WRITE(p, " Out.gl_Position = outPos;\n");

WRITE(p, " return Out;\n");
WRITE(p, "}\n");
}
Expand Down
2 changes: 2 additions & 0 deletions GPU/Directx9/VertexShaderGeneratorDX9.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace DX9 {
CONST_VS_BONE6 = 71,
CONST_VS_BONE7 = 74,
CONST_VS_BONE8 = 77,
CONST_VS_CULLRANGEMIN = 80,
CONST_VS_CULLRANGEMAX = 81,
};

};
13 changes: 11 additions & 2 deletions GPU/GLES/ShaderManagerGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,10 @@
#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/ShaderUniforms.h"
#include "GPU/GLES/ShaderManagerGLES.h"
#include "GPU/GLES/DrawEngineGLES.h"
#include "FramebufferManagerGLES.h"
#include "GPU/GLES/FramebufferManagerGLES.h"

Shader::Shader(GLRenderManager *render, const char *code, const std::string &desc, uint32_t glShaderType, bool useHWTransform, uint32_t attrMask, uint64_t uniformMask)
: render_(render), failed_(false), useHWTransform_(useHWTransform), attrMask_(attrMask), uniformMask_(uniformMask) {
Expand Down Expand Up @@ -116,6 +117,8 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
else
numBones = 0;
queries.push_back({ &u_depthRange, "u_depthRange" });
queries.push_back({ &u_cullRangeMin, "u_cullRangeMin" });
queries.push_back({ &u_cullRangeMax, "u_cullRangeMax" });

#ifdef USE_BONE_ARRAY
queries.push_back({ &u_bone, "u_bone" });
Expand Down Expand Up @@ -455,7 +458,7 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
if (dirty & DIRTY_TEXMATRIX) {
SetMatrix4x3(render_, &u_texmtx, gstate.tgenMatrix);
}
if ((dirty & DIRTY_DEPTHRANGE) && u_depthRange != -1) {
if (dirty & DIRTY_DEPTHRANGE) {
// Since depth is [-1, 1] mapping to [minz, maxz], this is easyish.
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();
Expand All @@ -481,6 +484,12 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
SetFloatUniform4(render_, &u_depthRange, data);
}
if (dirty & DIRTY_CULLRANGE) {
float minValues[4], maxValues[4];
CalcCullRange(minValues, maxValues, g_Config.iRenderingMode == FB_NON_BUFFERED_MODE, true);
SetFloatUniform4(render_, &u_cullRangeMin, minValues);
SetFloatUniform4(render_, &u_cullRangeMax, maxValues);
}

if (dirty & DIRTY_STENCILREPLACEVALUE) {
float f = (float)gstate.getStencilTestRef() * (1.0f / 255.0f);
Expand Down
2 changes: 2 additions & 0 deletions GPU/GLES/ShaderManagerGLES.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ class LinkedShader {
int u_texmtx;
int u_world;
int u_depthRange; // x,y = viewport xscale/xcenter. z,w=clipping minz/maxz (?)
int u_cullRangeMin;
int u_cullRangeMax;

#ifdef USE_BONE_ARRAY
int u_bone; // array, size is numBones
Expand Down
34 changes: 29 additions & 5 deletions GPU/GLES/VertexShaderGeneratorGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ enum DoLightComputation {
//
// Now, the regular machinery will take over and do the calculation again.
//
// Depth is not clipped to the viewport, but does clip to "minz" and "maxz". It may also be clamped
// to 0 and 65535 if a depth clamping/clipping flag is set (x/y clipping is performed only if depth
// needs to be clamped.)
//
// All this above is for full transform mode.
// In through mode, the Z coordinate just goes straight through and there is no perspective division.
// We simulate this of course with pretty much an identity matrix. Rounding Z becomes very easy.
Expand Down Expand Up @@ -335,6 +339,12 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
*uniformMask |= DIRTY_DEPTHRANGE;
}

if (!isModeThrough) {
WRITE(p, "uniform highp vec4 u_cullRangeMin;\n");
WRITE(p, "uniform highp vec4 u_cullRangeMax;\n");
*uniformMask |= DIRTY_CULLRANGE;
}

WRITE(p, "%s%s lowp vec4 v_color0;\n", shading, varying);
if (lmode) {
WRITE(p, "%s%s lowp vec3 v_color1;\n", shading, varying);
Expand Down Expand Up @@ -472,13 +482,13 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
WRITE(p, " v_fogdepth = position.w;\n");
}
if (isModeThrough) {
WRITE(p, " gl_Position = u_proj_through * vec4(position.xyz, 1.0);\n");
WRITE(p, " vec4 outPos = u_proj_through * vec4(position.xyz, 1.0);\n");
} else {
// The viewport is used in this case, so need to compensate for that.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " gl_Position = depthRoundZVP(u_proj * vec4(position.xyz, 1.0));\n");
WRITE(p, " vec4 outPos = depthRoundZVP(u_proj * vec4(position.xyz, 1.0));\n");
} else {
WRITE(p, " gl_Position = u_proj * vec4(position.xyz, 1.0);\n");
WRITE(p, " vec4 outPos = u_proj * vec4(position.xyz, 1.0);\n");
}
}
} else {
Expand Down Expand Up @@ -671,9 +681,9 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,

// Final view and projection transforms.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " gl_Position = depthRoundZVP(u_proj * viewPos);\n");
WRITE(p, " vec4 outPos = depthRoundZVP(u_proj * viewPos);\n");
} else {
WRITE(p, " gl_Position = u_proj * viewPos;\n");
WRITE(p, " vec4 outPos = u_proj * viewPos;\n");
}

// TODO: Declare variables for dots for shade mapping if needed.
Expand Down Expand Up @@ -898,5 +908,19 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
if (enableFog)
WRITE(p, " v_fogdepth = (viewPos.z + u_fogcoef.x) * u_fogcoef.y;\n");
}

if (!isModeThrough) {
WRITE(p, " vec3 projPos = outPos.xyz / outPos.w;\n");
// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
WRITE(p, " if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
WRITE(p, " if (%s || %s) {\n", outMin, outMax);
WRITE(p, " outPos.w = u_cullRangeMax.w;\n");
WRITE(p, " }\n");
WRITE(p, " }\n");
}
WRITE(p, " gl_Position = outPos;\n");

WRITE(p, "}\n");
}
Loading

0 comments on commit acfd688

Please sign in to comment.