Skip to content

Commit

Permalink
Merge pull request #16103 from hrydgard/optimize-shader-constants
Browse files Browse the repository at this point in the history
Shrink our main uniform buffer by 32 bytes
  • Loading branch information
hrydgard authored Sep 26, 2022
2 parents 2097a9c + ce835d1 commit 89e6b10
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 46 deletions.
46 changes: 20 additions & 26 deletions GPU/Common/FragmentShaderGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
WRITE(p, "int roundAndScaleTo255i(in highp float x) { return int(floor(x * 255.0 + 0.5)); }\n");
}
if (enableColorTest && !colorTestAgainstZero) {
WRITE(p, "ivec3 roundAndScaleTo255iv(in highp vec3 x) { return ivec3(floor(x * 255.0 + 0.5)); }\n");
WRITE(p, "uint roundAndScaleTo8x4(in highp vec3 x) { uvec3 u = uvec3(floor(x * 255.0 + 0.5)); return u.r | (u.g << 8) | (u.b << 16); }\n");
}

WRITE(p, "layout (location = 0, index = 0) out vec4 fragColor0;\n");
Expand Down Expand Up @@ -262,7 +262,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
}
if (enableColorTest) {
if (compat.shaderLanguage == HLSL_D3D11) {
WRITE(p, "uvec3 roundAndScaleTo255iv(float3 x) { return (floor(x * 255.0f + 0.5f)); }\n");
WRITE(p, "uint roundAndScaleTo8x4(float3 x) { uvec3 u = (floor(x * 255.0f + 0.5f)); return u.r | (u.g << 8) | (u.b << 16); }\n");
} else {
WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
}
Expand Down Expand Up @@ -354,7 +354,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
WRITE(p, "uniform vec4 u_alphacolorref;\n");
if (compat.bitwiseOps && ((enableColorTest && !colorTestAgainstZero) || (enableAlphaTest && !alphaTestAgainstZero))) {
*uniformMask |= DIRTY_ALPHACOLORMASK;
WRITE(p, "uniform ivec4 u_alphacolormask;\n");
WRITE(p, "uniform uint u_alphacolormask;\n");
}
}
}
Expand Down Expand Up @@ -408,7 +408,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
}
if (enableColorTest && !colorTestAgainstZero) {
if (compat.bitwiseOps) {
WRITE(p, "ivec3 roundAndScaleTo255iv(in vec3 x) { return ivec3(floor(x * 255.0 + 0.5)); }\n");
WRITE(p, "uint roundAndScaleTo8x4(in vec3 x) { uvec3 u = uvec3(floor(x * 255.0 + 0.5)); return u.r | (u.g << 8) | (u.b << 16); }\n");
} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
WRITE(p, "vec3 roundTo255thv(in vec3 x) { vec3 y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");
} else {
Expand Down Expand Up @@ -458,6 +458,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
WRITE(p, "}\n");
}

if (compat.bitwiseOps && enableColorTest) {
p.C("uvec3 unpackUVec3(highp uint x) {\n");
p.C(" return uvec3(x & 0xFF, (x >> 8) & 0xFF, (x >> 16) & 0xFF);\n");
p.C("}\n");
}

// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.
if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {
WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");
Expand Down Expand Up @@ -873,7 +879,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
const char *alphaTestFuncs[] = { "#", "#", " != ", " == ", " >= ", " > ", " <= ", " < " };
if (alphaTestFuncs[alphaTestFunc][0] != '#') {
if (compat.bitwiseOps) {
WRITE(p, " if ((roundAndScaleTo255i(v.a) & u_alphacolormask.a) %s int(u_alphacolorref.a)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
WRITE(p, " if ((roundAndScaleTo255i(v.a) & int(u_alphacolormask >> 24)) %s int(u_alphacolorref.a)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
// Work around bad PVR driver problem where equality check + discard just doesn't work.
if (alphaTestFunc != GE_COMP_NOTEQUAL) {
Expand Down Expand Up @@ -927,34 +933,22 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
}
} else {
const char *colorTestFuncs[] = { "#", "#", " != ", " == " };
if (colorTestFuncs[colorTestFunc][0] != '#') {
const char *test = colorTestFuncs[colorTestFunc];
if (test[0] != '#') {
// TODO: Unify these paths better.
if (compat.shaderLanguage == HLSL_D3D11) {
const char *test = colorTestFuncs[colorTestFunc];
WRITE(p, " uvec3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
WRITE(p, " uvec3 v_masked = v_scaled & u_alphacolormask.rgb;\n");
WRITE(p, " uvec3 colorTestRef = u_alphacolorref.rgb & u_alphacolormask.rgb;\n");
// We have to test the components separately, or we get incorrect results. See #10629.
WRITE(p, " if (v_masked.r %s colorTestRef.r && v_masked.g %s colorTestRef.g && v_masked.b %s colorTestRef.b) %s\n", test, test, test, discardStatement);
} else if (compat.shaderLanguage == HLSL_D3D9) {
const char *test = colorTestFuncs[colorTestFunc];
if (compat.shaderLanguage == HLSL_D3D9) {
// TODO: Use a texture to lookup bitwise ops instead?
WRITE(p, " vec3 colortest = roundAndScaleTo255v(v.rgb);\n");
WRITE(p, " if ((colortest.r %s u_alphacolorref.r) && (colortest.g %s u_alphacolorref.g) && (colortest.b %s u_alphacolorref.b)) %s\n", test, test, test, discardStatement);
} else if (compat.bitwiseOps) {
WRITE(p, " ivec3 v_scaled = roundAndScaleTo255iv(v.rgb);\n");
if (compat.shaderLanguage == GLSL_VULKAN) {
// Apparently GLES3 does not support vector bitwise ops, but Vulkan does?
WRITE(p, " if ((v_scaled & u_alphacolormask.rgb) %s (u_alphacolorref.rgb & u_alphacolormask.rgb)) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
} else {
const char *maskedFragColor = "ivec3(v_scaled.r & u_alphacolormask.r, v_scaled.g & u_alphacolormask.g, v_scaled.b & u_alphacolormask.b)";
const char *maskedColorRef = "ivec3(int(u_alphacolorref.r) & u_alphacolormask.r, int(u_alphacolorref.g) & u_alphacolormask.g, int(u_alphacolorref.b) & u_alphacolormask.b)";
WRITE(p, " if (%s %s %s) %s\n", maskedFragColor, colorTestFuncs[colorTestFunc], maskedColorRef, discardStatement);
}
WRITE(p, " uint v_uint = roundAndScaleTo8x4(v.rgb);\n");
WRITE(p, " uint v_masked = v_uint & u_alphacolormask;\n");
WRITE(p, " uint colorTestRef = roundAndScaleTo8x4(u_alphacolorref.rgb) & u_alphacolormask;\n");
WRITE(p, " if (v_masked %s colorTestRef) %s\n", test, discardStatement);
} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
WRITE(p, " if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
WRITE(p, " if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);
} else {
WRITE(p, " if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", colorTestFuncs[colorTestFunc], discardStatement);
WRITE(p, " if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);
}
} else {
WRITE(p, " %s\n", discardStatement);
Expand Down
2 changes: 1 addition & 1 deletion GPU/Common/ShaderUniforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
Uint8x3ToInt4_Alpha(ub->alphaColorRef, gstate.getColorTestRef(), gstate.getAlphaTestRef() & gstate.getAlphaTestMask());
}
if (dirtyUniforms & DIRTY_ALPHACOLORMASK) {
Uint8x3ToInt4_Alpha(ub->colorTestMask, gstate.getColorTestMask(), gstate.getAlphaTestMask());
ub->colorTestMask = gstate.getColorTestMask() | (gstate.getAlphaTestMask() << 24);
}
if (dirtyUniforms & DIRTY_FOGCOLOR) {
Uint8x3ToFloat4(ub->fogColor, gstate.fogcolor);
Expand Down
31 changes: 13 additions & 18 deletions GPU/Common/ShaderUniforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@ enum : uint64_t {
DIRTY_MATDIFFUSE | DIRTY_MATSPECULAR | DIRTY_MATEMISSIVE | DIRTY_AMBIENT,
};

// TODO: Split into two structs, one for software transform and one for hardware transform, to save space.
// Currently 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
// Currently 480 bytes. Probably can't get to 256 (nVidia's UBO alignment, also common in other vendors).
// Every line here is a 4-float.
struct UB_VS_FS_Base {
struct alignas(16) UB_VS_FS_Base {
float proj[16];
float proj_through[16];
float view[12];
Expand All @@ -29,21 +28,19 @@ struct UB_VS_FS_Base {
float uvScaleOffset[4];
float depthRange[4];
// Rotation is used only for software transform.
float fogCoef[2]; float stencil; float rotation;
float matAmbient[4];
float cullRangeMin[4];
float cullRangeMax[4];
uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt; // 4 params packed into one.
uint32_t colorWriteMask; float mipBias;
// Fragment data
float fogColor[4];
float texEnvColor[4]; // .w is unused
float fogColor[4]; // .w is unused
float texEnvColor[3]; uint32_t colorTestMask;
int alphaColorRef[4];
int colorTestMask[4];
float blendFixA[4]; // .w is unused
float blendFixB[4]; // .w is unused
float blendFixA[3]; float stencil;
float blendFixB[3]; float rotation;
float texClamp[4];
float texClampOffset[4]; // .zw are unused
float texClampOffset[2]; float fogCoef[2];
};

static const char * const ub_baseStr =
Expand All @@ -54,9 +51,6 @@ R"( mat4 u_proj;
mat3x4 u_texmtx;
vec4 u_uvscaleoffset;
vec4 u_depthRange;
vec2 u_fogcoef;
float u_stencilReplaceValue;
float u_rotation;
vec4 u_matambientalpha;
vec4 u_cullRangeMin;
vec4 u_cullRangeMax;
Expand All @@ -66,17 +60,18 @@ R"( mat4 u_proj;
float u_mipBias;
vec3 u_fogcolor;
vec3 u_texenv;
uint u_alphacolormask;
ivec4 u_alphacolorref;
ivec4 u_alphacolormask;
vec3 u_blendFixA;
vec3 u_blendFixB;
vec3 u_blendFixA; float u_stencilReplaceValue;
vec3 u_blendFixB; float u_rotation;
vec4 u_texclamp;
vec2 u_texclampoff;
vec2 u_fogcoef;
)";

// 512 bytes. Would like to shrink more. Some colors only have 8-bit precision and we expand
// them to float unnecessarily, could just as well expand in the shader.
struct UB_VS_Lights {
struct alignas(16) UB_VS_Lights {
float ambientColor[4];
float materialDiffuse[4];
float materialSpecular[4];
Expand Down Expand Up @@ -129,7 +124,7 @@ R"( vec4 u_ambient;

// With some cleverness, we could get away with uploading just half this when only the four or five first
// bones are being used. This is 384b.
struct UB_VS_Bones {
struct alignas(16) UB_VS_Bones {
float bones[8][12];
};

Expand Down
2 changes: 1 addition & 1 deletion GPU/GLES/ShaderManagerGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid, bool useBu
SetColorUniform3Alpha255(render_, &u_alphacolorref, gstate.getColorTestRef(), gstate.getAlphaTestRef() & gstate.getAlphaTestMask());
}
if (dirty & DIRTY_ALPHACOLORMASK) {
SetColorUniform3iAlpha(render_, &u_alphacolormask, gstate.colortestmask, gstate.getAlphaTestMask());
render_->SetUniformUI1(&u_alphacolormask, gstate.getColorTestMask() | (gstate.getAlphaTestMask() << 24));
}
if (dirty & DIRTY_COLORWRITEMASK) {
render_->SetUniformUI1(&u_colorWriteMask, ~((gstate.pmska << 24) | (gstate.pmskc & 0xFFFFFF)));
Expand Down

0 comments on commit 89e6b10

Please sign in to comment.