From 29c41c6a359846835b57897704da1c63ca01c00e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 12:25:57 +0200 Subject: [PATCH 1/8] Implement shader depal for Vulkan. See #10908. Bilinear filter not yet implemented. --- GPU/Common/DepalettizeShaderCommon.cpp | 2 +- GPU/Common/ShaderCommon.h | 7 ++- GPU/Common/ShaderId.cpp | 4 ++ GPU/Common/ShaderId.h | 2 +- GPU/Common/ShaderUniforms.cpp | 11 ++++ GPU/Common/ShaderUniforms.h | 9 +-- GPU/GPUState.h | 3 + GPU/Vulkan/DrawEngineVulkan.cpp | 63 +++++++++++++------- GPU/Vulkan/DrawEngineVulkan.h | 15 +++-- GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp | 41 ++++++++++++- GPU/Vulkan/ShaderManagerVulkan.cpp | 6 +- GPU/Vulkan/TextureCacheVulkan.cpp | 27 +++++++-- GPU/Vulkan/VertexShaderGeneratorVulkan.cpp | 8 +-- 13 files changed, 151 insertions(+), 47 deletions(-) diff --git a/GPU/Common/DepalettizeShaderCommon.cpp b/GPU/Common/DepalettizeShaderCommon.cpp index a230ce163cd3..0ff61cc995b8 100644 --- a/GPU/Common/DepalettizeShaderCommon.cpp +++ b/GPU/Common/DepalettizeShaderCommon.cpp @@ -70,7 +70,7 @@ void GenerateDepalShader300(char *buffer, GEBufferFormat pixelFormat, ShaderLang int mask = gstate.getClutIndexMask(); int shift = gstate.getClutIndexShift(); int offset = gstate.getClutIndexStartPos(); - const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); + GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); // Unfortunately sampling turned our texture into floating point. To avoid this, might be able // to declare them as isampler2D objects, but these require integer textures, which needs more work. // Anyhow, we simply work around this by converting back to integer. Hopefully there will be no loss of precision. diff --git a/GPU/Common/ShaderCommon.h b/GPU/Common/ShaderCommon.h index 6e4bfa8c8173..c26185885435 100644 --- a/GPU/Common/ShaderCommon.h +++ b/GPU/Common/ShaderCommon.h @@ -85,15 +85,16 @@ enum : uint64_t { DIRTY_BONEMATRIX6 = 1ULL << 30, DIRTY_BONEMATRIX7 = 1ULL << 31, - // These are for hardware tessellation DIRTY_BEZIERSPLINE = 1ULL << 32, DIRTY_TEXCLAMP = 1ULL << 33, - // space for 7 more uniforms. + DIRTY_DEPAL = 1ULL << 34, + + // space for 5 more uniform dirty flags. Remember to update DIRTY_ALL_UNIFORMS. DIRTY_BONE_UNIFORMS = 0xFF000000ULL, - DIRTY_ALL_UNIFORMS = 0x3FFFFFFFFULL, + DIRTY_ALL_UNIFORMS = 0x7FFFFFFFFULL, DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3, // Other dirty elements that aren't uniforms! diff --git a/GPU/Common/ShaderId.cpp b/GPU/Common/ShaderId.cpp index 3a8f0de37e09..4a33f7c21619 100644 --- a/GPU/Common/ShaderId.cpp +++ b/GPU/Common/ShaderId.cpp @@ -171,6 +171,7 @@ std::string FragmentShaderDesc(const ShaderID &id) { if (id.Bit(FS_BIT_COLOR_DOUBLE)) desc << "2x "; if (id.Bit(FS_BIT_FLATSHADE)) desc << "Flat "; if (id.Bit(FS_BIT_BGRA_TEXTURE)) desc << "BGRA "; + if (id.Bit(FS_BIT_SHADER_DEPAL)) desc << "Depal "; if (id.Bit(FS_BIT_SHADER_TEX_CLAMP)) { desc << "TClamp"; if (id.Bit(FS_BIT_CLAMP_S)) desc << "S"; @@ -236,6 +237,7 @@ void ComputeFragmentShaderID(ShaderID *id_out) { bool doTextureProjection = (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_MATRIX && MatrixNeedsProjection(gstate.tgenMatrix)); bool doTextureAlpha = gstate.isTextureAlphaUsed(); bool doFlatShading = gstate.getShadeMode() == GE_SHADE_FLAT; + bool useShaderDepal = gstate_c.useShaderDepal; ReplaceBlendType replaceBlend = ReplaceBlendWithShader(gstate_c.allowShaderBlend, gstate.FrameBufFormat()); ReplaceAlphaType stencilToAlpha = ReplaceAlphaWithStencil(replaceBlend); @@ -299,6 +301,8 @@ void ComputeFragmentShaderID(ShaderID *id_out) { id.SetBits(FS_BIT_BLENDFUNC_B, 4, gstate.getBlendFuncB()); } id.SetBit(FS_BIT_FLATSHADE, doFlatShading); + + id.SetBit(FS_BIT_SHADER_DEPAL, useShaderDepal); } *id_out = id; diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h index afc7c5131334..7d2e1b1cddac 100644 --- a/GPU/Common/ShaderId.h +++ b/GPU/Common/ShaderId.h @@ -62,7 +62,7 @@ enum { FS_BIT_DO_TEXTURE = 1, FS_BIT_TEXFUNC = 2, // 3 bits FS_BIT_TEXALPHA = 5, - // 6 is free. + FS_BIT_SHADER_DEPAL = 6, FS_BIT_SHADER_TEX_CLAMP = 7, FS_BIT_CLAMP_S = 8, FS_BIT_CLAMP_T = 9, diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp index 6a5a7572fee5..2e57ee6b721a 100644 --- a/GPU/Common/ShaderUniforms.cpp +++ b/GPU/Common/ShaderUniforms.cpp @@ -200,6 +200,17 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView if (dirtyUniforms & DIRTY_BEZIERSPLINE) { ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v); } + + if (dirtyUniforms & DIRTY_DEPAL) { + int indexMask = gstate.getClutIndexMask(); + int indexShift = gstate.getClutIndexShift(); + int indexOffset = gstate.getClutIndexStartPos() >> 4; + int format = gstate_c.depalFramebufferFormat; + uint32_t val = BytesToUint32(indexMask, indexShift, indexOffset, format); + // Poke in a bilinear filter flag in the top bit. + val |= gstate.isMagnifyFilteringEnabled() << 31; + ub->depal_mask_shift_off_fmt = val; + } } void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms) { diff --git a/GPU/Common/ShaderUniforms.h b/GPU/Common/ShaderUniforms.h index a4c6d8f936a3..dd1e659604b4 100644 --- a/GPU/Common/ShaderUniforms.h +++ b/GPU/Common/ShaderUniforms.h @@ -11,7 +11,7 @@ enum : uint64_t { DIRTY_WORLDMATRIX | DIRTY_PROJTHROUGHMATRIX | DIRTY_VIEWMATRIX | DIRTY_TEXMATRIX | DIRTY_ALPHACOLORREF | DIRTY_PROJMATRIX | DIRTY_FOGCOLOR | DIRTY_FOGCOEF | DIRTY_TEXENV | DIRTY_STENCILREPLACEVALUE | DIRTY_ALPHACOLORMASK | DIRTY_SHADERBLEND | DIRTY_UVSCALEOFFSET | DIRTY_TEXCLAMP | DIRTY_DEPTHRANGE | DIRTY_MATAMBIENTALPHA | - DIRTY_BEZIERSPLINE, + DIRTY_BEZIERSPLINE | DIRTY_DEPAL, DIRTY_LIGHT_UNIFORMS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3 | DIRTY_MATDIFFUSE | DIRTY_MATSPECULAR | DIRTY_MATEMISSIVE | DIRTY_AMBIENT, @@ -30,7 +30,8 @@ struct UB_VS_FS_Base { float depthRange[4]; float fogCoef[2]; float stencil; float pad0; float matAmbient[4]; - uint32_t spline_counts; int pad1; int pad2; int pad3; + uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt; // 4 params packed into one. + int pad2; int pad3; // Fragment data float fogColor[4]; float texEnvColor[4]; @@ -54,7 +55,7 @@ R"( mat4 proj_mtx; float stencilReplace; vec4 matambientalpha; uint spline_counts; - int pad1; + uint depal_mask_shift_off_fmt; int pad2; int pad3; vec3 fogcolor; @@ -80,7 +81,7 @@ R"( float4x4 u_proj; float u_stencilReplaceValue; float4 u_matambientalpha; uint u_spline_counts; - int pad1; + uint u_depal_mask_shift_off_fmt; int pad2; int pad3; float3 u_fogcolor; diff --git a/GPU/GPUState.h b/GPU/GPUState.h index 58bc5e38590e..e6724098c319 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -600,6 +600,9 @@ struct GPUStateCache { int spline_type_u; int spline_type_v; + bool useShaderDepal; + GEBufferFormat depalFramebufferFormat; + u32 getRelativeAddress(u32 data) const; void Reset(); void DoState(PointerWrap &p); diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index e8310abd1837..2b360b9518d5 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -64,10 +64,11 @@ enum { VAI_KILL_AGE = 120, VAI_UNRELIABLE_KILL_AGE = 240, VAI_UNRELIABLE_KILL_MA enum { DRAW_BINDING_TEXTURE = 0, DRAW_BINDING_2ND_TEXTURE = 1, - DRAW_BINDING_DYNUBO_BASE = 2, - DRAW_BINDING_DYNUBO_LIGHT = 3, - DRAW_BINDING_DYNUBO_BONE = 4, - DRAW_BINDING_TESS_STORAGE_BUF = 5, + DRAW_BINDING_DEPAL_TEXTURE = 2, + DRAW_BINDING_DYNUBO_BASE = 3, + DRAW_BINDING_DYNUBO_LIGHT = 4, + DRAW_BINDING_DYNUBO_BONE = 5, + DRAW_BINDING_TESS_STORAGE_BUF = 6, }; enum { @@ -95,7 +96,7 @@ DrawEngineVulkan::DrawEngineVulkan(VulkanContext *vulkan, Draw::DrawContext *dra void DrawEngineVulkan::InitDeviceObjects() { // All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated. - VkDescriptorSetLayoutBinding bindings[6]{}; + VkDescriptorSetLayoutBinding bindings[7]{}; bindings[0].descriptorCount = 1; bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; @@ -105,22 +106,26 @@ void DrawEngineVulkan::InitDeviceObjects() { bindings[1].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; bindings[1].binding = DRAW_BINDING_2ND_TEXTURE; bindings[2].descriptorCount = 1; - bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; - bindings[2].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; - bindings[2].binding = DRAW_BINDING_DYNUBO_BASE; + bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; // sampler is ignored though. + bindings[2].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + bindings[2].binding = DRAW_BINDING_DEPAL_TEXTURE; bindings[3].descriptorCount = 1; bindings[3].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; - bindings[3].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - bindings[3].binding = DRAW_BINDING_DYNUBO_LIGHT; + bindings[3].stageFlags = VK_SHADER_STAGE_VERTEX_BIT | VK_SHADER_STAGE_FRAGMENT_BIT; + bindings[3].binding = DRAW_BINDING_DYNUBO_BASE; bindings[4].descriptorCount = 1; bindings[4].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; bindings[4].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - bindings[4].binding = DRAW_BINDING_DYNUBO_BONE; - // Used only for hardware tessellation. + bindings[4].binding = DRAW_BINDING_DYNUBO_LIGHT; bindings[5].descriptorCount = 1; - bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[5].descriptorType = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; bindings[5].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; - bindings[5].binding = DRAW_BINDING_TESS_STORAGE_BUF; + bindings[5].binding = DRAW_BINDING_DYNUBO_BONE; + // Used only for hardware tessellation. + bindings[6].descriptorCount = 1; + bindings[6].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + bindings[6].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + bindings[6].binding = DRAW_BINDING_TESS_STORAGE_BUF; VkDevice device = vulkan_->GetDevice(); @@ -388,6 +393,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView key.imageView_ = imageView; key.sampler_ = sampler; key.secondaryImageView_ = boundSecondary_; + key.depalImageView_ = boundDepal_; key.base_ = base; key.light_ = light; key.bone_ = bone; @@ -433,12 +439,11 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView // Even in release mode, this is bad. _assert_msg_(G3D, result == VK_SUCCESS, "Ran out of descriptor space in pool. sz=%d res=%d", (int)frame.descSets.size(), (int)result); - // We just don't write to the slots we don't care about. - // We need 8 now that we support secondary texture bindings. - VkWriteDescriptorSet writes[8]{}; + // We just don't write to the slots we don't care about, which is fine. + VkWriteDescriptorSet writes[7]{}; // Main texture int n = 0; - VkDescriptorImageInfo tex[2]{}; + VkDescriptorImageInfo tex[3]{}; if (imageView) { #ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR tex[0].imageLayout = VK_IMAGE_LAYOUT_GENERAL; @@ -446,7 +451,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView tex[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; #endif tex[0].imageView = imageView; - tex[0].sampler = sampler; + tex[0].sampler = sampler; // We override sampling when doing depal. writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writes[n].pNext = nullptr; writes[n].dstBinding = DRAW_BINDING_TEXTURE; @@ -459,7 +464,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView if (boundSecondary_) { #ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR - tex[0].imageLayout = VK_IMAGE_LAYOUT_GENERAL; + tex[1].imageLayout = VK_IMAGE_LAYOUT_GENERAL; #else tex[1].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; #endif @@ -475,7 +480,23 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView n++; } - // Skipping 2nd texture for now. + if (boundDepal_) { +#ifdef VULKAN_USE_GENERAL_LAYOUT_FOR_COLOR + tex[2].imageLayout = VK_IMAGE_LAYOUT_GENERAL; +#else + tex[2].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; +#endif + tex[2].imageView = boundDepal_; + tex[2].sampler = samplerSecondary_; // doesn't matter, we use load + writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writes[n].pNext = nullptr; + writes[n].dstBinding = DRAW_BINDING_DEPAL_TEXTURE; + writes[n].pImageInfo = &tex[2]; + writes[n].descriptorCount = 1; + writes[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + writes[n].dstSet = desc; + n++; + } // Tessellation data buffer. Make sure this is declared outside the if to avoid optimizer // shenanigans. diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index 74cf5a065568..0aec77065eaa 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -20,10 +20,12 @@ // The Descriptor Set used for the majority of PSP rendering looks like this: // // * binding 0: Texture/Sampler (the PSP texture) -// * binding 1: Secondary texture sampler for shader blending or depal palettes -// * binding 2: Base Uniform Buffer (includes fragment state) -// * binding 3: Light uniform buffer -// * binding 4: Bone uniform buffer +// * binding 1: Secondary texture sampler for shader blending +// * binding 2: Depal palette +// * binding 3: Base Uniform Buffer (includes fragment state) +// * binding 4: Light uniform buffer +// * binding 5: Bone uniform buffer +// * binding 6: Tess data storage buffer // // All shaders conform to this layout, so they are all compatible with the same descriptor set. // The format of the various uniform buffers may vary though - vertex shaders that don't skin @@ -177,6 +179,9 @@ class DrawEngineVulkan : public DrawEngineCommon { } void SetLineWidth(float lineWidth); + void SetDepalTexture(VkImageView depal) { + boundDepal_ = depal; + } private: struct FrameData; @@ -207,6 +212,7 @@ class DrawEngineVulkan : public DrawEngineCommon { // Secondary texture for shader blending VkImageView boundSecondary_ = VK_NULL_HANDLE; + VkImageView boundDepal_ = VK_NULL_HANDLE; VkSampler samplerSecondary_ = VK_NULL_HANDLE; // This one is actually never used since we use fetch. PrehashMap vai_; @@ -217,6 +223,7 @@ class DrawEngineVulkan : public DrawEngineCommon { struct DescriptorSetKey { VkImageView imageView_; VkImageView secondaryImageView_; + VkImageView depalImageView_; VkSampler sampler_; VkBuffer base_, light_, bone_; // All three UBO slots will be set to this. This will usually be identical // for all draws in a frame, except when the buffer has to grow. diff --git a/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp b/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp index 38764ba7d315..7b2ee0b61c46 100644 --- a/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp +++ b/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp @@ -60,6 +60,7 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ); bool doTextureAlpha = id.Bit(FS_BIT_TEXALPHA); bool doFlatShading = id.Bit(FS_BIT_FLATSHADE); + bool shaderDepal = id.Bit(FS_BIT_SHADER_DEPAL); GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3); GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2); @@ -80,7 +81,7 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { const char *shading = doFlatShading ? "flat" : ""; - WRITE(p, "layout (std140, set = 0, binding = 2) uniform baseUBO {\n%s} base;\n", ub_baseStr); + WRITE(p, "layout (std140, set = 0, binding = 3) uniform baseUBO {\n%s} base;\n", ub_baseStr); if (doTexture) { WRITE(p, "layout (binding = 0) uniform sampler2D tex;\n"); } @@ -91,6 +92,10 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { } } + if (shaderDepal) { + WRITE(p, "layout (binding = 2) uniform sampler2D pal;\n"); + } + WRITE(p, "layout (location = 1) %s in vec4 v_color0;\n", shading); if (lmode) WRITE(p, "layout (location = 2) %s in vec3 v_color1;\n", shading); @@ -175,7 +180,39 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { } else { WRITE(p, " vec4 t = texture(tex, %s.xy);\n", texcoord); } - WRITE(p, " vec4 p = v_color0;\n"); + + if (shaderDepal) { + WRITE(p, " uint depalMask = (base.depal_mask_shift_off_fmt & 0xFF);\n"); + WRITE(p, " uint depalShift = (base.depal_mask_shift_off_fmt >> 8) & 0xFF;\n"); + WRITE(p, " uint depalOffset = ((base.depal_mask_shift_off_fmt >> 16) & 0xFF) << 4;\n"); + WRITE(p, " uint depalFmt = (base.depal_mask_shift_off_fmt >> 24) & 0x3;\n"); + WRITE(p, " bool bilinear = (base.depal_mask_shift_off_fmt >> 31) == 0;\n"); + WRITE(p, " vec2 fraction = fract(%s.xy);\n", texcoord); + WRITE(p, " uvec4 col; uint index0; uint index1; uint index2; uint index3;\n"); + WRITE(p, " switch (depalFmt) {\n"); // We might want to include fmt in the shader ID if this is a performance issue. + WRITE(p, " case 0:\n"); // 565 + WRITE(p, " col = uvec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index0 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " break;\n"); + WRITE(p, " case 1:\n"); // 5551 + WRITE(p, " col = uvec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index0 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " break;\n"); + WRITE(p, " case 2:\n"); // 4444 + WRITE(p, " col = uvec4(t.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index0 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " break;\n"); + WRITE(p, " case 3:\n"); // 8888 + WRITE(p, " col = uvec4(t.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index0 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " break;\n"); + WRITE(p, " };\n"); + WRITE(p, " index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " t = texelFetch(pal, ivec2(index0, 0), 0);\n"); + } + + if (texFunc != GE_TEXFUNC_REPLACE || !doTextureAlpha) + WRITE(p, " vec4 p = v_color0;\n"); if (doTextureAlpha) { // texfmt == RGBA switch (texFunc) { diff --git a/GPU/Vulkan/ShaderManagerVulkan.cpp b/GPU/Vulkan/ShaderManagerVulkan.cpp index 8834aa3717d3..c89b4f1db3c2 100644 --- a/GPU/Vulkan/ShaderManagerVulkan.cpp +++ b/GPU/Vulkan/ShaderManagerVulkan.cpp @@ -61,10 +61,9 @@ VulkanFragmentShader::VulkanFragmentShader(VulkanContext *vulkan, FShaderID id, } ERROR_LOG(G3D, "Messages: %s", errorMessage.c_str()); ERROR_LOG(G3D, "Shader source:\n%s", code); -#ifdef SHADERLOG + OutputDebugStringA(LineNumberString(code).c_str()); OutputDebugStringA("Messages:\n"); OutputDebugStringA(errorMessage.c_str()); -#endif Reporting::ReportMessage("Vulkan error in shader compilation: info: %s / code: %s", errorMessage.c_str(), code); } else { success = vulkan_->CreateShaderModule(spirv, &module_); @@ -116,6 +115,7 @@ VulkanVertexShader::VulkanVertexShader(VulkanContext *vulkan, VShaderID id, cons } ERROR_LOG(G3D, "Messages: %s", errorMessage.c_str()); ERROR_LOG(G3D, "Shader source:\n%s", code); + OutputDebugStringA(LineNumberString(code).c_str()); OutputDebugStringUTF8("Messages:\n"); OutputDebugStringUTF8(errorMessage.c_str()); Reporting::ReportMessage("Vulkan error in shader compilation: info: %s / code: %s", errorMessage.c_str(), code); @@ -354,7 +354,7 @@ VulkanFragmentShader *ShaderManagerVulkan::GetFragmentShaderFromModule(VkShaderM // instantaneous. #define CACHE_HEADER_MAGIC 0xff51f420 -#define CACHE_VERSION 12 +#define CACHE_VERSION 13 struct VulkanCacheHeader { uint32_t magic; uint32_t version; diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index 6f1c14ad7673..1e833ea93949 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -327,6 +327,8 @@ void TextureCacheVulkan::BindTexture(TexCacheEntry *entry) { SamplerCacheKey key{}; UpdateSamplingParams(*entry, key); curSampler_ = samplerCache_.GetOrCreateSampler(key); + drawEngine_->SetDepalTexture(VK_NULL_HANDLE); + gstate_c.useShaderDepal = false; } void TextureCacheVulkan::Unbind() { @@ -336,10 +338,29 @@ void TextureCacheVulkan::Unbind() { } void TextureCacheVulkan::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer) { + SamplerCacheKey samplerKey{}; + SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, samplerKey); + DepalShaderVulkan *depalShader = nullptr; uint32_t clutMode = gstate.clutformat & 0xFFFFFF; if ((entry->status & TexCacheEntry::STATUS_DEPALETTIZE) && !g_Config.bDisableSlowFramebufEffects) { - depalShader = depalShaderCache_->GetDepalettizeShader(clutMode, framebuffer->drawnFormat); + bool useShaderDepal = true; + if (useShaderDepal) { + depalShaderCache_->SetPushBuffer(drawEngine_->GetPushBufferForTextureData()); + const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); + VulkanTexture *clutTexture = depalShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBuf_); + drawEngine_->SetDepalTexture(clutTexture->GetImageView()); + // Only point filtering enabled. + samplerKey.magFilt = false; + samplerKey.minFilt = false; + samplerKey.mipFilt = false; + // Make sure to update the uniforms. + gstate_c.Dirty(DIRTY_DEPAL); + gstate_c.useShaderDepal = true; + gstate_c.depalFramebufferFormat = framebuffer->drawnFormat; + } else { + depalShader = depalShaderCache_->GetDepalettizeShader(clutMode, framebuffer->drawnFormat); + } } if (depalShader) { depalShaderCache_->SetPushBuffer(drawEngine_->GetPushBufferForTextureData()); @@ -430,14 +451,12 @@ void TextureCacheVulkan::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFr } else { entry->status &= ~TexCacheEntry::STATUS_DEPALETTIZE; - framebufferManager_->RebindFramebuffer(); // TODO: This line should not be needed? + framebufferManager_->RebindFramebuffer(); // TODO: This line should usually not be needed. imageView_ = framebufferManagerVulkan_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET); gstate_c.SetTextureFullAlpha(gstate.getTextureFormat() == GE_TFMT_5650); } - SamplerCacheKey samplerKey{}; - SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, samplerKey); curSampler_ = samplerCache_.GetOrCreateSampler(samplerKey); InvalidateLastTexture(entry); } diff --git a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp index d72d8c904a28..b4a22977fa6d 100644 --- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp +++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp @@ -137,11 +137,11 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { bool flipNormalTess = id.Bit(VS_BIT_NORM_REVERSE_TESS); WRITE(p, "\n"); - WRITE(p, "layout (std140, set = 0, binding = 2) uniform baseVars {\n%s} base;\n", ub_baseStr); + WRITE(p, "layout (std140, set = 0, binding = 3) uniform baseVars {\n%s} base;\n", ub_baseStr); if (enableLighting || doShadeMapping) - WRITE(p, "layout (std140, set = 0, binding = 3) uniform lightVars {\n%s} light;\n", ub_vs_lightsStr); + WRITE(p, "layout (std140, set = 0, binding = 4) uniform lightVars {\n%s} light;\n", ub_vs_lightsStr); if (enableBones) - WRITE(p, "layout (std140, set = 0, binding = 4) uniform boneVars {\n%s} bone;\n", ub_vs_bonesStr); + WRITE(p, "layout (std140, set = 0, binding = 5) uniform boneVars {\n%s} bone;\n", ub_vs_bonesStr); const char *shading = doFlatShading ? "flat " : ""; @@ -221,7 +221,7 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) { WRITE(p, " vec4 uv;\n"); WRITE(p, " vec4 color;\n"); WRITE(p, "};"); - WRITE(p, "layout (std430, set = 0, binding = 5) buffer s_tess_data {\n"); + WRITE(p, "layout (std430, set = 0, binding = 6) buffer s_tess_data {\n"); WRITE(p, " TessData data[];"); WRITE(p, "} tess_data;\n"); From 69bd427ca2ea1d3e105484f903376b936a07e445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 12:45:10 +0200 Subject: [PATCH 2/8] Shader depal: Implement bilinear filtering. --- GPU/GPUCommon.cpp | 2 +- GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp | 55 +++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp index 713e4c412081..262ed460a783 100644 --- a/GPU/GPUCommon.cpp +++ b/GPU/GPUCommon.cpp @@ -161,7 +161,7 @@ const CommonCommandTableEntry commonCommandTable[] = { // These must flush on change, so that LoadClut doesn't have to always flush. { GE_CMD_CLUTADDR, FLAG_FLUSHBEFOREONCHANGE }, { GE_CMD_CLUTADDRUPPER, FLAG_FLUSHBEFOREONCHANGE }, - { GE_CMD_CLUTFORMAT, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS }, + { GE_CMD_CLUTFORMAT, FLAG_FLUSHBEFOREONCHANGE, DIRTY_TEXTURE_PARAMS | DIRTY_DEPAL }, // Morph weights. TODO: Remove precomputation? { GE_CMD_MORPHWEIGHT0, FLAG_FLUSHBEFOREONCHANGE | FLAG_EXECUTEONCHANGE, 0, &GPUCommon::Execute_MorphWeight }, diff --git a/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp b/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp index 7b2ee0b61c46..08327ec7475e 100644 --- a/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp +++ b/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp @@ -177,8 +177,18 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { if (doTextureProjection) { WRITE(p, " vec4 t = textureProj(tex, %s);\n", texcoord); + if (shaderDepal) { + WRITE(p, " vec4 t1 = textureProjOffset(tex, %s, ivec2(1, 0));\n", texcoord); + WRITE(p, " vec4 t2 = textureProjOffset(tex, %s, ivec2(0, 1));\n", texcoord); + WRITE(p, " vec4 t3 = textureProjOffset(tex, %s, ivec2(1, 1));\n", texcoord); + } } else { WRITE(p, " vec4 t = texture(tex, %s.xy);\n", texcoord); + if (shaderDepal) { + WRITE(p, " vec4 t1 = textureOffset(tex, %s.xy, ivec2(1, 0));\n", texcoord); + WRITE(p, " vec4 t2 = textureOffset(tex, %s.xy, ivec2(0, 1));\n", texcoord); + WRITE(p, " vec4 t3 = textureOffset(tex, %s.xy, ivec2(1, 1));\n", texcoord); + } } if (shaderDepal) { @@ -186,29 +196,72 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { WRITE(p, " uint depalShift = (base.depal_mask_shift_off_fmt >> 8) & 0xFF;\n"); WRITE(p, " uint depalOffset = ((base.depal_mask_shift_off_fmt >> 16) & 0xFF) << 4;\n"); WRITE(p, " uint depalFmt = (base.depal_mask_shift_off_fmt >> 24) & 0x3;\n"); - WRITE(p, " bool bilinear = (base.depal_mask_shift_off_fmt >> 31) == 0;\n"); + WRITE(p, " bool bilinear = (base.depal_mask_shift_off_fmt >> 31) != 0;\n"); WRITE(p, " vec2 fraction = fract(%s.xy);\n", texcoord); WRITE(p, " uvec4 col; uint index0; uint index1; uint index2; uint index3;\n"); WRITE(p, " switch (depalFmt) {\n"); // We might want to include fmt in the shader ID if this is a performance issue. WRITE(p, " case 0:\n"); // 565 WRITE(p, " col = uvec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); WRITE(p, " index0 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = uvec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index1 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = uvec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index2 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = uvec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index3 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " }\n"); WRITE(p, " break;\n"); WRITE(p, " case 1:\n"); // 5551 WRITE(p, " col = uvec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); WRITE(p, " index0 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = uvec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index1 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = uvec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index2 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = uvec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index3 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " }\n"); WRITE(p, " break;\n"); WRITE(p, " case 2:\n"); // 4444 WRITE(p, " col = uvec4(t.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); WRITE(p, " index0 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = uvec4(t1.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index1 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " col = uvec4(t2.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index2 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " col = uvec4(t3.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index3 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " }\n"); WRITE(p, " break;\n"); WRITE(p, " case 3:\n"); // 8888 WRITE(p, " col = uvec4(t.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); WRITE(p, " index0 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = uvec4(t1.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index1 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " col = uvec4(t2.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index2 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " col = uvec4(t3.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index3 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " }\n"); WRITE(p, " break;\n"); WRITE(p, " };\n"); WRITE(p, " index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n"); WRITE(p, " t = texelFetch(pal, ivec2(index0, 0), 0);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " index1 = ((index1 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " index2 = ((index2 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " index3 = ((index3 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " t1 = texelFetch(pal, ivec2(index1, 0), 0);\n"); + WRITE(p, " t2 = texelFetch(pal, ivec2(index2, 0), 0);\n"); + WRITE(p, " t3 = texelFetch(pal, ivec2(index3, 0), 0);\n"); + WRITE(p, " t = mix(t, t1, fraction.x);\n"); + WRITE(p, " t2 = mix(t2, t3, fraction.x);\n"); + WRITE(p, " t = mix(t, t2, fraction.y);\n"); + WRITE(p, " }\n"); } if (texFunc != GE_TEXFUNC_REPLACE || !doTextureAlpha) From 81276c88626985136dd05c6fc10bfdea90b57517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 13:47:45 +0200 Subject: [PATCH 3/8] Fix various bugs. --- GPU/Vulkan/DrawEngineVulkan.cpp | 2 +- GPU/Vulkan/ShaderManagerVulkan.cpp | 4 ++++ GPU/Vulkan/TextureCacheVulkan.cpp | 13 +++++++++++-- GPU/Vulkan/TextureCacheVulkan.h | 1 + 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 2b360b9518d5..963898d26c4d 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -451,7 +451,7 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView tex[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; #endif tex[0].imageView = imageView; - tex[0].sampler = sampler; // We override sampling when doing depal. + tex[0].sampler = sampler; writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writes[n].pNext = nullptr; writes[n].dstBinding = DRAW_BINDING_TEXTURE; diff --git a/GPU/Vulkan/ShaderManagerVulkan.cpp b/GPU/Vulkan/ShaderManagerVulkan.cpp index c89b4f1db3c2..7b2aeae038bc 100644 --- a/GPU/Vulkan/ShaderManagerVulkan.cpp +++ b/GPU/Vulkan/ShaderManagerVulkan.cpp @@ -61,9 +61,11 @@ VulkanFragmentShader::VulkanFragmentShader(VulkanContext *vulkan, FShaderID id, } ERROR_LOG(G3D, "Messages: %s", errorMessage.c_str()); ERROR_LOG(G3D, "Shader source:\n%s", code); +#ifdef SHADERLOG OutputDebugStringA(LineNumberString(code).c_str()); OutputDebugStringA("Messages:\n"); OutputDebugStringA(errorMessage.c_str()); +#endif Reporting::ReportMessage("Vulkan error in shader compilation: info: %s / code: %s", errorMessage.c_str(), code); } else { success = vulkan_->CreateShaderModule(spirv, &module_); @@ -115,9 +117,11 @@ VulkanVertexShader::VulkanVertexShader(VulkanContext *vulkan, VShaderID id, cons } ERROR_LOG(G3D, "Messages: %s", errorMessage.c_str()); ERROR_LOG(G3D, "Shader source:\n%s", code); +#ifdef SHADERLOG OutputDebugStringA(LineNumberString(code).c_str()); OutputDebugStringUTF8("Messages:\n"); OutputDebugStringUTF8(errorMessage.c_str()); +#endif Reporting::ReportMessage("Vulkan error in shader compilation: info: %s / code: %s", errorMessage.c_str(), code); } else { success = vulkan_->CreateShaderModule(spirv, &module_); diff --git a/GPU/Vulkan/TextureCacheVulkan.cpp b/GPU/Vulkan/TextureCacheVulkan.cpp index 1e833ea93949..a2f350596ac4 100644 --- a/GPU/Vulkan/TextureCacheVulkan.cpp +++ b/GPU/Vulkan/TextureCacheVulkan.cpp @@ -343,8 +343,10 @@ void TextureCacheVulkan::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFr DepalShaderVulkan *depalShader = nullptr; uint32_t clutMode = gstate.clutformat & 0xFFFFFF; + + bool useShaderDepal = true; + if ((entry->status & TexCacheEntry::STATUS_DEPALETTIZE) && !g_Config.bDisableSlowFramebufEffects) { - bool useShaderDepal = true; if (useShaderDepal) { depalShaderCache_->SetPushBuffer(drawEngine_->GetPushBufferForTextureData()); const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); @@ -358,6 +360,14 @@ void TextureCacheVulkan::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFr gstate_c.Dirty(DIRTY_DEPAL); gstate_c.useShaderDepal = true; gstate_c.depalFramebufferFormat = framebuffer->drawnFormat; + const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); + const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; + TexCacheEntry::TexStatus alphaStatus = CheckAlpha(clutBuf_, getClutDestFormatVulkan(clutFormat), clutTotalColors, clutTotalColors, 1); + gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + curSampler_ = samplerCache_.GetOrCreateSampler(samplerKey); + InvalidateLastTexture(entry); + imageView_ = framebufferManagerVulkan_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET); + return; } else { depalShader = depalShaderCache_->GetDepalettizeShader(clutMode, framebuffer->drawnFormat); } @@ -456,7 +466,6 @@ void TextureCacheVulkan::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFr gstate_c.SetTextureFullAlpha(gstate.getTextureFormat() == GE_TFMT_5650); } - curSampler_ = samplerCache_.GetOrCreateSampler(samplerKey); InvalidateLastTexture(entry); } diff --git a/GPU/Vulkan/TextureCacheVulkan.h b/GPU/Vulkan/TextureCacheVulkan.h index 3ad62fc74d7e..77f55aaea856 100644 --- a/GPU/Vulkan/TextureCacheVulkan.h +++ b/GPU/Vulkan/TextureCacheVulkan.h @@ -86,6 +86,7 @@ class TextureCacheVulkan : public TextureCacheCommon { lastBoundTexture = nullptr; gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); } + void InvalidateLastTexture(TexCacheEntry *entry = nullptr) override { if (!entry || entry->vkTex == lastBoundTexture) { lastBoundTexture = nullptr; From 413a204138fac07d2eb99c1b519674cd8a03d6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 17:32:36 +0200 Subject: [PATCH 4/8] Vulkan: Semi-gross hack that massively improves the perf of MGS2:Acid. --- Core/Compatibility.cpp | 1 + Core/Compatibility.h | 1 + GPU/Vulkan/DrawEngineVulkan.cpp | 2 +- GPU/Vulkan/GPU_Vulkan.cpp | 15 ++++++ assets/compat.ini | 10 +++- ext/native/thin3d/GLRenderManager.cpp | 2 +- ext/native/thin3d/VulkanQueueRunner.cpp | 68 ++++++++++++++++++++++++- ext/native/thin3d/VulkanQueueRunner.h | 17 ++++++- 8 files changed, 111 insertions(+), 5 deletions(-) diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp index b12daff7b317..2fed762c0d1b 100644 --- a/Core/Compatibility.cpp +++ b/Core/Compatibility.cpp @@ -57,6 +57,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) { CheckSetting(iniFile, gameID, "RequireDefaultCPUClock", &flags_.RequireDefaultCPUClock); CheckSetting(iniFile, gameID, "DisableReadbacks", &flags_.DisableReadbacks); CheckSetting(iniFile, gameID, "DisableAccurateDepth", &flags_.DisableAccurateDepth); + CheckSetting(iniFile, gameID, "MGS2AcidHack", &flags_.MGS2AcidHack); } void Compatibility::CheckSetting(IniFile &iniFile, const std::string &gameID, const char *option, bool *flag) { diff --git a/Core/Compatibility.h b/Core/Compatibility.h index 78cc86db12ac..6f60504a40f5 100644 --- a/Core/Compatibility.h +++ b/Core/Compatibility.h @@ -57,6 +57,7 @@ struct CompatFlags { bool RequireDefaultCPUClock; bool DisableReadbacks; bool DisableAccurateDepth; + bool MGS2AcidHack; }; class IniFile; diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 963898d26c4d..7a5eeb3f0c53 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -372,7 +372,7 @@ VkResult DrawEngineVulkan::RecreateDescriptorPool(FrameData &frame, int newSize) VkDescriptorPoolSize dpTypes[3]; dpTypes[0].descriptorCount = frame.descPoolSize * 3; dpTypes[0].type = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC; - dpTypes[1].descriptorCount = frame.descPoolSize * 2; // Don't use these for tess anymore, need max two per set. + dpTypes[1].descriptorCount = frame.descPoolSize * 3; // Don't use these for tess anymore, need max three per set. dpTypes[1].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; dpTypes[2].descriptorCount = frame.descPoolSize; dpTypes[2].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp index b5d78c9595bc..e69bf7454a8b 100644 --- a/GPU/Vulkan/GPU_Vulkan.cpp +++ b/GPU/Vulkan/GPU_Vulkan.cpp @@ -42,6 +42,8 @@ #include "GPU/Vulkan/FramebufferVulkan.h" #include "GPU/Vulkan/DrawEngineVulkan.h" #include "GPU/Vulkan/TextureCacheVulkan.h" +#include "thin3d/VulkanRenderManager.h" +#include "thin3d/VulkanQueueRunner.h" #include "Core/MIPS/MIPS.h" #include "Core/HLE/sceKernelThread.h" @@ -456,6 +458,15 @@ void GPU_Vulkan::InitDeviceObjects() { assert(!frameData_[i].push_); frameData_[i].push_ = new VulkanPushBuffer(vulkan_, 64 * 1024); } + + VulkanRenderManager *rm = (VulkanRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER); + uint32_t hacks = 0; + if (PSP_CoreParameter().compat.flags().MGS2AcidHack) { + hacks |= QUEUE_HACK_MGS2_ACID; + } + if (hacks) { + rm->GetQueueRunner()->EnableHacks(hacks); + } } void GPU_Vulkan::DestroyDeviceObjects() { @@ -467,6 +478,10 @@ void GPU_Vulkan::DestroyDeviceObjects() { frameData_[i].push_ = nullptr; } } + + // Need to turn off hacks when shutting down the GPU. Don't want them running in the menu. + VulkanRenderManager *rm = (VulkanRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER); + rm->GetQueueRunner()->EnableHacks(0); } void GPU_Vulkan::DeviceLost() { diff --git a/assets/compat.ini b/assets/compat.ini index e977c75b482e..8f9d668b054d 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -325,4 +325,12 @@ NPHG00092 = true NPEG00044 = true NPJG00120 = true UCJS10114 = true -UCES01401 = true \ No newline at end of file +UCES01401 = true + +[MGS2AcidHack] +ULES00008 = true +ULJM08001 = true +ULJM05001 = true +ULAS42007 = true +ULUS10006 = true +ULUS10077 = true diff --git a/ext/native/thin3d/GLRenderManager.cpp b/ext/native/thin3d/GLRenderManager.cpp index 32936bd4be3c..1e986b41956f 100644 --- a/ext/native/thin3d/GLRenderManager.cpp +++ b/ext/native/thin3d/GLRenderManager.cpp @@ -472,6 +472,7 @@ void GLRenderManager::Run(int frame) { auto &initStepsOnThread = frameData_[frame].initSteps; // queueRunner_.LogSteps(stepsOnThread); queueRunner_.RunInitSteps(initStepsOnThread); + initStepsOnThread.clear(); // Run this after RunInitSteps so any fresh GLRBuffers for the pushbuffers can get created. for (auto iter : frameData.activePushBuffers) { @@ -481,7 +482,6 @@ void GLRenderManager::Run(int frame) { queueRunner_.RunSteps(stepsOnThread); stepsOnThread.clear(); - initStepsOnThread.clear(); for (auto iter : frameData.activePushBuffers) { iter->MapDevice(bufferStrategy_); diff --git a/ext/native/thin3d/VulkanQueueRunner.cpp b/ext/native/thin3d/VulkanQueueRunner.cpp index 6994c96bfff8..465159ddc78e 100644 --- a/ext/native/thin3d/VulkanQueueRunner.cpp +++ b/ext/native/thin3d/VulkanQueueRunner.cpp @@ -347,7 +347,7 @@ VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { return pass; } -void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, const std::vector &steps) { +void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector &steps) { // Optimizes renderpasses, then sequences them. // Planned optimizations: // * Create copies of render target that are rendered to multiple times and textured from in sequence, and push those render passes @@ -397,6 +397,12 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, const std::vector &steps) { + // We want to turn a sequence of copy,render(1),copy,render(1),copy,render(1) to copy,copy,copy,render(n). + + for (int i = 0; i < (int)steps.size() - 3; i++) { + int last = -1; + if (!(steps[i]->stepType == VKRStepType::COPY && + steps[i + 1]->stepType == VKRStepType::RENDER && + steps[i + 2]->stepType == VKRStepType::COPY && + steps[i + 1]->render.numDraws == 1 && + steps[i]->copy.dst == steps[i + 2]->copy.dst)) + continue; + // Looks promising! Let's start by finding the last one. + for (int j = i; j < (int)steps.size(); j++) { + switch (steps[j]->stepType) { + case VKRStepType::RENDER: + if (steps[j]->render.numDraws > 1) + last = j - 1; + break; + case VKRStepType::COPY: + if (steps[j]->copy.dst != steps[i]->copy.dst) + last = j - 1; + break; + } + if (last != -1) + break; + } + + if (last != -1) { + // We've got a sequence from i to last that needs reordering. + // First, let's sort it, keeping the same length. + std::vector copies; + std::vector renders; + for (int n = i; n <= last; n++) { + if (steps[n]->stepType == VKRStepType::COPY) + copies.push_back(steps[n]); + else if (steps[n]->stepType == VKRStepType::RENDER) + renders.push_back(steps[n]); + } + // Write the copies back. TODO: Combine them too. + for (int j = 0; j < (int)copies.size(); j++) { + steps[i + j] = copies[j]; + } + // Write the renders back (so they will be deleted properly). + for (int j = 0; j < (int)renders.size(); j++) { + steps[i + j + copies.size()] = renders[j]; + } + assert(steps[i + j + copies.size()]->stepType == VKRStepType::RENDER); + // Combine the renders. + for (int j = 1; j < (int)renders.size(); j++) { + for (int k = 0; k < renders[j]->commands.size(); k++) { + steps[i + copies.size()]->commands.push_back(renders[j]->commands[k]); + } + steps[i + copies.size() + j]->stepType = VKRStepType::RENDER_SKIP; + } + // We're done. + break; + } + } +} + void VulkanQueueRunner::LogSteps(const std::vector &steps) { ILOG("======================================="); for (size_t i = 0; i < steps.size(); i++) { diff --git a/ext/native/thin3d/VulkanQueueRunner.h b/ext/native/thin3d/VulkanQueueRunner.h index 0dd3ba859752..e7f6df1fa283 100644 --- a/ext/native/thin3d/VulkanQueueRunner.h +++ b/ext/native/thin3d/VulkanQueueRunner.h @@ -10,6 +10,10 @@ class VKRFramebuffer; struct VKRImage; +enum { + QUEUE_HACK_MGS2_ACID = 1, +}; + enum class VKRRenderCommand : uint8_t { BIND_PIPELINE, STENCIL, @@ -152,7 +156,9 @@ class VulkanQueueRunner { backbuffer_ = fb; backbufferImage_ = img; } - void RunSteps(VkCommandBuffer cmd, const std::vector &steps); + + // RunSteps can modify steps but will leave it in a valid state. + void RunSteps(VkCommandBuffer cmd, std::vector &steps); void LogSteps(const std::vector &steps); void CreateDeviceObjects(); @@ -205,6 +211,10 @@ class VulkanQueueRunner { return found; } + void EnableHacks(uint32_t hacks) { + hacksEnabled_ = hacks; + } + private: void InitBackbufferRenderPass(); @@ -223,6 +233,8 @@ class VulkanQueueRunner { void ResizeReadbackBuffer(VkDeviceSize requiredSize); + void ApplyMGSHack(std::vector &steps); + static void SetupTransitionToTransferSrc(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect); static void SetupTransitionToTransferDst(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect); @@ -244,4 +256,7 @@ class VulkanQueueRunner { VkDeviceMemory readbackMemory_ = VK_NULL_HANDLE; VkBuffer readbackBuffer_ = VK_NULL_HANDLE; VkDeviceSize readbackBufferSize_ = 0; + + // TODO: Enable based on compat.ini. + uint32_t hacksEnabled_ = 0; }; From 0ac6cea34dd7f1594c3f9e9bf8c69e6f1300f79b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 18:05:04 +0200 Subject: [PATCH 5/8] Add a queue processing hack for Sonic Rivals too. Now it's fast. --- Core/Compatibility.cpp | 1 + Core/Compatibility.h | 1 + GPU/Vulkan/GPU_Vulkan.cpp | 5 +- assets/compat.ini | 4 ++ ext/native/thin3d/VulkanQueueRunner.cpp | 93 +++++++++++++++++++++++-- ext/native/thin3d/VulkanQueueRunner.h | 2 + 6 files changed, 100 insertions(+), 6 deletions(-) diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp index 2fed762c0d1b..1cc0ccb2190a 100644 --- a/Core/Compatibility.cpp +++ b/Core/Compatibility.cpp @@ -58,6 +58,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) { CheckSetting(iniFile, gameID, "DisableReadbacks", &flags_.DisableReadbacks); CheckSetting(iniFile, gameID, "DisableAccurateDepth", &flags_.DisableAccurateDepth); CheckSetting(iniFile, gameID, "MGS2AcidHack", &flags_.MGS2AcidHack); + CheckSetting(iniFile, gameID, "SonicRivalsHack", &flags_.SonicRivalsHack); } void Compatibility::CheckSetting(IniFile &iniFile, const std::string &gameID, const char *option, bool *flag) { diff --git a/Core/Compatibility.h b/Core/Compatibility.h index 6f60504a40f5..f65c68efda0c 100644 --- a/Core/Compatibility.h +++ b/Core/Compatibility.h @@ -58,6 +58,7 @@ struct CompatFlags { bool DisableReadbacks; bool DisableAccurateDepth; bool MGS2AcidHack; + bool SonicRivalsHack; }; class IniFile; diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp index e69bf7454a8b..e774e735bc5d 100644 --- a/GPU/Vulkan/GPU_Vulkan.cpp +++ b/GPU/Vulkan/GPU_Vulkan.cpp @@ -461,9 +461,10 @@ void GPU_Vulkan::InitDeviceObjects() { VulkanRenderManager *rm = (VulkanRenderManager *)draw_->GetNativeObject(Draw::NativeObject::RENDER_MANAGER); uint32_t hacks = 0; - if (PSP_CoreParameter().compat.flags().MGS2AcidHack) { + if (PSP_CoreParameter().compat.flags().MGS2AcidHack) hacks |= QUEUE_HACK_MGS2_ACID; - } + if (PSP_CoreParameter().compat.flags().SonicRivalsHack) + hacks |= QUEUE_HACK_SONIC; if (hacks) { rm->GetQueueRunner()->EnableHacks(hacks); } diff --git a/assets/compat.ini b/assets/compat.ini index 8f9d668b054d..6d0de1a99f59 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -334,3 +334,7 @@ ULJM05001 = true ULAS42007 = true ULUS10006 = true ULUS10077 = true + +[SonicRivalsHack] +ULES00622 = true +ULUS10195 = true diff --git a/ext/native/thin3d/VulkanQueueRunner.cpp b/ext/native/thin3d/VulkanQueueRunner.cpp index 465159ddc78e..eba6f52dc2f5 100644 --- a/ext/native/thin3d/VulkanQueueRunner.cpp +++ b/ext/native/thin3d/VulkanQueueRunner.cpp @@ -398,9 +398,14 @@ void VulkanQueueRunner::RunSteps(VkCommandBuffer cmd, std::vector &st } // Queue hacks. - if (hacksEnabled_ & QUEUE_HACK_MGS2_ACID) { - // Massive speedup. - ApplyMGSHack(steps); + if (hacksEnabled_) { + if (hacksEnabled_ & QUEUE_HACK_MGS2_ACID) { + // Massive speedup. + ApplyMGSHack(steps); + } + if (hacksEnabled_ & QUEUE_HACK_SONIC) { + ApplySonicHack(steps); + } } for (size_t i = 0; i < steps.size(); i++) { @@ -460,6 +465,8 @@ void VulkanQueueRunner::ApplyMGSHack(std::vector &steps) { // First, let's sort it, keeping the same length. std::vector copies; std::vector renders; + copies.reserve((last - i) / 2); + renders.reserve((last - i) / 2); for (int n = i; n <= last; n++) { if (steps[n]->stepType == VKRStepType::COPY) copies.push_back(steps[n]); @@ -474,7 +481,7 @@ void VulkanQueueRunner::ApplyMGSHack(std::vector &steps) { for (int j = 0; j < (int)renders.size(); j++) { steps[i + j + copies.size()] = renders[j]; } - assert(steps[i + j + copies.size()]->stepType == VKRStepType::RENDER); + assert(steps[i + copies.size()]->stepType == VKRStepType::RENDER); // Combine the renders. for (int j = 1; j < (int)renders.size(); j++) { for (int k = 0; k < renders[j]->commands.size(); k++) { @@ -488,6 +495,84 @@ void VulkanQueueRunner::ApplyMGSHack(std::vector &steps) { } } +void VulkanQueueRunner::ApplySonicHack(std::vector &steps) { + // We want to turn a sequence of render(3),render(1),render(6),render(1),render(6),render(1),render(3) to + // render(1), render(1), render(1), render(6), render(6), render(6) + + for (int i = 0; i < (int)steps.size() - 4; i++) { + int last = -1; + if (!(steps[i]->stepType == VKRStepType::RENDER && + steps[i + 1]->stepType == VKRStepType::RENDER && + steps[i + 2]->stepType == VKRStepType::RENDER && + steps[i + 3]->stepType == VKRStepType::RENDER && + steps[i]->render.numDraws == 3 && + steps[i + 1]->render.numDraws == 1 && + steps[i + 2]->render.numDraws == 6 && + steps[i + 3]->render.numDraws == 1 && + steps[i]->render.framebuffer == steps[i + 2]->render.framebuffer && + steps[i + 1]->render.framebuffer == steps[i + 3]->render.framebuffer)) + continue; + // Looks promising! Let's start by finding the last one. + for (int j = i; j < (int)steps.size(); j++) { + switch (steps[j]->stepType) { + case VKRStepType::RENDER: + if ((j - i) & 1) { + if (steps[j]->render.framebuffer != steps[i + 1]->render.framebuffer) + last = j - 1; + if (steps[j]->render.numDraws != 1) + last = j - 1; + } else { + if (steps[j]->render.framebuffer != steps[i]->render.framebuffer) + last = j - 1; + if (steps[j]->render.numDraws != 3 && steps[j]->render.numDraws != 6) + last = j - 1; + } + } + if (last != -1) + break; + } + + if (last != -1) { + // We've got a sequence from i to last that needs reordering. + // First, let's sort it, keeping the same length. + std::vector type1; + std::vector type2; + type1.reserve((last - i) / 2); + type2.reserve((last - i) / 2); + for (int n = i; n <= last; n++) { + if (steps[n]->render.framebuffer == steps[i]->render.framebuffer) + type1.push_back(steps[n]); + else + type2.push_back(steps[n]); + } + + // Write the renders back in order. Same amount, so deletion will work fine. + for (int j = 0; j < (int)type1.size(); j++) { + steps[i + j] = type1[j]; + } + for (int j = 0; j < (int)type2.size(); j++) { + steps[i + j + type1.size()] = type2[j]; + } + + // Combine the renders. + for (int j = 1; j < (int)type1.size(); j++) { + for (int k = 0; k < (int)type1[j]->commands.size(); k++) { + steps[i]->commands.push_back(type1[j]->commands[k]); + } + steps[i + j]->stepType = VKRStepType::RENDER_SKIP; + } + for (int j = 1; j < (int)type2.size(); j++) { + for (int k = 0; k < (int)type2[j]->commands.size(); k++) { + steps[i + type1.size()]->commands.push_back(type2[j]->commands[k]); + } + steps[i + j + type1.size()]->stepType = VKRStepType::RENDER_SKIP; + } + // We're done. + break; + } + } +} + void VulkanQueueRunner::LogSteps(const std::vector &steps) { ILOG("======================================="); for (size_t i = 0; i < steps.size(); i++) { diff --git a/ext/native/thin3d/VulkanQueueRunner.h b/ext/native/thin3d/VulkanQueueRunner.h index e7f6df1fa283..d743bda73057 100644 --- a/ext/native/thin3d/VulkanQueueRunner.h +++ b/ext/native/thin3d/VulkanQueueRunner.h @@ -12,6 +12,7 @@ struct VKRImage; enum { QUEUE_HACK_MGS2_ACID = 1, + QUEUE_HACK_SONIC = 2, }; enum class VKRRenderCommand : uint8_t { @@ -234,6 +235,7 @@ class VulkanQueueRunner { void ResizeReadbackBuffer(VkDeviceSize requiredSize); void ApplyMGSHack(std::vector &steps); + void ApplySonicHack(std::vector &steps); static void SetupTransitionToTransferSrc(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect); static void SetupTransitionToTransferDst(VKRImage &img, VkImageMemoryBarrier &barrier, VkPipelineStageFlags &stage, VkImageAspectFlags aspect); From 0479255f76855b410bf4e62bee5515317b4dde05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 18:28:38 +0200 Subject: [PATCH 6/8] Let's try it on SR2 as well. --- assets/compat.ini | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/assets/compat.ini b/assets/compat.ini index 6d0de1a99f59..72ac34b4c946 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -336,5 +336,7 @@ ULUS10006 = true ULUS10077 = true [SonicRivalsHack] -ULES00622 = true -ULUS10195 = true +ULES00622 = true # SR1 +ULUS10195 = true # SR1 +ULUS10323 = true # SR2 +ULES00940 = true # SR2 \ No newline at end of file From fb7a63bd11ba785209babfe9631956110d46c9bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 20:00:14 +0200 Subject: [PATCH 7/8] Implement shader depal for GL as well, but disabled by default. --- GPU/GLES/FragmentShaderGeneratorGLES.cpp | 94 +++++++++++++++++++++++- GPU/GLES/ShaderManagerGLES.cpp | 16 +++- GPU/GLES/ShaderManagerGLES.h | 4 + GPU/GLES/TextureCacheGLES.cpp | 35 ++++++++- GPU/GLES/TextureCacheGLES.h | 2 +- ext/native/thin3d/GLQueueRunner.cpp | 6 +- 6 files changed, 150 insertions(+), 7 deletions(-) diff --git a/GPU/GLES/FragmentShaderGeneratorGLES.cpp b/GPU/GLES/FragmentShaderGeneratorGLES.cpp index 913e6b582918..d8b117601f48 100644 --- a/GPU/GLES/FragmentShaderGeneratorGLES.cpp +++ b/GPU/GLES/FragmentShaderGeneratorGLES.cpp @@ -157,6 +157,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ); bool doTextureAlpha = id.Bit(FS_BIT_TEXALPHA); bool doFlatShading = id.Bit(FS_BIT_FLATSHADE); + bool shaderDepal = id.Bit(FS_BIT_SHADER_DEPAL); GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3); GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2); @@ -217,6 +218,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform } } + if (shaderDepal) { + WRITE(p, "uniform sampler2D pal;\n"); + WRITE(p, "uniform int u_depal;\n"); + *uniformMask |= DIRTY_DEPAL; + } + StencilValueType replaceAlphaWithStencilType = (StencilValueType)id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4); if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) { *uniformMask |= DIRTY_STENCILREPLACEVALUE; @@ -336,10 +343,95 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform if (doTextureProjection) { WRITE(p, " vec4 t = %sProj(tex, %s);\n", texture, texcoord); + if (shaderDepal) { + WRITE(p, " vec4 t1 = %sProjOffset(tex, %s, ivec2(1, 0));\n", texture, texcoord); + WRITE(p, " vec4 t2 = %sProjOffset(tex, %s, ivec2(0, 1));\n", texture, texcoord); + WRITE(p, " vec4 t3 = %sProjOffset(tex, %s, ivec2(1, 1));\n", texture, texcoord); + } } else { WRITE(p, " vec4 t = %s(tex, %s.xy);\n", texture, texcoord); + if (shaderDepal) { + WRITE(p, " vec4 t1 = %sOffset(tex, %s.xy, ivec2(1, 0));\n", texture, texcoord); + WRITE(p, " vec4 t2 = %sOffset(tex, %s.xy, ivec2(0, 1));\n", texture, texcoord); + WRITE(p, " vec4 t3 = %sOffset(tex, %s.xy, ivec2(1, 1));\n", texture, texcoord); + } } - WRITE(p, " vec4 p = v_color0;\n"); + + if (shaderDepal) { + WRITE(p, " int depalMask = (u_depal & 0xFF);\n"); + WRITE(p, " int depalShift = ((u_depal >> 8) & 0xFF);\n"); + WRITE(p, " int depalOffset = (((u_depal >> 16) & 0xFF) << 4);\n"); + WRITE(p, " int depalFmt = ((u_depal >> 24) & 0x3);\n"); + WRITE(p, " bool bilinear = (u_depal >> 31) != 0;\n"); + WRITE(p, " vec2 fraction = fract(%s.xy);\n", texcoord); + WRITE(p, " ivec4 col; int index0; int index1; int index2; int index3;\n"); + WRITE(p, " switch (depalFmt) {\n"); // We might want to include fmt in the shader ID if this is a performance issue. + WRITE(p, " case 0:\n"); // 565 + WRITE(p, " col = ivec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index0 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = ivec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index1 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = ivec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index2 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = ivec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0);\n"); + WRITE(p, " index3 = (col.b << 11) | (col.g << 5) | (col.r);\n"); + WRITE(p, " }\n"); + WRITE(p, " break;\n"); + WRITE(p, " case 1:\n"); // 5551 + WRITE(p, " col = ivec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index0 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = ivec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index1 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = ivec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index2 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " col = ivec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n"); + WRITE(p, " index3 = (col.a << 15) | (col.b << 10) | (col.g << 5) | (col.r);\n"); + WRITE(p, " }\n"); + WRITE(p, " break;\n"); + WRITE(p, " case 2:\n"); // 4444 + WRITE(p, " col = ivec4(t.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index0 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = ivec4(t1.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index1 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " col = ivec4(t2.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index2 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " col = ivec4(t3.rgba * vec4(15.99, 15.99, 15.99, 15.99));\n"); + WRITE(p, " index3 = (col.a << 12) | (col.b << 8) | (col.g << 4) | (col.r);\n"); + WRITE(p, " }\n"); + WRITE(p, " break;\n"); + WRITE(p, " case 3:\n"); // 8888 + WRITE(p, " col = ivec4(t.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index0 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " col = ivec4(t1.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index1 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " col = ivec4(t2.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index2 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " col = ivec4(t3.rgba * vec4(255.99, 255.99, 255.99, 255.99));\n"); + WRITE(p, " index3 = (col.a << 24) | (col.b << 16) | (col.g << 8) | (col.r);\n"); + WRITE(p, " }\n"); + WRITE(p, " break;\n"); + WRITE(p, " };\n"); + WRITE(p, " index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " t = texelFetch(pal, ivec2(index0, 0), 0);\n"); + WRITE(p, " if (bilinear) {\n"); + WRITE(p, " index1 = ((index1 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " index2 = ((index2 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " index3 = ((index3 >> depalShift) & depalMask) | depalOffset;\n"); + WRITE(p, " t1 = texelFetch(pal, ivec2(index1, 0), 0);\n"); + WRITE(p, " t2 = texelFetch(pal, ivec2(index2, 0), 0);\n"); + WRITE(p, " t3 = texelFetch(pal, ivec2(index3, 0), 0);\n"); + WRITE(p, " t = mix(t, t1, fraction.x);\n"); + WRITE(p, " t2 = mix(t2, t3, fraction.x);\n"); + WRITE(p, " t = mix(t, t2, fraction.y);\n"); + WRITE(p, " }\n"); + } + + if (texFunc != GE_TEXFUNC_REPLACE || !doTextureAlpha) + WRITE(p, " vec4 p = v_color0;\n"); if (doTextureAlpha) { // texfmt == RGBA switch (texFunc) { diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp index 0fee3e6500ef..63bf981bcc76 100644 --- a/GPU/GLES/ShaderManagerGLES.cpp +++ b/GPU/GLES/ShaderManagerGLES.cpp @@ -103,6 +103,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs, queries.push_back({ &u_blendFixA, "u_blendFixA" }); queries.push_back({ &u_blendFixB, "u_blendFixB" }); queries.push_back({ &u_fbotexSize, "u_fbotexSize" }); + queries.push_back({ &u_pal, "pal" }); // Transform queries.push_back({ &u_view, "u_view" }); @@ -161,6 +162,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs, queries.push_back({ &u_spline_count_v, "u_spline_count_v" }); queries.push_back({ &u_spline_type_u, "u_spline_type_u" }); queries.push_back({ &u_spline_type_v, "u_spline_type_v" }); + queries.push_back({ &u_depal, "u_depal" }); attrMask = vs->GetAttrMask(); availableUniforms = vs->GetUniformMask() | fs->GetUniformMask(); @@ -169,6 +171,7 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs, initialize.push_back({ &u_tex, 0, 0 }); initialize.push_back({ &u_fbotex, 0, 1 }); initialize.push_back({ &u_testtex, 0, 2 }); + initialize.push_back({ &u_pal, 0, 3 }); // CLUT initialize.push_back({ &u_tess_pos_tex, 0, 4 }); // Texture unit 4 initialize.push_back({ &u_tess_tex_tex, 0, 5 }); // Texture unit 5 initialize.push_back({ &u_tess_col_tex, 0, 6 }); // Texture unit 6 @@ -283,6 +286,17 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) { if (!dirty) return; + if (dirty & DIRTY_DEPAL) { + int indexMask = gstate.getClutIndexMask(); + int indexShift = gstate.getClutIndexShift(); + int indexOffset = gstate.getClutIndexStartPos() >> 4; + int format = gstate_c.depalFramebufferFormat; + uint32_t val = BytesToUint32(indexMask, indexShift, indexOffset, format); + // Poke in a bilinear filter flag in the top bit. + val |= gstate.isMagnifyFilteringEnabled() << 31; + render_->SetUniformI1(&u_depal, val); + } + // Update any dirty uniforms before we draw if (dirty & DIRTY_PROJMATRIX) { Matrix4x4 flippedMatrix; @@ -810,7 +824,7 @@ std::string ShaderManagerGLES::DebugGetShaderString(std::string id, DebugShaderT // as sometimes these features might have an effect on the ID bits. #define CACHE_HEADER_MAGIC 0x83277592 -#define CACHE_VERSION 11 +#define CACHE_VERSION 12 struct CacheHeader { uint32_t magic; uint32_t version; diff --git a/GPU/GLES/ShaderManagerGLES.h b/GPU/GLES/ShaderManagerGLES.h index e04a9a8c8b0e..c9ca56f15a7d 100644 --- a/GPU/GLES/ShaderManagerGLES.h +++ b/GPU/GLES/ShaderManagerGLES.h @@ -85,6 +85,10 @@ class LinkedShader { int u_blendFixB; int u_fbotexSize; + // Shader depal + int u_pal; // the texture + int u_depal; // the params + // Fragment processing inputs int u_alphacolorref; int u_alphacolormask; diff --git a/GPU/GLES/TextureCacheGLES.cpp b/GPU/GLES/TextureCacheGLES.cpp index 38d5aa280a26..031c1bdad5d0 100644 --- a/GPU/GLES/TextureCacheGLES.cpp +++ b/GPU/GLES/TextureCacheGLES.cpp @@ -161,7 +161,7 @@ void TextureCacheGLES::UpdateSamplingParams(TexCacheEntry &entry, bool force) { render_->SetTextureSampler(0, sClamp ? GL_CLAMP_TO_EDGE : GL_REPEAT, tClamp ? GL_CLAMP_TO_EDGE : GL_REPEAT, MagFiltGL[magFilt], MinFiltGL[minFilt], aniso); } -void TextureCacheGLES::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight) { +void TextureCacheGLES::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight, bool forcePoint) { int minFilt; int magFilt; bool sClamp; @@ -171,6 +171,10 @@ void TextureCacheGLES::SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferH GetSamplingParams(minFilt, magFilt, sClamp, tClamp, lodBias, 0, 0, mode); minFilt &= 1; // framebuffers can't mipmap. + if (forcePoint) { + minFilt &= ~1; + magFilt &= ~1; + } // Often the framebuffer will not match the texture size. We'll wrap/clamp in the shader in that case. // This happens whether we have OES_texture_npot or not. @@ -324,6 +328,7 @@ void TextureCacheGLES::BindTexture(TexCacheEntry *entry) { lastBoundTexture = entry->textureName; } UpdateSamplingParams(*entry, false); + gstate_c.useShaderDepal = false; } void TextureCacheGLES::Unbind() { @@ -434,7 +439,33 @@ class TextureShaderApplier { void TextureCacheGLES::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFramebuffer *framebuffer) { DepalShader *depal = nullptr; uint32_t clutMode = gstate.clutformat & 0xFFFFFF; + +#if 0 + bool useShaderDepal = gstate_c.Supports(GPU_SUPPORTS_GLSL_ES_300); +#else + bool useShaderDepal = false; +#endif + if ((entry->status & TexCacheEntry::STATUS_DEPALETTIZE) && !g_Config.bDisableSlowFramebufEffects) { + if (useShaderDepal) { + const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); + GLRTexture *clutTexture = depalShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBuf_); + render_->BindTexture(TEX_SLOT_CLUT, clutTexture); + framebufferManagerGL_->BindFramebufferAsColorTexture(0, framebuffer, BINDFBCOLOR_MAY_COPY_WITH_UV | BINDFBCOLOR_APPLY_TEX_OFFSET); + SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, true); + InvalidateLastTexture(); + + // Since we started/ended render passes, might need these. + gstate_c.Dirty(DIRTY_DEPAL); + gstate_c.useShaderDepal = true; + gstate_c.depalFramebufferFormat = framebuffer->drawnFormat; + const u32 bytesPerColor = clutFormat == GE_CMODE_32BIT_ABGR8888 ? sizeof(u32) : sizeof(u16); + const u32 clutTotalColors = clutMaxBytes_ / bytesPerColor; + TexCacheEntry::TexStatus alphaStatus = CheckAlpha((const uint8_t *)clutBuf_, getClutDestFormat(clutFormat), clutTotalColors, clutTotalColors, 1); + gstate_c.SetTextureFullAlpha(alphaStatus == TexCacheEntry::STATUS_ALPHA_FULL); + return; + } + depal = depalShaderCache_->GetDepalettizeShader(clutMode, framebuffer->drawnFormat); } if (depal) { @@ -472,7 +503,7 @@ void TextureCacheGLES::ApplyTextureFramebuffer(TexCacheEntry *entry, VirtualFram } framebufferManagerGL_->RebindFramebuffer(); - SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight); + SetFramebufferSamplingParams(framebuffer->bufferWidth, framebuffer->bufferHeight, false); InvalidateLastTexture(); diff --git a/GPU/GLES/TextureCacheGLES.h b/GPU/GLES/TextureCacheGLES.h index a88eb8d7ecbe..be60d7eb772d 100644 --- a/GPU/GLES/TextureCacheGLES.h +++ b/GPU/GLES/TextureCacheGLES.h @@ -63,7 +63,7 @@ class TextureCacheGLES : public TextureCacheCommon { } } - void SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight); + void SetFramebufferSamplingParams(u16 bufferWidth, u16 bufferHeight, bool forcePoint); bool GetCurrentTextureDebug(GPUDebugBuffer &buffer, int level) override; void DeviceLost(); diff --git a/ext/native/thin3d/GLQueueRunner.cpp b/ext/native/thin3d/GLQueueRunner.cpp index ce13132cce10..653125d3fe70 100644 --- a/ext/native/thin3d/GLQueueRunner.cpp +++ b/ext/native/thin3d/GLQueueRunner.cpp @@ -4,6 +4,7 @@ #include "GLRenderManager.h" #include "DataFormatGL.h" #include "base/logging.h" +#include "base/stringutil.h" #include "gfx/gl_common.h" #include "gfx/gl_debug_log.h" #include "gfx_es2/gpu_features.h" @@ -156,9 +157,10 @@ void GLQueueRunner::RunInitSteps(const std::vector &steps) { #ifdef _WIN32 OutputDebugStringUTF8(buf); - OutputDebugStringUTF8(vsCode); + if (vsCode) + OutputDebugStringUTF8(LineNumberString(vsCode).c_str()); if (fsCode) - OutputDebugStringUTF8(fsCode); + OutputDebugStringUTF8(LineNumberString(fsCode).c_str()); #endif delete[] buf; } else { From f17890623c4fe4c6274c6c97d40e84a5a72a4fb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 13 Apr 2018 20:57:36 +0200 Subject: [PATCH 8/8] Shader depal: fix bilinear filter coord --- GPU/GLES/FragmentShaderGeneratorGLES.cpp | 2 +- GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/GLES/FragmentShaderGeneratorGLES.cpp b/GPU/GLES/FragmentShaderGeneratorGLES.cpp index d8b117601f48..421d8a9f9243 100644 --- a/GPU/GLES/FragmentShaderGeneratorGLES.cpp +++ b/GPU/GLES/FragmentShaderGeneratorGLES.cpp @@ -363,7 +363,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, uint64_t *uniform WRITE(p, " int depalOffset = (((u_depal >> 16) & 0xFF) << 4);\n"); WRITE(p, " int depalFmt = ((u_depal >> 24) & 0x3);\n"); WRITE(p, " bool bilinear = (u_depal >> 31) != 0;\n"); - WRITE(p, " vec2 fraction = fract(%s.xy);\n", texcoord); + WRITE(p, " vec2 fraction = fract(%s.xy * vec2(textureSize(tex, 0).xy));\n", texcoord); WRITE(p, " ivec4 col; int index0; int index1; int index2; int index3;\n"); WRITE(p, " switch (depalFmt) {\n"); // We might want to include fmt in the shader ID if this is a performance issue. WRITE(p, " case 0:\n"); // 565 diff --git a/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp b/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp index 08327ec7475e..c00c35082b38 100644 --- a/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp +++ b/GPU/Vulkan/FragmentShaderGeneratorVulkan.cpp @@ -197,7 +197,7 @@ bool GenerateVulkanGLSLFragmentShader(const FShaderID &id, char *buffer) { WRITE(p, " uint depalOffset = ((base.depal_mask_shift_off_fmt >> 16) & 0xFF) << 4;\n"); WRITE(p, " uint depalFmt = (base.depal_mask_shift_off_fmt >> 24) & 0x3;\n"); WRITE(p, " bool bilinear = (base.depal_mask_shift_off_fmt >> 31) != 0;\n"); - WRITE(p, " vec2 fraction = fract(%s.xy);\n", texcoord); + WRITE(p, " vec2 fraction = fract(%s.xy * vec2(textureSize(tex, 0).xy));\n", texcoord); WRITE(p, " uvec4 col; uint index0; uint index1; uint index2; uint index3;\n"); WRITE(p, " switch (depalFmt) {\n"); // We might want to include fmt in the shader ID if this is a performance issue. WRITE(p, " case 0:\n"); // 565