diff --git a/Common/GPU/ShaderWriter.cpp b/Common/GPU/ShaderWriter.cpp index 0332e4d3460e..69219e0f9422 100644 --- a/Common/GPU/ShaderWriter.cpp +++ b/Common/GPU/ShaderWriter.cpp @@ -10,7 +10,6 @@ const char * const vulkan_glsl_preamble_fs = "#version 450\n" "#extension GL_ARB_separate_shader_objects : enable\n" "#extension GL_ARB_shading_language_420pack : enable\n" -"#extension GL_ARB_conservative_depth : enable\n" "#extension GL_ARB_shader_image_load_store : enable\n" "#define splat3(x) vec3(x)\n" "#define DISCARD discard\n" diff --git a/Common/GPU/Vulkan/VulkanBarrier.cpp b/Common/GPU/Vulkan/VulkanBarrier.cpp index 125d51ea6de9..e4f2d0908933 100644 --- a/Common/GPU/Vulkan/VulkanBarrier.cpp +++ b/Common/GPU/Vulkan/VulkanBarrier.cpp @@ -4,7 +4,7 @@ void VulkanBarrier::Flush(VkCommandBuffer cmd) { if (!imageBarriers_.empty()) { - vkCmdPipelineBarrier(cmd, srcStageMask_, dstStageMask_, 0, 0, nullptr, 0, nullptr, (uint32_t)imageBarriers_.size(), imageBarriers_.data()); + vkCmdPipelineBarrier(cmd, srcStageMask_, dstStageMask_, dependencyFlags_, 0, nullptr, 0, nullptr, (uint32_t)imageBarriers_.size(), imageBarriers_.data()); } imageBarriers_.clear(); srcStageMask_ = 0; diff --git a/Common/GPU/Vulkan/VulkanBarrier.h b/Common/GPU/Vulkan/VulkanBarrier.h index eb949dd2f047..0d5754b3f4f4 100644 --- a/Common/GPU/Vulkan/VulkanBarrier.h +++ b/Common/GPU/Vulkan/VulkanBarrier.h @@ -21,6 +21,7 @@ class VulkanBarrier { ) { srcStageMask_ |= srcStageMask; dstStageMask_ |= dstStageMask; + dependencyFlags_ |= VK_DEPENDENCY_BY_REGION_BIT; VkImageMemoryBarrier imageBarrier; imageBarrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; @@ -112,4 +113,5 @@ class VulkanBarrier { VkPipelineStageFlags srcStageMask_ = 0; VkPipelineStageFlags dstStageMask_ = 0; std::vector imageBarriers_; + VkDependencyFlags dependencyFlags_ = 0; }; diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.cpp b/Common/GPU/Vulkan/VulkanQueueRunner.cpp index 9b7ae11ea5b5..2a4342af17ff 100644 --- 
a/Common/GPU/Vulkan/VulkanQueueRunner.cpp +++ b/Common/GPU/Vulkan/VulkanQueueRunner.cpp @@ -129,7 +129,7 @@ void VulkanQueueRunner::DestroyDeviceObjects() { } void VulkanQueueRunner::InitBackbufferRenderPass() { - VkAttachmentDescription attachments[2]; + VkAttachmentDescription attachments[2]{}; attachments[0].format = vulkan_->GetSwapchainFormat(); attachments[0].samples = VK_SAMPLE_COUNT_1_BIT; attachments[0].loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR; @@ -138,7 +138,6 @@ void VulkanQueueRunner::InitBackbufferRenderPass() { attachments[0].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; attachments[0].initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; // We don't want to preserve the backbuffer between frames so we really don't care. attachments[0].finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; // We only render once to the backbuffer per frame so we can do this here. - attachments[0].flags = 0; attachments[1].format = vulkan_->GetDeviceInfo().preferredDepthStencilFormat; // must use this same format later for the back depth buffer. attachments[1].samples = VK_SAMPLE_COUNT_1_BIT; @@ -148,7 +147,6 @@ void VulkanQueueRunner::InitBackbufferRenderPass() { attachments[1].stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; attachments[1].initialLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; attachments[1].finalLayout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - attachments[1].flags = 0; VkAttachmentReference color_reference{}; color_reference.attachment = 0; @@ -172,6 +170,7 @@ void VulkanQueueRunner::InitBackbufferRenderPass() { // For the built-in layout transitions. 
VkSubpassDependency dep{}; + dep.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT; dep.srcSubpass = VK_SUBPASS_EXTERNAL; dep.dstSubpass = 0; dep.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; @@ -208,12 +207,16 @@ static VkAttachmentStoreOp ConvertStoreAction(VKRRenderPassStoreAction action) { return VK_ATTACHMENT_STORE_OP_DONT_CARE; // avoid compiler warning } +// Self-dependency: https://github.com/gpuweb/gpuweb/issues/442#issuecomment-547604827 +// Also see https://www.khronos.org/registry/vulkan/specs/1.3-extensions/html/vkspec.html#synchronization-pipeline-barriers-subpass-self-dependencies VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { auto pass = renderPasses_.Get(key); if (pass) { return pass; } + bool selfDependency = true || key.selfDependencyColor; + VkAttachmentDescription attachments[2] = {}; attachments[0].format = VK_FORMAT_R8G8B8A8_UNORM; attachments[0].samples = VK_SAMPLE_COUNT_1_BIT; @@ -237,7 +240,7 @@ VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { VkAttachmentReference color_reference{}; color_reference.attachment = 0; - color_reference.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + color_reference.layout = selfDependency ? 
VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; VkAttachmentReference depth_reference{}; depth_reference.attachment = 1; @@ -246,8 +249,13 @@ VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { VkSubpassDescription subpass{}; subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; subpass.flags = 0; - subpass.inputAttachmentCount = 0; - subpass.pInputAttachments = nullptr; + if (selfDependency) { + subpass.inputAttachmentCount = 1; + subpass.pInputAttachments = &color_reference; + } else { + subpass.inputAttachmentCount = 0; + subpass.pInputAttachments = nullptr; + } subpass.colorAttachmentCount = 1; subpass.pColorAttachments = &color_reference; subpass.pResolveAttachments = nullptr; @@ -261,6 +269,20 @@ VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { rp.subpassCount = 1; rp.pSubpasses = &subpass; + // Must be declared outside the "if" so it is still alive when vkCreateRenderPass reads rp.pDependencies. VkSubpassDependency has no sType member, so just zero-initialize it. + VkSubpassDependency self_dep{}; + if (selfDependency) { + self_dep.dependencyFlags = VK_DEPENDENCY_BY_REGION_BIT; + self_dep.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + self_dep.dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; + self_dep.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + self_dep.dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + self_dep.srcSubpass = 0; + self_dep.dstSubpass = 0; + rp.dependencyCount = 1; + rp.pDependencies = &self_dep; + } + VkResult res = vkCreateRenderPass(vulkan_->GetDevice(), &rp, nullptr, &pass); _assert_(res == VK_SUCCESS); _assert_(pass != VK_NULL_HANDLE); @@ -268,6 +290,30 @@ VkRenderPass VulkanQueueRunner::GetRenderPass(const RPKey &key) { return pass; } +// Must match the subpass self-dependency declared above. 
+void VulkanQueueRunner::SelfDependencyBarrier(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier) { + if (aspect & VK_IMAGE_ASPECT_COLOR_BIT) { + VkAccessFlags srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + VkAccessFlags dstAccessMask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT; + VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT; + recordBarrier->TransitionImage( + img.image, + 0, + 1, + aspect, + VK_IMAGE_LAYOUT_GENERAL, + VK_IMAGE_LAYOUT_GENERAL, + srcAccessMask, + dstAccessMask, + srcStageMask, + dstStageMask + ); + } else { + _assert_msg_(false, "Depth self-dependencies not yet supported"); + } +} + void VulkanQueueRunner::PreprocessSteps(std::vector &steps) { // Optimizes renderpasses, then sequences them. // Planned optimizations: @@ -1252,6 +1298,15 @@ void VulkanQueueRunner::PerformRenderPass(const VKRStep &step, VkCommandBuffer c break; } + case VKRRenderCommand::SELF_DEPENDENCY_BARRIER: + { + _assert_(step.render.selfDependency); + VulkanBarrier barrier; + SelfDependencyBarrier(step.render.framebuffer->color, VK_IMAGE_ASPECT_COLOR_BIT, &barrier); + barrier.Flush(cmd); + break; + } + case VKRRenderCommand::PUSH_CONSTANTS: vkCmdPushConstants(cmd, pipelineLayout, c.push.stages, c.push.offset, c.push.size, c.push.data); break; diff --git a/Common/GPU/Vulkan/VulkanQueueRunner.h b/Common/GPU/Vulkan/VulkanQueueRunner.h index ce6dc9f5eb48..ed0b9046af21 100644 --- a/Common/GPU/Vulkan/VulkanQueueRunner.h +++ b/Common/GPU/Vulkan/VulkanQueueRunner.h @@ -37,6 +37,7 @@ enum class VKRRenderCommand : uint8_t { DRAW, DRAW_INDEXED, PUSH_CONSTANTS, + SELF_DEPENDENCY_BARRIER, NUM_RENDER_COMMANDS, }; @@ -166,6 +167,7 @@ struct VKRStep { VKRRenderPassStoreAction depthStore; VKRRenderPassStoreAction stencilStore; u8 clearStencil; + bool selfDependency; uint32_t clearColor; float clearDepth; int numDraws; @@ -244,6 +246,11 @@ class 
VulkanQueueRunner { VKRRenderPassStoreAction colorStoreAction; VKRRenderPassStoreAction depthStoreAction; VKRRenderPassStoreAction stencilStoreAction; + + // Sets up a renderpass that can read from the texture being rendered to by using an input attachment. + // Can be used for limited programmable blending with no additional extensions, or unlimited programmable + // blending using VK_ARM_rasterization_order_attachment_access or VK_EXT_fragment_shader_interlock. + bool selfDependencyColor; }; VkRenderPass GetRenderPass(const RPKey &key); @@ -297,6 +304,8 @@ class VulkanQueueRunner { static void SetupTransitionToTransferSrc(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier); static void SetupTransitionToTransferDst(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier); + static void SelfDependencyBarrier(VKRImage &img, VkImageAspectFlags aspect, VulkanBarrier *recordBarrier); + VulkanContext *vulkan_; VkFramebuffer backbuffer_ = VK_NULL_HANDLE; diff --git a/Common/GPU/Vulkan/VulkanRenderManager.cpp b/Common/GPU/Vulkan/VulkanRenderManager.cpp index a4e14a1b8d8f..663db3b7bf69 100644 --- a/Common/GPU/Vulkan/VulkanRenderManager.cpp +++ b/Common/GPU/Vulkan/VulkanRenderManager.cpp @@ -174,7 +174,7 @@ void CreateImage(VulkanContext *vulkan, VkCommandBuffer cmd, VKRImage &img, int // Strictly speaking we don't yet need VK_IMAGE_USAGE_SAMPLED_BIT for depth buffers since we do not yet sample depth buffers. 
ici.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT; if (color) { - ici.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + ici.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT; } else { ici.usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; } @@ -683,6 +683,12 @@ void VulkanRenderManager::EndCurRenderStep() { } } +void VulkanRenderManager::BindCurrentFramebufferAsInputAttachment0(VkImageAspectFlags aspectBits) { + _dbg_assert_(curRenderStep_); + curRenderStep_->render.selfDependency = true; + curRenderStep_->commands.push_back(VkRenderData{ VKRRenderCommand::SELF_DEPENDENCY_BARRIER }); +} + void VulkanRenderManager::BindFramebufferAsRenderTarget(VKRFramebuffer *fb, VKRRenderPassLoadAction color, VKRRenderPassLoadAction depth, VKRRenderPassLoadAction stencil, uint32_t clearColor, float clearDepth, uint8_t clearStencil, const char *tag) { _dbg_assert_(insideFrame_); // Eliminate dupes (bind of the framebuffer we already are rendering to), instantly convert to a clear if possible. diff --git a/Common/GPU/Vulkan/VulkanRenderManager.h b/Common/GPU/Vulkan/VulkanRenderManager.h index 67dff36880f6..554a1220292d 100644 --- a/Common/GPU/Vulkan/VulkanRenderManager.h +++ b/Common/GPU/Vulkan/VulkanRenderManager.h @@ -220,6 +220,8 @@ class VulkanRenderManager { // as the other backends, even though there's no actual binding happening here. 
VkImageView BindFramebufferAsTexture(VKRFramebuffer *fb, int binding, VkImageAspectFlags aspectBits, int attachment); + void BindCurrentFramebufferAsInputAttachment0(VkImageAspectFlags aspectBits); + bool CopyFramebufferToMemorySync(VKRFramebuffer *src, VkImageAspectFlags aspectBits, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag); void CopyImageToMemorySync(VkImage image, int mipLevel, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag); diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp index 8a1112d4465c..cfbb3de8ffea 100644 --- a/Common/GPU/Vulkan/thin3d_vulkan.cpp +++ b/Common/GPU/Vulkan/thin3d_vulkan.cpp @@ -391,9 +391,10 @@ class VKContext : public DrawContext { // These functions should be self explanatory. void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override; Framebuffer *GetCurrentRenderTarget() override { - return curFramebuffer_; + return (Framebuffer *)curFramebuffer_.ptr; } void BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) override; + void BindCurrentFramebufferForColorInput() override; void GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) override; @@ -462,34 +463,7 @@ class VKContext : public DrawContext { std::vector GetFeatureList() const override; std::vector GetExtensionList() const override; - uint64_t GetNativeObject(NativeObject obj, void *srcObject) override { - switch (obj) { - case NativeObject::CONTEXT: - return (uint64_t)vulkan_; - case NativeObject::FRAMEBUFFER_RENDERPASS: - // Return a representative renderpass. 
- return (uint64_t)renderManager_.GetFramebufferRenderPass(); - case NativeObject::BACKBUFFER_RENDERPASS: - return (uint64_t)renderManager_.GetBackbufferRenderPass(); - case NativeObject::COMPATIBLE_RENDERPASS: - return (uint64_t)renderManager_.GetCompatibleRenderPass(); - case NativeObject::INIT_COMMANDBUFFER: - return (uint64_t)renderManager_.GetInitCmd(); - case NativeObject::BOUND_TEXTURE0_IMAGEVIEW: - return (uint64_t)boundImageView_[0]; - case NativeObject::BOUND_TEXTURE1_IMAGEVIEW: - return (uint64_t)boundImageView_[1]; - case NativeObject::RENDER_MANAGER: - return (uint64_t)(uintptr_t)&renderManager_; - case NativeObject::NULL_IMAGEVIEW: - return (uint64_t)GetNullTexture()->GetImageView(); - case NativeObject::TEXTURE_VIEW: - return (uint64_t)(((VKTexture *)srcObject)->GetImageView()); - default: - Crash(); - return 0; - } - } + uint64_t GetNativeObject(NativeObject obj, void *srcObject) override; void HandleEvent(Event ev, int width, int height, void *param1, void *param2) override; @@ -518,7 +492,7 @@ class VKContext : public DrawContext { VkDescriptorSetLayout descriptorSetLayout_ = VK_NULL_HANDLE; VkPipelineLayout pipelineLayout_ = VK_NULL_HANDLE; VkPipelineCache pipelineCache_ = VK_NULL_HANDLE; - AutoRef curFramebuffer_; + AutoRef curFramebuffer_; VkDevice device_; VkQueue queue_; @@ -796,6 +770,7 @@ VKContext::VKContext(VulkanContext *vulkan, bool splitSubmit) caps_.textureNPOTFullySupported = true; caps_.fragmentShaderDepthWriteSupported = true; caps_.logicOpSupported = vulkan->GetDeviceFeatures().enabled.logicOp != 0; + caps_.framebufferFetchSupported = true; // Limited, through input attachments and self-dependencies. 
auto deviceProps = vulkan->GetPhysicalDeviceProperties(vulkan_->GetCurrentPhysicalDeviceIndex()).properties; switch (deviceProps.vendorID) { @@ -1582,6 +1557,10 @@ void VKContext::BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChanne boundImageView_[binding] = renderManager_.BindFramebufferAsTexture(fb->GetFB(), binding, aspect, attachment); } +void VKContext::BindCurrentFramebufferForColorInput() { + renderManager_.BindCurrentFramebufferAsInputAttachment0(VK_IMAGE_ASPECT_COLOR_BIT); +} + void VKContext::GetFramebufferDimensions(Framebuffer *fbo, int *w, int *h) { VKFramebuffer *fb = (VKFramebuffer *)fbo; if (fb) { @@ -1622,4 +1601,35 @@ void VKContext::InvalidateFramebuffer(FBInvalidationStage stage, uint32_t channe } } +uint64_t VKContext::GetNativeObject(NativeObject obj, void *srcObject) { + switch (obj) { + case NativeObject::CONTEXT: + return (uint64_t)vulkan_; + case NativeObject::FRAMEBUFFER_RENDERPASS: + // Return a representative renderpass. + return (uint64_t)renderManager_.GetFramebufferRenderPass(); + case NativeObject::BACKBUFFER_RENDERPASS: + return (uint64_t)renderManager_.GetBackbufferRenderPass(); + case NativeObject::COMPATIBLE_RENDERPASS: + return (uint64_t)renderManager_.GetCompatibleRenderPass(); + case NativeObject::INIT_COMMANDBUFFER: + return (uint64_t)renderManager_.GetInitCmd(); + case NativeObject::BOUND_TEXTURE0_IMAGEVIEW: + return (uint64_t)boundImageView_[0]; + case NativeObject::BOUND_TEXTURE1_IMAGEVIEW: + return (uint64_t)boundImageView_[1]; + case NativeObject::RENDER_MANAGER: + return (uint64_t)(uintptr_t)&renderManager_; + case NativeObject::NULL_IMAGEVIEW: + return (uint64_t)GetNullTexture()->GetImageView(); + case NativeObject::TEXTURE_VIEW: + return (uint64_t)(((VKTexture *)srcObject)->GetImageView()); + case NativeObject::BOUND_FRAMEBUFFER_COLOR_IMAGEVIEW: + return (uint64_t)curFramebuffer_->GetFB()->color.imageView; + default: + Crash(); + return 0; + } +} + } // namespace Draw diff --git a/Common/GPU/thin3d.h 
b/Common/GPU/thin3d.h index 21d605b6cda1..58bec035a36f 100644 --- a/Common/GPU/thin3d.h +++ b/Common/GPU/thin3d.h @@ -245,6 +245,7 @@ enum class NativeObject { INIT_COMMANDBUFFER, BOUND_TEXTURE0_IMAGEVIEW, BOUND_TEXTURE1_IMAGEVIEW, + BOUND_FRAMEBUFFER_COLOR_IMAGEVIEW, RENDER_MANAGER, TEXTURE_VIEW, NULL_IMAGEVIEW, @@ -653,6 +654,9 @@ class DrawContext { // binding must be < MAX_TEXTURE_SLOTS (0, 1 are okay if it's 2). virtual void BindFramebufferAsTexture(Framebuffer *fbo, int binding, FBChannel channelBit, int attachment) = 0; + // Framebuffer fetch / input attachment support, needs to be explicit in Vulkan. + virtual void BindCurrentFramebufferForColorInput() {} + // deprecated, only used by D3D9 virtual uintptr_t GetFramebufferAPITexture(Framebuffer *fbo, int channelBits, int attachment) { return 0; diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 6bc9d7437f02..09d90470ccff 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -483,12 +483,12 @@ u32 DrawEngineCommon::NormalizeVertices(u8 *outPtr, u8 *bufPtr, const u8 *inPtr, return GE_VTYPE_TC_FLOAT | GE_VTYPE_COL_8888 | GE_VTYPE_NRM_FLOAT | GE_VTYPE_POS_FLOAT | (vertType & (GE_VTYPE_IDX_MASK | GE_VTYPE_THROUGH)); } -void DrawEngineCommon::ApplyFramebufferRead(bool *fboTexNeedsBind) { +void DrawEngineCommon::ApplyFramebufferRead(FBOTexState *fboTexState) { if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) { - *fboTexNeedsBind = false; + *fboTexState = FBO_TEX_READ_FRAMEBUFFER; } else { gpuStats.numCopiesForShaderBlend++; - *fboTexNeedsBind = true; + *fboTexState = FBO_TEX_COPY_BIND_TEX; } gstate_c.Dirty(DIRTY_SHADERBLEND); diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index a8997454d9a0..e6ba0b37d9d9 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -46,6 +46,12 @@ enum { TEX_SLOT_SPLINE_WEIGHTS_V = 6, }; +enum FBOTexState { + FBO_TEX_NONE, + FBO_TEX_COPY_BIND_TEX, + 
FBO_TEX_READ_FRAMEBUFFER, +}; + inline uint32_t GetVertTypeID(uint32_t vertType, int uvGenMode) { // As the decoder depends on the UVGenMode when we use UV prescale, we simply mash it // into the top of the verttype where there are unused bits. @@ -130,7 +136,7 @@ class DrawEngineCommon { // Vertex decoding void DecodeVertsStep(u8 *dest, int &i, int &decodedVerts); - void ApplyFramebufferRead(bool *fboTexNeedsBind); + void ApplyFramebufferRead(FBOTexState *fboTexState); inline int IndexSize(u32 vtype) const { const u32 indexType = (vtype & GE_VTYPE_IDX_MASK); diff --git a/GPU/Common/FragmentShaderGenerator.cpp b/GPU/Common/FragmentShaderGenerator.cpp index 42205de0e757..5ef8f808f674 100644 --- a/GPU/Common/FragmentShaderGenerator.cpp +++ b/GPU/Common/FragmentShaderGenerator.cpp @@ -125,10 +125,12 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu GELogicOp replaceLogicOpType = isModeClear ? GE_LOGIC_COPY : (GELogicOp)id.Bits(FS_BIT_REPLACE_LOGIC_OP, 4); bool replaceLogicOp = replaceLogicOpType != GE_LOGIC_COPY && compat.bitwiseOps; - bool readFramebuffer = replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER || colorWriteMask || replaceLogicOp; - bool readFramebufferTex = readFramebuffer && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH); + bool needFramebufferRead = replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER || colorWriteMask || replaceLogicOp; - bool needFragCoord = readFramebuffer || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT); + bool fetchFramebuffer = needFramebufferRead && gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH); + bool readFramebufferTex = needFramebufferRead && !gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH); + + bool needFragCoord = readFramebufferTex || gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT); bool writeDepth = gstate_c.Supports(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT); if (shaderDepal && !doTexture) { @@ -148,6 +150,8 @@ bool GenerateFragmentShader(const FShaderID &id, char 
*buffer, const ShaderLangu if (readFramebufferTex) { WRITE(p, "layout (binding = 1) uniform sampler2D fbotex;\n"); + } else if (fetchFramebuffer) { + WRITE(p, "layout (input_attachment_index = 0, binding = 9) uniform subpassInput inputColor;\n"); } if (shaderDepal) { @@ -404,7 +408,7 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu if (!strcmp(compat.fragColor0, "fragColor0")) { const char *qualifierColor0 = "out"; - if (readFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) { + if (fetchFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) { qualifierColor0 = "inout"; } // Output the output color definitions. @@ -480,20 +484,26 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu } // Two things read from the old framebuffer - shader replacement blending and bit-level masking. - if (readFramebuffer) { + if (readFramebufferTex) { if (compat.shaderLanguage == HLSL_D3D11) { WRITE(p, " vec4 destColor = fbotex.Load(int3((int)gl_FragCoord.x, (int)gl_FragCoord.y, 0));\n"); } else if (compat.shaderLanguage == HLSL_D3D9) { WRITE(p, " vec4 destColor = tex2D(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture); - } else if (gstate_c.Supports(GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH)) { - // If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit. - // We can just read the prev value more directly. - WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData); } else if (!compat.texelFetch) { WRITE(p, " lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture); } else { WRITE(p, " lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch); } + } else if (fetchFramebuffer) { + // If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit. + // We can just read the prev value more directly. 
+ if (compat.shaderLanguage == GLSL_3xx) { + WRITE(p, " lowp vec4 destColor = %s;\n", compat.lastFragData); + } else if (compat.shaderLanguage == GLSL_VULKAN) { + WRITE(p, " lowp vec4 destColor = subpassLoad(inputColor);\n"); + } else { + _assert_msg_(false, "Need fetch destColor, but not a compatible language"); + } } if (isModeClear) { diff --git a/GPU/D3D11/StateMappingD3D11.cpp b/GPU/D3D11/StateMappingD3D11.cpp index 0cee52a15586..e594042abd1c 100644 --- a/GPU/D3D11/StateMappingD3D11.cpp +++ b/GPU/D3D11/StateMappingD3D11.cpp @@ -153,20 +153,23 @@ void DrawEngineD3D11::ApplyDrawState(int prim) { // We ignore the logicState on D3D since there's no support, the emulation of it is blend-and-shader only. if (pipelineState_.FramebufferRead()) { - bool fboTexNeedsBind = false; - ApplyFramebufferRead(&fboTexNeedsBind); + FBOTexState fboTexBindState_ = FBO_TEX_NONE; + ApplyFramebufferRead(&fboTexBindState_); // The shader takes over the responsibility for blending, so recompute. ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState); - if (fboTexNeedsBind) { + if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) { framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY); // No sampler required, we do a plain Load in the pixel shader. fboTexBound_ = true; + fboTexBindState_ = FBO_TEX_NONE; framebufferManager_->RebindFramebuffer("RebindFramebuffer - ApplyDrawState"); // Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects. 
dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE; gstate_c.Dirty(DIRTY_BLEND_STATE); + } else if (fboTexBindState_ == FBO_TEX_READ_FRAMEBUFFER) { + fboTexBindState_ = FBO_TEX_NONE; } dirtyRequiresRecheck_ |= DIRTY_FRAGMENTSHADER_STATE; diff --git a/GPU/Directx9/DrawEngineDX9.h b/GPU/Directx9/DrawEngineDX9.h index a0ee23e60d1e..9ef5b37c650a 100644 --- a/GPU/Directx9/DrawEngineDX9.h +++ b/GPU/Directx9/DrawEngineDX9.h @@ -170,6 +170,8 @@ class DrawEngineDX9 : public DrawEngineCommon { // Hardware tessellation TessellationDataTransferDX9 *tessDataTransferDX9; + FBOTexState fboTexBindState_ = FBO_TEX_NONE; + int lastRenderStepId_ = -1; bool fboTexNeedsBind_ = false; diff --git a/GPU/Directx9/StateMappingDX9.cpp b/GPU/Directx9/StateMappingDX9.cpp index 2ebadb397416..0dfa352f0007 100644 --- a/GPU/Directx9/StateMappingDX9.cpp +++ b/GPU/Directx9/StateMappingDX9.cpp @@ -99,14 +99,14 @@ void DrawEngineDX9::ApplyDrawState(int prim) { if (!gstate.isModeClear()) { textureCache_->ApplyTexture(); - if (fboTexNeedsBind_) { + if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) { // Note that this is positions, not UVs, that we need the copy from. framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY); // If we are rendering at a higher resolution, linear is probably best for the dest color. device_->SetSamplerState(1, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR); device_->SetSamplerState(1, D3DSAMP_MINFILTER, D3DTEXF_LINEAR); fboTexBound_ = true; - fboTexNeedsBind_ = false; + fboTexBindState_ = FBO_TEX_NONE; } // TODO: Test texture? @@ -133,20 +133,23 @@ void DrawEngineDX9::ApplyDrawState(int prim) { // We ignore the logicState on D3D since there's no support, the emulation of it is blend-and-shader only. if (pipelineState_.FramebufferRead()) { - bool fboTexNeedsBind = false; - ApplyFramebufferRead(&fboTexNeedsBind); + ApplyFramebufferRead(&fboTexBindState_); // The shader takes over the responsibility for blending, so recompute. 
ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState); - if (fboTexNeedsBind) { + if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) { // Note that this is positions, not UVs, that we need the copy from. framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY); // If we are rendering at a higher resolution, linear is probably best for the dest color. device_->SetSamplerState(1, D3DSAMP_MAGFILTER, D3DTEXF_LINEAR); device_->SetSamplerState(1, D3DSAMP_MINFILTER, D3DTEXF_LINEAR); fboTexBound_ = true; + fboTexBindState_ = FBO_TEX_NONE; dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE; gstate_c.Dirty(DIRTY_BLEND_STATE); + } else if (fboTexBindState_ == FBO_TEX_READ_FRAMEBUFFER) { + // Not supported. + fboTexBindState_ = FBO_TEX_NONE; } dirtyRequiresRecheck_ |= DIRTY_FRAGMENTSHADER_STATE; diff --git a/GPU/GLES/StateMappingGLES.cpp b/GPU/GLES/StateMappingGLES.cpp index dbfc115a9184..cdcc9e5069a9 100644 --- a/GPU/GLES/StateMappingGLES.cpp +++ b/GPU/GLES/StateMappingGLES.cpp @@ -149,13 +149,14 @@ void DrawEngineGLES::ApplyDrawState(int prim) { GenericLogicState &logicState = pipelineState_.logicState; if (pipelineState_.FramebufferRead()) { - bool fboTexNeedsBind = false; - ApplyFramebufferRead(&fboTexNeedsBind); + FBOTexState fboTexBindState = FBO_TEX_NONE; + ApplyFramebufferRead(&fboTexBindState); // The shader takes over the responsibility for blending, so recompute. ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState); // We copy the framebuffer here, as doing so will wipe any blend state if we do it later. - if (fboTexNeedsBind) { + // fboTexNeedsBind_ won't be set if we can read directly from the target. + if (fboTexBindState == FBO_TEX_COPY_BIND_TEX) { // Note that this is positions, not UVs, that we need the copy from. 
framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY); // If we are rendering at a higher resolution, linear is probably best for the dest color. @@ -166,6 +167,9 @@ void DrawEngineGLES::ApplyDrawState(int prim) { // Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects. dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE; gstate_c.Dirty(DIRTY_BLEND_STATE); + } else if (fboTexBindState == FBO_TEX_READ_FRAMEBUFFER) { + // No action needed here. + fboTexBindState = FBO_TEX_NONE; } dirtyRequiresRecheck_ |= DIRTY_FRAGMENTSHADER_STATE; gstate_c.Dirty(DIRTY_FRAGMENTSHADER_STATE); diff --git a/GPU/GPUState.h b/GPU/GPUState.h index 5b011374ac7b..00879ea706aa 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -485,7 +485,8 @@ enum { // Free bit: 15 GPU_SUPPORTS_DEPTH_TEXTURE = FLAG_BIT(16), GPU_SUPPORTS_ACCURATE_DEPTH = FLAG_BIT(17), - // Free bits: 18-19 + GPU_SUPPORTS_FRAGMENT_SHADER_INTERLOCK = FLAG_BIT(18), + // Free bits: 19 GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH = FLAG_BIT(20), GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT = FLAG_BIT(21), GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT = FLAG_BIT(22), diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index 0ada8b9b06b6..70bddad56cbb 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -71,6 +71,7 @@ enum { DRAW_BINDING_TESS_STORAGE_BUF = 6, DRAW_BINDING_TESS_STORAGE_BUF_WU = 7, DRAW_BINDING_TESS_STORAGE_BUF_WV = 8, + DRAW_BINDING_INPUT_ATTACHMENT = 9, }; enum { @@ -94,7 +95,10 @@ DrawEngineVulkan::DrawEngineVulkan(Draw::DrawContext *draw) void DrawEngineVulkan::InitDeviceObjects() { // All resources we need for PSP drawing. Usually only bindings 0 and 2-4 are populated. - VkDescriptorSetLayoutBinding bindings[9]{}; + + // TODO: Make things more flexible, so we at least have specialized layouts for input attachments and tess. + // Note that it becomes a support matrix.. 
+ VkDescriptorSetLayoutBinding bindings[10]{}; bindings[0].descriptorCount = 1; bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; bindings[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; @@ -132,6 +136,10 @@ void DrawEngineVulkan::InitDeviceObjects() { bindings[8].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; bindings[8].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; bindings[8].binding = DRAW_BINDING_TESS_STORAGE_BUF_WV; + bindings[9].descriptorCount = 1; + bindings[9].descriptorType = VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT; + bindings[9].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + bindings[9].binding = DRAW_BINDING_INPUT_ATTACHMENT; VulkanContext *vulkan = (VulkanContext *)draw_->GetNativeObject(Draw::NativeObject::CONTEXT); VkDevice device = vulkan->GetDevice(); @@ -414,15 +422,15 @@ VkDescriptorSet DrawEngineVulkan::GetOrCreateDescriptorSet(VkImageView imageView } if (boundSecondary_) { - tex[1].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + tex[1].imageLayout = key.secondaryIsInputAttachment ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; tex[1].imageView = boundSecondary_; tex[1].sampler = samplerSecondaryNearest_; writes[n].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; writes[n].pNext = nullptr; - writes[n].dstBinding = DRAW_BINDING_2ND_TEXTURE; + writes[n].dstBinding = key.secondaryIsInputAttachment ? DRAW_BINDING_INPUT_ATTACHMENT : DRAW_BINDING_2ND_TEXTURE; writes[n].pImageInfo = &tex[1]; writes[n].descriptorCount = 1; - writes[n].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + writes[n].descriptorType = key.secondaryIsInputAttachment ? 
VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT : VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; writes[n].dstSet = desc; n++; } diff --git a/GPU/Vulkan/DrawEngineVulkan.h b/GPU/Vulkan/DrawEngineVulkan.h index 531e05c4ed51..653dfd3f1ff8 100644 --- a/GPU/Vulkan/DrawEngineVulkan.h +++ b/GPU/Vulkan/DrawEngineVulkan.h @@ -234,6 +234,7 @@ class DrawEngineVulkan : public DrawEngineCommon { VkSampler sampler_; VkBuffer base_, light_, bone_; // All three UBO slots will be set to this. This will usually be identical // for all draws in a frame, except when the buffer has to grow. + bool secondaryIsInputAttachment; }; // We alternate between these. @@ -281,7 +282,7 @@ class DrawEngineVulkan : public DrawEngineCommon { VulkanDynamicState dynState_{}; int tessOffset_ = 0; - bool fboTexNeedsBind_ = false; + FBOTexState fboTexBindState_ = FBO_TEX_NONE; // Hardware tessellation TessellationDataTransferVulkan *tessDataTransferVulkan; diff --git a/GPU/Vulkan/FramebufferManagerVulkan.h b/GPU/Vulkan/FramebufferManagerVulkan.h index 0f5d7c4f532f..d3370fafb7a8 100644 --- a/GPU/Vulkan/FramebufferManagerVulkan.h +++ b/GPU/Vulkan/FramebufferManagerVulkan.h @@ -33,7 +33,7 @@ class VulkanPushBuffer; class FramebufferManagerVulkan : public FramebufferManagerCommon { public: - FramebufferManagerVulkan(Draw::DrawContext *draw); + explicit FramebufferManagerVulkan(Draw::DrawContext *draw); ~FramebufferManagerVulkan(); // If within a render pass, this will just issue a regular clear. 
If beginning a new render pass, diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp index 6cf56b17f3d7..13b3c466da6b 100644 --- a/GPU/Vulkan/GPU_Vulkan.cpp +++ b/GPU/Vulkan/GPU_Vulkan.cpp @@ -229,6 +229,9 @@ void GPU_Vulkan::CheckGPUFeatures() { features |= GPU_SUPPORTS_TEXTURE_FLOAT; features |= GPU_SUPPORTS_DEPTH_TEXTURE; + // input attachments + features |= GPU_SUPPORTS_ANY_FRAMEBUFFER_FETCH; + auto &enabledFeatures = vulkan->GetDeviceFeatures().enabled; if (enabledFeatures.depthClamp) { features |= GPU_SUPPORTS_DEPTH_CLAMP; diff --git a/GPU/Vulkan/StateMappingVulkan.cpp b/GPU/Vulkan/StateMappingVulkan.cpp index 6457b99ad966..7377b059a0bc 100644 --- a/GPU/Vulkan/StateMappingVulkan.cpp +++ b/GPU/Vulkan/StateMappingVulkan.cpp @@ -153,7 +153,7 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag GenericLogicState &logicState = pipelineState_.logicState; if (pipelineState_.FramebufferRead()) { - ApplyFramebufferRead(&fboTexNeedsBind_); + ApplyFramebufferRead(&fboTexBindState_); // The shader takes over the responsibility for blending, so recompute. // We might still end up using blend to write something to alpha. ApplyStencilReplaceAndLogicOpIgnoreBlend(blendState.replaceAlphaWithStencil, blendState); @@ -364,15 +364,19 @@ void DrawEngineVulkan::BindShaderBlendTex() { // TODO: At this point, we know if the vertices are full alpha or not. // Set the nearest/linear here (since we correctly know if alpha/color tests are needed)? 
if (!gstate.isModeClear()) { - if (fboTexNeedsBind_) { + if (fboTexBindState_ == FBO_TEX_COPY_BIND_TEX) { bool bindResult = framebufferManager_->BindFramebufferAsColorTexture(1, framebufferManager_->GetCurrentRenderVFB(), BINDFBCOLOR_MAY_COPY); _dbg_assert_(bindResult); boundSecondary_ = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::BOUND_TEXTURE1_IMAGEVIEW); fboTexBound_ = true; - fboTexNeedsBind_ = false; + fboTexBindState_ = FBO_TEX_NONE; // Must dirty blend state here so we re-copy next time. Example: Lunar's spell effects. dirtyRequiresRecheck_ |= DIRTY_BLEND_STATE; + } else if (fboTexBindState_ == FBO_TEX_READ_FRAMEBUFFER) { + draw_->BindCurrentFramebufferForColorInput(); + boundSecondary_ = (VkImageView)draw_->GetNativeObject(Draw::NativeObject::BOUND_FRAMEBUFFER_COLOR_IMAGEVIEW); + fboTexBindState_ = FBO_TEX_NONE; } } }