Enable depth uploads on render-to-clut-buffer. Esoteric but needed for #11100. Compat flag for now.
hrydgard · Sep 12, 2022 · b4c133a · b4c133a
1 parent 19e1f20
commit b4c133a
Show file tree

Hide file tree

Showing 15 changed files with 87 additions and 28 deletions.
diff --git a/Common/GPU/D3D11/thin3d_d3d11.cpp b/Common/GPU/D3D11/thin3d_d3d11.cpp
@@ -471,6 +471,7 @@ static DXGI_FORMAT dataFormatToD3D11(DataFormat format) {
 	case DataFormat::R8G8B8A8_UNORM_SRGB: return DXGI_FORMAT_R8G8B8A8_UNORM_SRGB;
 	case DataFormat::B8G8R8A8_UNORM: return DXGI_FORMAT_B8G8R8A8_UNORM;
 	case DataFormat::B8G8R8A8_UNORM_SRGB: return DXGI_FORMAT_B8G8R8A8_UNORM_SRGB;
+	case DataFormat::R16_UNORM: return DXGI_FORMAT_R16_UNORM;
 	case DataFormat::R16_FLOAT: return DXGI_FORMAT_R16_FLOAT;
 	case DataFormat::R16G16_FLOAT: return DXGI_FORMAT_R16G16_FLOAT;
 	case DataFormat::R16G16B16A16_FLOAT: return DXGI_FORMAT_R16G16B16A16_FLOAT;

diff --git a/Common/GPU/D3D9/thin3d_d3d9.cpp b/Common/GPU/D3D9/thin3d_d3d9.cpp
@@ -114,6 +114,7 @@ static const D3DSTENCILOP stencilOpToD3D9[] = {
 
 D3DFORMAT FormatToD3DFMT(DataFormat fmt) {
 	switch (fmt) {
+	case DataFormat::R16_UNORM: return D3DFMT_L16;  // closest match, should be a fine substitution if we ignore channels except R.
 	case DataFormat::R8G8B8A8_UNORM: return D3DFMT_A8R8G8B8;
 	case DataFormat::B8G8R8A8_UNORM: return D3DFMT_A8R8G8B8;
 	case DataFormat::R4G4B4A4_UNORM_PACK16: return D3DFMT_A4R4G4B4;  // emulated

diff --git a/Common/GPU/DataFormat.h b/Common/GPU/DataFormat.h
@@ -32,6 +32,8 @@ enum class DataFormat : uint8_t {
 	A1R5G5B5_UNORM_PACK16, // A1 in the UPPER bit.
 	A1B5G5R5_UNORM_PACK16, // A1 in the UPPER bit. OpenGL-only.
 
+	R16_UNORM,
+
 	R16_FLOAT,
 	R16G16_FLOAT,
 	R16G16B16A16_FLOAT,

diff --git a/Common/GPU/OpenGL/DataFormatGL.cpp b/Common/GPU/OpenGL/DataFormatGL.cpp
@@ -4,9 +4,15 @@
 namespace Draw {
 
 // TODO: Also output storage format (GL_RGBA8 etc) for modern GL usage.
-bool Thin3DFormatToFormatAndType(DataFormat fmt, GLuint &internalFormat, GLuint &format, GLuint &type, int &alignment) {
+bool Thin3DFormatToGLFormatAndType(DataFormat fmt, GLuint &internalFormat, GLuint &format, GLuint &type, int &alignment) {
 	alignment = 4;
 	switch (fmt) {
+	case DataFormat::R16_UNORM:  // Single 16-bit normalized channel.
+		internalFormat = GL_R16;
+		format = GL_RED;  // Pixel transfer format for one channel is GL_RED; GL_R is the texgen coordinate enum and is rejected by glTex(Sub)Image2D.
+		type = GL_UNSIGNED_SHORT;
+		break;
+
 	case DataFormat::R8G8B8A8_UNORM:
 		internalFormat = GL_RGBA;
 		format = GL_RGBA;

diff --git a/Common/GPU/OpenGL/DataFormatGL.h b/Common/GPU/OpenGL/DataFormatGL.h
@@ -5,6 +5,6 @@
 
 namespace Draw {
 
-bool Thin3DFormatToFormatAndType(DataFormat fmt, GLuint &internalFormat, GLuint &format, GLuint &type, int &alignment);
+bool Thin3DFormatToGLFormatAndType(DataFormat fmt, GLuint &internalFormat, GLuint &format, GLuint &type, int &alignment);
 
 }
diff --git a/Common/GPU/OpenGL/GLQueueRunner.cpp b/Common/GPU/OpenGL/GLQueueRunner.cpp
@@ -385,7 +385,7 @@ void GLQueueRunner::RunInitSteps(const std::vector<GLRInitStep> &steps, bool ski
 
 			GLenum internalFormat, format, type;
 			int alignment;
-			Thin3DFormatToFormatAndType(step.texture_image.format, internalFormat, format, type, alignment);
+			Thin3DFormatToGLFormatAndType(step.texture_image.format, internalFormat, format, type, alignment);
 			if (step.texture_image.depth == 1) {
 				glTexImage2D(tex->target,
 					step.texture_image.level, internalFormat,
@@ -1276,7 +1276,7 @@ void GLQueueRunner::PerformRenderPass(const GLRStep &step, bool first, bool last
 			// For things to show in RenderDoc, need to split into glTexImage2D(..., nullptr) and glTexSubImage.
 			GLuint internalFormat, format, type;
 			int alignment;
-			Thin3DFormatToFormatAndType(c.texture_subimage.format, internalFormat, format, type, alignment);
+			Thin3DFormatToGLFormatAndType(c.texture_subimage.format, internalFormat, format, type, alignment);
 			glTexSubImage2D(tex->target, c.texture_subimage.level, c.texture_subimage.x, c.texture_subimage.y, c.texture_subimage.width, c.texture_subimage.height, format, type, c.texture_subimage.data);
 			if (c.texture_subimage.allocType == GLRAllocType::ALIGNED) {
 				FreeAlignedMemory(c.texture_subimage.data);

diff --git a/Common/GPU/Vulkan/thin3d_vulkan.cpp b/Common/GPU/Vulkan/thin3d_vulkan.cpp
@@ -563,6 +563,11 @@ static int GetBpp(VkFormat format) {
 	case VK_FORMAT_R8G8B8A8_UNORM:
 	case VK_FORMAT_B8G8R8A8_UNORM:
 		return 32;
+	case VK_FORMAT_R8_UNORM:
+		return 8;
+	case VK_FORMAT_R8G8_UNORM:
+	case VK_FORMAT_R16_UNORM:
+		return 16;
 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
@@ -586,6 +591,9 @@ static VkFormat DataFormatToVulkan(DataFormat format) {
 	case DataFormat::D32F: return VK_FORMAT_D32_SFLOAT;
 	case DataFormat::D32F_S8: return VK_FORMAT_D32_SFLOAT_S8_UINT;
 	case DataFormat::S8: return VK_FORMAT_S8_UINT;
+
+	case DataFormat::R16_UNORM: return VK_FORMAT_R16_UNORM;
+
 	case DataFormat::R16_FLOAT: return VK_FORMAT_R16_SFLOAT;
 	case DataFormat::R16G16_FLOAT: return VK_FORMAT_R16G16_SFLOAT;
 	case DataFormat::R16G16B16A16_FLOAT: return VK_FORMAT_R16G16B16A16_SFLOAT;

diff --git a/Common/GPU/thin3d.cpp b/Common/GPU/thin3d.cpp
@@ -33,6 +33,9 @@ size_t DataFormatSizeInBytes(DataFormat fmt) {
 	case DataFormat::R8G8B8A8_SNORM: return 4;
 	case DataFormat::R8G8B8A8_UINT: return 4;
 	case DataFormat::R8G8B8A8_SINT: return 4;
+
+	case DataFormat::R16_UNORM: return 2;
+
 	case DataFormat::R16_FLOAT: return 2;
 	case DataFormat::R16G16_FLOAT: return 4;
 	case DataFormat::R16G16B16A16_FLOAT: return 8;

diff --git a/Common/GPU/thin3d.h b/Common/GPU/thin3d.h
@@ -557,6 +557,7 @@ typedef std::function<bool(uint8_t *data, const uint8_t *initData, uint32_t w, u
 struct TextureDesc {
 	TextureType type;
 	DataFormat format;
+
 	int width;
 	int height;
 	int depth;

diff --git a/Core/Compatibility.cpp b/Core/Compatibility.cpp
@@ -99,6 +99,7 @@ void Compatibility::CheckSettings(IniFile &iniFile, const std::string &gameID) {
 	CheckSetting(iniFile, gameID, "SplitFramebufferMargin", &flags_.SplitFramebufferMargin);
 	CheckSetting(iniFile, gameID, "ForceLowerResolutionForEffectsOn", &flags_.ForceLowerResolutionForEffectsOn);
 	CheckSetting(iniFile, gameID, "AllowDownloadCLUT", &flags_.AllowDownloadCLUT);
+	CheckSetting(iniFile, gameID, "UploadDepthForCLUTTextures", &flags_.UploadDepthForCLUTTextures);
 }
 
 void Compatibility::CheckSetting(IniFile &iniFile, const std::string &gameID, const char *option, bool *flag) {

diff --git a/Core/Compatibility.h b/Core/Compatibility.h
@@ -89,6 +89,7 @@ struct CompatFlags {
 	bool SplitFramebufferMargin;
 	bool ForceLowerResolutionForEffectsOn;
 	bool AllowDownloadCLUT;
+	bool UploadDepthForCLUTTextures;
 };
 
 class IniFile;

diff --git a/GPU/Common/FragmentShaderGenerator.cpp b/GPU/Common/FragmentShaderGenerator.cpp
@@ -745,13 +745,14 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu
 				p.C("  uv_round = floor(uv * tsize);\n");
 				p.C("  int component = int(uv_round.x) & 3;\n");
 				p.C("  uv_round.x *= 0.25;\n");
-				p.C("  vec4 t = ivec4(").LoadTexture2D("tex", "ivec2(uv_round)", 0).C(");\n");
+				p.C("  uv_round /= tsize;\n");
+				p.C("  vec4 t = ").SampleTexture2D("tex", "uv_round").C(";\n");
 				p.C("  int index;\n");
 				p.C("  switch (component) {\n");
-				p.C("  case 0: index = int(t.x * 255.99); break;\n");
-				p.C("  case 1: index = int(t.y * 255.99); break;\n");
-				p.C("  case 2: index = int(t.z * 255.99); break;\n");
-				p.C("  case 3: index = int(t.w * 255.99); break;\n");
+				p.C("  case 0: index = int(t.x * 254.99); break;\n");  // TODO: Not sure why 254.99 instead of 255.99, but it's currently needed.
+				p.C("  case 1: index = int(t.y * 254.99); break;\n");
+				p.C("  case 2: index = int(t.z * 254.99); break;\n");
+				p.C("  case 3: index = int(t.w * 254.99); break;\n");
 				p.C("  }\n");
 				p.C("  t = ").LoadTexture2D("pal", "ivec2(index, 0)", 0).C(";\n");
 				break;

diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
@@ -475,9 +475,9 @@ VirtualFramebuffer *FramebufferManagerCommon::DoSetRenderFrameBuffer(Framebuffer
 		vfb->fb_format = params.fb_format;
 		vfb->usageFlags = FB_USAGE_RENDER_COLOR;
 
-		u32 byteSize = ColorBufferByteSize(vfb);
-		if (Memory::IsVRAMAddress(params.fb_address) && params.fb_address + byteSize > framebufRangeEnd_) {
-			framebufRangeEnd_ = params.fb_address + byteSize;
+		u32 colorByteSize = ColorBufferByteSize(vfb);
+		if (Memory::IsVRAMAddress(params.fb_address) && params.fb_address + colorByteSize > framebufRangeEnd_) {
+			framebufRangeEnd_ = params.fb_address + colorByteSize;
 		}
 
 		// This is where we actually create the framebuffer. The true is "force".
@@ -499,9 +499,9 @@ VirtualFramebuffer *FramebufferManagerCommon::DoSetRenderFrameBuffer(Framebuffer
 
 		// Assume that if we're clearing right when switching to a new framebuffer, we don't need to upload.
 		if (useBufferedRendering_ && params.isDrawing) {
-			gpu->PerformMemoryUpload(params.fb_address, byteSize);
+			gpu->PerformMemoryUpload(params.fb_address, colorByteSize);
 			// Alpha was already done by PerformMemoryUpload.
-			PerformStencilUpload(params.fb_address, byteSize, StencilUpload::STENCIL_IS_ZERO | StencilUpload::IGNORE_ALPHA);
+			PerformStencilUpload(params.fb_address, colorByteSize, StencilUpload::STENCIL_IS_ZERO | StencilUpload::IGNORE_ALPHA);
 			// TODO: Is it worth trying to upload the depth buffer (only if it wasn't copied above..?)
 		}
 
@@ -551,9 +551,20 @@ void FramebufferManagerCommon::SetDepthFrameBuffer(bool isClearingDepth) {
 	// by copying from any overlapping buffers with fresher content.
 	if (!isClearingDepth) {
 		CopyToDepthFromOverlappingFramebuffers(currentRenderVfb_);
-	}
 
+		// Special compatibility trick for Burnout Dominator lens flares. Not sure how to best generalize this. See issue #11100
+		if (PSP_CoreParameter().compat.flags().UploadDepthForCLUTTextures && (currentRenderVfb_->usageFlags & FB_USAGE_CLUT) != 0) {
+			// Set the flag, then upload memory contents to depth channel.
+			// Sanity check the depth buffer pointer.
+			if (currentRenderVfb_->z_address != 0 && currentRenderVfb_->z_address != currentRenderVfb_->fb_address) {
+				const u16 *src = (const u16 *)Memory::GetPointerUnchecked(currentRenderVfb_->z_address);
+				DrawPixels(currentRenderVfb_, 0, 0, (const u8 *)src, GE_FORMAT_DEPTH16, currentRenderVfb_->z_stride, currentRenderVfb_->width, currentRenderVfb_->height, RASTER_DEPTH, "Depth Upload");
+			}
+		}
+	}
+	// First time use of this framebuffer's depth buffer.
 	currentRenderVfb_->usageFlags |= FB_USAGE_RENDER_DEPTH;
+
 	currentRenderVfb_->depthBindSeq = GetBindSeqCount();
 }
 
@@ -1022,22 +1033,19 @@ void FramebufferManagerCommon::UpdateFromMemory(u32 addr, int size) {
 }
 
 void FramebufferManagerCommon::DrawPixels(VirtualFramebuffer *vfb, int dstX, int dstY, const u8 *srcPixels, GEBufferFormat srcPixelFormat, int srcStride, int width, int height, RasterChannel channel, const char *tag) {
-	// Add depth support later for depth uploads.
-	_dbg_assert_(channel == RASTER_COLOR);
-
 	textureCache_->ForgetLastTexture();
-	shaderManager_->DirtyLastShader();  // On GL, important that this is BEFORE drawing
+	shaderManager_->DirtyLastShader();
 	float u0 = 0.0f, u1 = 1.0f;
 	float v0 = 0.0f, v1 = 1.0f;
 
 	DrawTextureFlags flags;
 	if (useBufferedRendering_ && vfb && vfb->fbo) {
-		flags = DRAWTEX_LINEAR;
+		flags = channel == RASTER_COLOR ? DRAWTEX_LINEAR : DRAWTEX_NEAREST;
 		draw_->BindFramebufferAsRenderTarget(vfb->fbo, { Draw::RPAction::KEEP, Draw::RPAction::KEEP, Draw::RPAction::KEEP }, tag);
-		gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE);
 		SetViewport2D(0, 0, vfb->renderWidth, vfb->renderHeight);
 		draw_->SetScissorRect(0, 0, vfb->renderWidth, vfb->renderHeight);
 	} else {
+		_dbg_assert_(channel == RASTER_COLOR);
 		// We are drawing directly to the back buffer so need to flip.
 		// Should more of this be handled by the presentation engine?
 		if (needBackBufferYSwap_)
@@ -1051,11 +1059,18 @@ void FramebufferManagerCommon::DrawPixels(VirtualFramebuffer *vfb, int dstX, int
 		draw_->SetScissorRect(0, 0, pixelWidth_, pixelHeight_);
 	}
 
+	if (channel == RASTER_DEPTH) {
+		_dbg_assert_(srcPixelFormat == GE_FORMAT_DEPTH16);
+		flags = flags | DRAWTEX_DEPTH;
+	}
+
 	Draw::Texture *pixelsTex = MakePixelTexture(srcPixels, srcPixelFormat, srcStride, width, height);
 	if (pixelsTex) {
 		draw_->BindTextures(0, 1, &pixelsTex);
-		// TODO: Replace with BlitUsingRaster for simplicity.
+
+		// TODO: Replace with draw2D_.Blit() directly.
 		DrawActiveTexture(dstX, dstY, width, height, vfb->bufferWidth, vfb->bufferHeight, u0, v0, u1, v1, ROTATION_LOCKED_HORIZONTAL, flags);
+
 		gpuStats.numUploads++;
 		pixelsTex->Release();
 		draw_->InvalidateCachedState();
@@ -1145,6 +1160,7 @@ Draw::Texture *FramebufferManagerCommon::MakePixelTexture(const u8 *srcPixels, G
 			const u16_le *src16 = (const u16_le *)srcPixels + srcStride * y;
 			const u32_le *src32 = (const u32_le *)srcPixels + srcStride * y;
 			u32 *dst = (u32 *)(data + byteStride * y);
+			u16 *dst16 = (u16 *)(data + byteStride * y);
 			switch (srcPixelFormat) {
 			case GE_FORMAT_565:
 				if (preferredPixelsFormat_ == Draw::DataFormat::B8G8R8A8_UNORM)
@@ -1177,18 +1193,28 @@ Draw::Texture *FramebufferManagerCommon::MakePixelTexture(const u8 *srcPixels, G
 					memcpy(dst, src32, width * 4);
 				break;
 
-			case GE_FORMAT_INVALID:
 			case GE_FORMAT_DEPTH16:
-				_dbg_assert_msg_(false, "Invalid pixelFormat passed to DrawPixels().");
+				// TODO: Must take the depth range into account, unless it's already 0-1.
+				// TODO: Depending on the color buffer format used with this depth buffer, we need
+				// to do one of two different swizzle operations. However, for the only use of this so far,
+				// the Burnout lens flare trickery, swizzle doesn't matter since it's just a 0, 7fff, 0, 7fff pattern
+				// which comes out the same.
+				memcpy(dst16, src16, w * 2);
+				break;
+
+			case GE_FORMAT_INVALID:
+				// Bad
 				break;
 			}
 		}
 		return true;
 	};
 
+	// Note: For depth, we create an R16_UNORM texture, that'll be just fine for uploading depth through a shader,
+	// and likely more efficient.
 	Draw::TextureDesc desc{
 		Draw::TextureType::LINEAR2D,
-		preferredPixelsFormat_,
+		srcPixelFormat == GE_FORMAT_DEPTH16 ? Draw::DataFormat::R16_UNORM : preferredPixelsFormat_,
 		width,
 		height,
 		1,
@@ -1198,6 +1224,7 @@ Draw::Texture *FramebufferManagerCommon::MakePixelTexture(const u8 *srcPixels, G
 		{ (uint8_t *)srcPixels },
 		generateTexture,
 	};
+
 	// Hot Shots Golf (#12355) does tons of these in a frame in some situations! So creating textures
 	// better be fast.
 	Draw::Texture *tex = draw_->CreateTexture(desc);
@@ -1221,7 +1248,7 @@ void FramebufferManagerCommon::DrawFramebufferToOutput(const u8 *srcPixels, int
 	if (needBackBufferYSwap_) {
 		flags |= OutputFlags::BACKBUFFER_FLIPPED;
 	}
-	// DrawActiveTexture reverses these, probably to match "up".
+	// CopyToOutput reverses these, probably to match "up".
 	if (GetGPUBackend() == GPUBackend::DIRECT3D9 || GetGPUBackend() == GPUBackend::DIRECT3D11) {
 		flags |= OutputFlags::POSITION_FLIPPED;
 	}
@@ -2715,7 +2742,7 @@ void FramebufferManagerCommon::DrawActiveTexture(float x, float y, float w, floa
 	// Rearrange to strip form.
 	std::swap(coord[2], coord[3]);
 
-	draw2D_.DrawStrip2D(nullptr, coord, 4, (flags & DRAWTEX_LINEAR) != 0, Get2DPipeline(DRAW2D_COPY_COLOR));
+	draw2D_.DrawStrip2D(nullptr, coord, 4, (flags & DRAWTEX_LINEAR) != 0, Get2DPipeline((flags & DRAWTEX_DEPTH) ? DRAW2D_COPY_DEPTH : DRAW2D_COPY_COLOR));
 
 	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_RASTER_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
 }

diff --git a/GPU/Common/FramebufferManagerCommon.h b/GPU/Common/FramebufferManagerCommon.h
@@ -192,6 +192,7 @@ enum DrawTextureFlags {
 	DRAWTEX_NEAREST = 0,
 	DRAWTEX_LINEAR = 1,
 	DRAWTEX_TO_BACKBUFFER = 8,
+	DRAWTEX_DEPTH = 16,
 };
 
 inline DrawTextureFlags operator | (const DrawTextureFlags &lhs, const DrawTextureFlags &rhs) {

diff --git a/assets/compat.ini b/assets/compat.ini
@@ -1307,6 +1307,12 @@ ULKS46087 = true
 [AllowDownloadCLUT]
 # Temporary compatibility option, while developing a GPU CLUT-from-framebuffer path.
 
-# Burnout Dominator - lens flare effect (issue )#11100)
+# Burnout Dominator - lens flare effect (issue #11100)
+ULUS10236 = true
+ULES00703 = true
+
+[UploadDepthForCLUTTextures]
+# Burnout Dominator - lens flare effect (issue #11100)
+# We need a preinitialized depth buffer
 ULUS10236 = true
 ULES00703 = true