Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement delayed depth readbacks, Vulkan only #16916

Merged
merged 6 commits into from
Feb 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions Common/GPU/OpenGL/thin3d_gl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,8 +1001,7 @@ bool OpenGLContext::CopyFramebufferToMemory(Framebuffer *src, int channelBits, i
aspect |= GL_DEPTH_BUFFER_BIT;
if (channelBits & FB_STENCIL_BIT)
aspect |= GL_STENCIL_BUFFER_BIT;
renderManager_.CopyFramebufferToMemory(fb ? fb->framebuffer_ : nullptr, aspect, x, y, w, h, dataFormat, (uint8_t *)pixels, pixelStride, mode, tag);
return true;
return renderManager_.CopyFramebufferToMemory(fb ? fb->framebuffer_ : nullptr, aspect, x, y, w, h, dataFormat, (uint8_t *)pixels, pixelStride, mode, tag);
}


Expand Down
13 changes: 13 additions & 0 deletions Common/GPU/Vulkan/VulkanFrameData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
#include "Common/Log.h"
#include "Common/StringUtils.h"

void CachedReadback::Destroy(VulkanContext *vulkan) {
if (buffer) {
vulkan->Delete().QueueDeleteBufferAllocation(buffer, allocation);
}
bufferSize = 0;
}

void FrameData::Init(VulkanContext *vulkan, int index) {
this->index = index;
VkDevice device = vulkan->GetDevice();
Expand Down Expand Up @@ -48,6 +55,12 @@ void FrameData::Destroy(VulkanContext *vulkan) {
vkDestroyCommandPool(device, cmdPoolMain, nullptr);
vkDestroyFence(device, fence, nullptr);
vkDestroyQueryPool(device, profile.queryPool, nullptr);

readbacks_.IterateMut([=](const ReadbackKey &key, CachedReadback *value) {
value->Destroy(vulkan);
delete value;
});
readbacks_.Clear();
}

void FrameData::AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared) {
Expand Down
25 changes: 24 additions & 1 deletion Common/GPU/Vulkan/VulkanFrameData.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <condition_variable>

#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/Data/Collections/Hashmaps.h"

enum {
MAX_TIMESTAMP_QUERIES = 128,
Expand All @@ -25,6 +26,23 @@ struct QueueProfileContext {
double cpuEndTime;
};

class VKRFramebuffer;

struct ReadbackKey {
const VKRFramebuffer *framebuf;
int width;
int height;
};

struct CachedReadback {
VkBuffer buffer;
VmaAllocation allocation;
VkDeviceSize bufferSize;
bool isCoherent;

void Destroy(VulkanContext *vulkan);
};

struct FrameDataShared {
// Permanent objects
VkSemaphore acquireSemaphore = VK_NULL_HANDLE;
Expand Down Expand Up @@ -77,6 +95,11 @@ struct FrameData {
QueueProfileContext profile;
bool profilingEnabled_ = false;

// Async readback cache.
DenseHashMap<ReadbackKey, CachedReadback*, nullptr> readbacks_;

FrameData() : readbacks_(8) {}

void Init(VulkanContext *vulkan, int index);
void Destroy(VulkanContext *vulkan);

Expand All @@ -91,5 +114,5 @@ struct FrameData {

private:
// Metadata for logging etc
int index;
int index = -1;
};
115 changes: 75 additions & 40 deletions Common/GPU/Vulkan/VulkanQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ void VulkanQueueRunner::CreateDeviceObjects() {

void VulkanQueueRunner::DestroyDeviceObjects() {
INFO_LOG(G3D, "VulkanQueueRunner::DestroyDeviceObjects");
DestroyReadbackBuffer();

syncReadback_.Destroy(vulkan_);

renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {
_assert_(rp);
Expand Down Expand Up @@ -419,7 +420,7 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, FrameData &frame
PerformBlit(step, cmd);
break;
case VKRStepType::READBACK:
PerformReadback(step, cmd);
PerformReadback(step, cmd, frameData);
break;
case VKRStepType::READBACK_IMAGE:
PerformReadbackImage(step, cmd);
Expand Down Expand Up @@ -1944,52 +1945,35 @@ void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAs
);
}

void VulkanQueueRunner::ResizeReadbackBuffer(VkDeviceSize requiredSize) {
if (readbackBuffer_ && requiredSize <= readbackBufferSize_) {
void VulkanQueueRunner::ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize) {
if (readback->buffer && requiredSize <= readback->bufferSize) {
return;
}
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);

if (readback->buffer) {
vulkan_->Delete().QueueDeleteBufferAllocation(readback->buffer, readback->allocation);
}

readbackBufferSize_ = requiredSize;
readback->bufferSize = requiredSize;

VkDevice device = vulkan_->GetDevice();

VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
buf.size = readbackBufferSize_;
buf.size = readback->bufferSize;
buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;

VmaAllocationCreateInfo allocCreateInfo{};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
VmaAllocationInfo allocInfo{};

VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readbackBuffer_, &readbackAllocation_, &allocInfo);
VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readback->buffer, &readback->allocation, &allocInfo);
_assert_(res == VK_SUCCESS);

const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];
readbackBufferIsCoherent_ = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

void VulkanQueueRunner::DestroyReadbackBuffer() {
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);
}
readbackBufferSize_ = 0;
readback->isCoherent = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd) {
ResizeReadbackBuffer(sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height);

VkBufferImageCopy region{};
region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
region.imageSubresource.aspectMask = step.readback.aspectMask;
region.imageSubresource.layerCount = 1;
region.bufferOffset = 0;
region.bufferRowLength = step.readback.srcRect.extent.width;
region.bufferImageHeight = step.readback.srcRect.extent.height;

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd, FrameData &frameData) {
VkImage image;
VkImageLayout copyLayout;
// Special case for backbuffer readbacks.
Expand Down Expand Up @@ -2023,7 +2007,40 @@ void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd
copyLayout = srcImage->layout;
}

vkCmdCopyImageToBuffer(cmd, image, copyLayout, readbackBuffer_, 1, &region);
// TODO: Handle different readback formats!
u32 readbackSizeInBytes = sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height;

CachedReadback *cached = nullptr;

if (step.readback.delayed) {
ReadbackKey key;
key.framebuf = step.readback.src;
key.width = step.readback.srcRect.extent.width;
key.height = step.readback.srcRect.extent.height;

// See if there's already a buffer we can reuse
cached = frameData.readbacks_.Get(key);
if (!cached) {
cached = new CachedReadback();
cached->bufferSize = 0;
frameData.readbacks_.Insert(key, cached);
}
} else {
cached = &syncReadback_;
}

ResizeReadbackBuffer(cached, readbackSizeInBytes);

VkBufferImageCopy region{};
region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
region.imageSubresource.aspectMask = step.readback.aspectMask;
region.imageSubresource.layerCount = 1;
region.bufferOffset = 0;
region.bufferRowLength = step.readback.srcRect.extent.width;
region.bufferImageHeight = step.readback.srcRect.extent.height;

vkCmdCopyImageToBuffer(cmd, image, copyLayout, cached->buffer, 1, &region);

// NOTE: Can't read the buffer using the CPU here - need to sync first.

Expand All @@ -2050,7 +2067,7 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
SetupTransitionToTransferSrc(srcImage, VK_IMAGE_ASPECT_COLOR_BIT, &recordBarrier_);
recordBarrier_.Flush(cmd);

ResizeReadbackBuffer(sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);
ResizeReadbackBuffer(&syncReadback_, sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);

VkBufferImageCopy region{};
region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };
Expand All @@ -2061,7 +2078,7 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
region.bufferOffset = 0;
region.bufferRowLength = step.readback_image.srcRect.extent.width;
region.bufferImageHeight = step.readback_image.srcRect.extent.height;
vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, readbackBuffer_, 1, &region);
vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, syncReadback_.buffer, 1, &region);

// Now transfer it back to a texture.
TransitionImageLayout2(cmd, step.readback_image.image, 0, 1, 1, // I don't think we have any multilayer cases for regular textures. Above in PerformReadback, though..
Expand All @@ -2074,22 +2091,39 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.
}

void VulkanQueueRunner::CopyReadbackBuffer(VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
if (!readbackBuffer_)
return; // Something has gone really wrong.
bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
CachedReadback *readback = &syncReadback_;

// Look up in readback cache.
if (src) {
ReadbackKey key;
key.framebuf = src;
key.width = width;
key.height = height;
CachedReadback *cached = frameData.readbacks_.Get(key);
if (cached) {
readback = cached;
} else {
// Didn't have a cached image ready yet
return false;
}
}

if (!readback->buffer)
return false; // Didn't find anything in cache, or something has gone really wrong.

// Read back to the requested address in ram from buffer.
void *mappedData;
const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readbackAllocation_, &mappedData);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readback->allocation, &mappedData);

if (res != VK_SUCCESS) {
ERROR_LOG(G3D, "CopyReadbackBuffer: vkMapMemory failed! result=%d", (int)res);
return;
return false;
}

if (!readbackBufferIsCoherent_) {
vmaInvalidateAllocation(vulkan_->Allocator(), readbackAllocation_, 0, width * height * srcPixelSize);
if (!readback->isCoherent) {
vmaInvalidateAllocation(vulkan_->Allocator(), readback->allocation, 0, width * height * srcPixelSize);
}

// TODO: Perform these conversions in a compute shader on the GPU.
Expand All @@ -2116,5 +2150,6 @@ void VulkanQueueRunner::CopyReadbackBuffer(VKRFramebuffer *src, int width, int h
_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);
}

vmaUnmapMemory(vulkan_->Allocator(), readbackAllocation_);
vmaUnmapMemory(vulkan_->Allocator(), readback->allocation);
return true;
}
13 changes: 5 additions & 8 deletions Common/GPU/Vulkan/VulkanQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ struct VKRStep {
int aspectMask;
VKRFramebuffer *src;
VkRect2D srcRect;
bool delayed;
} readback;
struct {
VkImage image;
Expand Down Expand Up @@ -253,7 +254,7 @@ class VulkanQueueRunner {
}

// src == 0 means to copy from the sync readback buffer.
void CopyReadbackBuffer(VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);
bool CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);

VKRRenderPass *GetRenderPass(const RPKey &key);

Expand Down Expand Up @@ -289,7 +290,7 @@ class VulkanQueueRunner {
void PerformRenderPass(const VKRStep &pass, VkCommandBuffer cmd);
void PerformCopy(const VKRStep &pass, VkCommandBuffer cmd);
void PerformBlit(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd, FrameData &frameData);
void PerformReadbackImage(const VKRStep &pass, VkCommandBuffer cmd);

void LogRenderPass(const VKRStep &pass, bool verbose);
Expand All @@ -298,8 +299,7 @@ class VulkanQueueRunner {
void LogReadback(const VKRStep &pass);
void LogReadbackImage(const VKRStep &pass);

void ResizeReadbackBuffer(VkDeviceSize requiredSize);
void DestroyReadbackBuffer();
void ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize);

void ApplyMGSHack(std::vector<VKRStep *> &steps);
void ApplySonicHack(std::vector<VKRStep *> &steps);
Expand All @@ -325,10 +325,7 @@ class VulkanQueueRunner {

// Readback buffer. Currently we only support synchronous readback, so we only really need one.
// We size it generously.
VmaAllocation readbackAllocation_ = VK_NULL_HANDLE;
VkBuffer readbackBuffer_ = VK_NULL_HANDLE;
VkDeviceSize readbackBufferSize_ = 0;
bool readbackBufferIsCoherent_ = false;
CachedReadback syncReadback_{};

// TODO: Enable based on compat.ini.
uint32_t hacksEnabled_ = 0;
Expand Down
11 changes: 7 additions & 4 deletions Common/GPU/Vulkan/VulkanRenderManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -929,11 +929,14 @@ bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAs
step->readback.src = src;
step->readback.srcRect.offset = { x, y };
step->readback.srcRect.extent = { (uint32_t)w, (uint32_t)h };
step->readback.delayed = mode == Draw::ReadbackMode::OLD_DATA_OK;
step->dependencies.insert(src);
step->tag = tag;
steps_.push_back(step);

FlushSync();
if (mode == Draw::ReadbackMode::BLOCK) {
FlushSync();
}

Draw::DataFormat srcFormat = Draw::DataFormat::UNDEFINED;
if (aspectBits & VK_IMAGE_ASPECT_COLOR_BIT) {
Expand Down Expand Up @@ -972,8 +975,8 @@ bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAs
}

// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
queueRunner_.CopyReadbackBuffer(src, w, h, srcFormat, destFormat, pixelStride, pixels);
return true;
return queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()],
mode == Draw::ReadbackMode::OLD_DATA_OK ? src : nullptr, w, h, srcFormat, destFormat, pixelStride, pixels);
}

void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag) {
Expand All @@ -992,7 +995,7 @@ void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int
FlushSync();

// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
queueRunner_.CopyReadbackBuffer(nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()], nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
}

static void RemoveDrawCommands(std::vector<VkRenderData> *cmds) {
Expand Down
8 changes: 6 additions & 2 deletions GPU/Common/DepthBufferCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,14 +218,18 @@ bool FramebufferManagerCommon::ReadbackDepthbuffer(Draw::Framebuffer *fbo, int x

DepthUB ub{};

// Setting this to 0.95f eliminates flickering lights with delayed readback in Syphon Filter.
// That's pretty ugly though! But we'll need to do that if we're gonna enable delayed readback in those games.
Copy link
Collaborator

@unknownbrackets unknownbrackets Feb 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While it's ugly, tbh if we're sure it's fine and looks about right across the game this is actually a type of game hack / compatibility flag I'm not so against. Since this is conceptually taking something we know how to do right (blocking) and applying an engine or series specific hack to make it still look the same but run faster.

That doesn't betray the goals of making the emulator capable of running all games well and solving underlying problems. It is ugly, though. Not saying it's not ugly.

That said, I feel like it must make some lights occlude early/late or something which I might be a bit less a fan of.

-[Unknown]

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does, but it's only really noticeable if you really look for it. However it's also possible there are some situations with thin walls where lights will shine through now when they didn't before - I didn't find any playing for a while, but it would not surprise me.

So before I do a compat hack for this, I want to see how good we can get it with fully async readbacks (that happen as soon as the GPU performs it, and doesn't wait until a guaranteed safe time the next time around the frame contexts).

const float fudgeFactor = 1.0f;

if (!gstate_c.Use(GPU_USE_ACCURATE_DEPTH)) {
// Don't scale anything, since we're not using factors outside accurate mode.
ub.u_depthFactor[0] = 0.0f;
ub.u_depthFactor[1] = 1.0f;
ub.u_depthFactor[1] = fudgeFactor;
} else {
const float factor = DepthSliceFactor();
ub.u_depthFactor[0] = -0.5f * (factor - 1.0f) * (1.0f / factor);
ub.u_depthFactor[1] = factor;
ub.u_depthFactor[1] = factor * fudgeFactor;
}
static constexpr float shifts[] = { 16777215.0f, 16777215.0f / 256.0f, 16777215.0f / 65536.0f, 16777215.0f / 16777216.0f };
memcpy(ub.u_depthShift, shifts, sizeof(shifts));
Expand Down
Loading