Skip to content

Commit

Permalink
Merge pull request #16916 from hrydgard/implement-delayed-readback
Browse files Browse the repository at this point in the history
Implement delayed depth readbacks, Vulkan only
  • Loading branch information
hrydgard authored Feb 8, 2023
2 parents 88130bb + ab63689 commit 2d0252f
Show file tree
Hide file tree
Showing 10 changed files with 153 additions and 70 deletions.
3 changes: 1 addition & 2 deletions Common/GPU/OpenGL/thin3d_gl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,8 +1001,7 @@ bool OpenGLContext::CopyFramebufferToMemory(Framebuffer *src, int channelBits, i
aspect |= GL_DEPTH_BUFFER_BIT;
if (channelBits & FB_STENCIL_BIT)
aspect |= GL_STENCIL_BUFFER_BIT;
renderManager_.CopyFramebufferToMemory(fb ? fb->framebuffer_ : nullptr, aspect, x, y, w, h, dataFormat, (uint8_t *)pixels, pixelStride, mode, tag);
return true;
return renderManager_.CopyFramebufferToMemory(fb ? fb->framebuffer_ : nullptr, aspect, x, y, w, h, dataFormat, (uint8_t *)pixels, pixelStride, mode, tag);
}


Expand Down
13 changes: 13 additions & 0 deletions Common/GPU/Vulkan/VulkanFrameData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
#include "Common/Log.h"
#include "Common/StringUtils.h"

void CachedReadback::Destroy(VulkanContext *vulkan) {
if (buffer) {
vulkan->Delete().QueueDeleteBufferAllocation(buffer, allocation);
}
bufferSize = 0;
}

void FrameData::Init(VulkanContext *vulkan, int index) {
this->index = index;
VkDevice device = vulkan->GetDevice();
Expand Down Expand Up @@ -48,6 +55,12 @@ void FrameData::Destroy(VulkanContext *vulkan) {
vkDestroyCommandPool(device, cmdPoolMain, nullptr);
vkDestroyFence(device, fence, nullptr);
vkDestroyQueryPool(device, profile.queryPool, nullptr);

readbacks_.IterateMut([=](const ReadbackKey &key, CachedReadback *value) {
value->Destroy(vulkan);
delete value;
});
readbacks_.Clear();
}

void FrameData::AcquireNextImage(VulkanContext *vulkan, FrameDataShared &shared) {
Expand Down
25 changes: 24 additions & 1 deletion Common/GPU/Vulkan/VulkanFrameData.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <condition_variable>

#include "Common/GPU/Vulkan/VulkanContext.h"
#include "Common/Data/Collections/Hashmaps.h"

enum {
MAX_TIMESTAMP_QUERIES = 128,
Expand All @@ -25,6 +26,23 @@ struct QueueProfileContext {
double cpuEndTime;
};

class VKRFramebuffer;

struct ReadbackKey {
const VKRFramebuffer *framebuf;
int width;
int height;
};

struct CachedReadback {
VkBuffer buffer;
VmaAllocation allocation;
VkDeviceSize bufferSize;
bool isCoherent;

void Destroy(VulkanContext *vulkan);
};

struct FrameDataShared {
// Permanent objects
VkSemaphore acquireSemaphore = VK_NULL_HANDLE;
Expand Down Expand Up @@ -77,6 +95,11 @@ struct FrameData {
QueueProfileContext profile;
bool profilingEnabled_ = false;

// Async readback cache.
DenseHashMap<ReadbackKey, CachedReadback*, nullptr> readbacks_;

FrameData() : readbacks_(8) {}

void Init(VulkanContext *vulkan, int index);
void Destroy(VulkanContext *vulkan);

Expand All @@ -91,5 +114,5 @@ struct FrameData {

private:
// Metadata for logging etc
int index;
int index = -1;
};
115 changes: 75 additions & 40 deletions Common/GPU/Vulkan/VulkanQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ void VulkanQueueRunner::CreateDeviceObjects() {

void VulkanQueueRunner::DestroyDeviceObjects() {
INFO_LOG(G3D, "VulkanQueueRunner::DestroyDeviceObjects");
DestroyReadbackBuffer();

syncReadback_.Destroy(vulkan_);

renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {
_assert_(rp);
Expand Down Expand Up @@ -419,7 +420,7 @@ void VulkanQueueRunner::RunSteps(std::vector<VKRStep *> &steps, FrameData &frame
PerformBlit(step, cmd);
break;
case VKRStepType::READBACK:
PerformReadback(step, cmd);
PerformReadback(step, cmd, frameData);
break;
case VKRStepType::READBACK_IMAGE:
PerformReadbackImage(step, cmd);
Expand Down Expand Up @@ -1944,52 +1945,35 @@ void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAs
);
}

void VulkanQueueRunner::ResizeReadbackBuffer(VkDeviceSize requiredSize) {
if (readbackBuffer_ && requiredSize <= readbackBufferSize_) {
void VulkanQueueRunner::ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize) {
if (readback->buffer && requiredSize <= readback->bufferSize) {
return;
}
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);

if (readback->buffer) {
vulkan_->Delete().QueueDeleteBufferAllocation(readback->buffer, readback->allocation);
}

readbackBufferSize_ = requiredSize;
readback->bufferSize = requiredSize;

VkDevice device = vulkan_->GetDevice();

VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
buf.size = readbackBufferSize_;
buf.size = readback->bufferSize;
buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;

VmaAllocationCreateInfo allocCreateInfo{};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
VmaAllocationInfo allocInfo{};

VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readbackBuffer_, &readbackAllocation_, &allocInfo);
VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readback->buffer, &readback->allocation, &allocInfo);
_assert_(res == VK_SUCCESS);

const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];
readbackBufferIsCoherent_ = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

void VulkanQueueRunner::DestroyReadbackBuffer() {
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);
}
readbackBufferSize_ = 0;
readback->isCoherent = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd) {
ResizeReadbackBuffer(sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height);

VkBufferImageCopy region{};
region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
region.imageSubresource.aspectMask = step.readback.aspectMask;
region.imageSubresource.layerCount = 1;
region.bufferOffset = 0;
region.bufferRowLength = step.readback.srcRect.extent.width;
region.bufferImageHeight = step.readback.srcRect.extent.height;

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd, FrameData &frameData) {
VkImage image;
VkImageLayout copyLayout;
// Special case for backbuffer readbacks.
Expand Down Expand Up @@ -2023,7 +2007,40 @@ void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd
copyLayout = srcImage->layout;
}

vkCmdCopyImageToBuffer(cmd, image, copyLayout, readbackBuffer_, 1, &region);
// TODO: Handle different readback formats!
u32 readbackSizeInBytes = sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height;

CachedReadback *cached = nullptr;

if (step.readback.delayed) {
ReadbackKey key;
key.framebuf = step.readback.src;
key.width = step.readback.srcRect.extent.width;
key.height = step.readback.srcRect.extent.height;

// See if there's already a buffer we can reuse
cached = frameData.readbacks_.Get(key);
if (!cached) {
cached = new CachedReadback();
cached->bufferSize = 0;
frameData.readbacks_.Insert(key, cached);
}
} else {
cached = &syncReadback_;
}

ResizeReadbackBuffer(cached, readbackSizeInBytes);

VkBufferImageCopy region{};
region.imageOffset = { step.readback.srcRect.offset.x, step.readback.srcRect.offset.y, 0 };
region.imageExtent = { step.readback.srcRect.extent.width, step.readback.srcRect.extent.height, 1 };
region.imageSubresource.aspectMask = step.readback.aspectMask;
region.imageSubresource.layerCount = 1;
region.bufferOffset = 0;
region.bufferRowLength = step.readback.srcRect.extent.width;
region.bufferImageHeight = step.readback.srcRect.extent.height;

vkCmdCopyImageToBuffer(cmd, image, copyLayout, cached->buffer, 1, &region);

// NOTE: Can't read the buffer using the CPU here - need to sync first.

Expand All @@ -2050,7 +2067,7 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
SetupTransitionToTransferSrc(srcImage, VK_IMAGE_ASPECT_COLOR_BIT, &recordBarrier_);
recordBarrier_.Flush(cmd);

ResizeReadbackBuffer(sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);
ResizeReadbackBuffer(&syncReadback_, sizeof(uint32_t) * step.readback_image.srcRect.extent.width * step.readback_image.srcRect.extent.height);

VkBufferImageCopy region{};
region.imageOffset = { step.readback_image.srcRect.offset.x, step.readback_image.srcRect.offset.y, 0 };
Expand All @@ -2061,7 +2078,7 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
region.bufferOffset = 0;
region.bufferRowLength = step.readback_image.srcRect.extent.width;
region.bufferImageHeight = step.readback_image.srcRect.extent.height;
vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, readbackBuffer_, 1, &region);
vkCmdCopyImageToBuffer(cmd, step.readback_image.image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, syncReadback_.buffer, 1, &region);

// Now transfer it back to a texture.
TransitionImageLayout2(cmd, step.readback_image.image, 0, 1, 1, // I don't think we have any multilayer cases for regular textures. Above in PerformReadback, though..
Expand All @@ -2074,22 +2091,39 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
// Doing that will also act like a heavyweight barrier ensuring that device writes are visible on the host.
}

void VulkanQueueRunner::CopyReadbackBuffer(VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
if (!readbackBuffer_)
return; // Something has gone really wrong.
bool VulkanQueueRunner::CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
CachedReadback *readback = &syncReadback_;

// Look up in readback cache.
if (src) {
ReadbackKey key;
key.framebuf = src;
key.width = width;
key.height = height;
CachedReadback *cached = frameData.readbacks_.Get(key);
if (cached) {
readback = cached;
} else {
// Didn't have a cached image ready yet
return false;
}
}

if (!readback->buffer)
return false; // Didn't find anything in cache, or something has gone really wrong.

// Read back to the requested address in ram from buffer.
void *mappedData;
const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readbackAllocation_, &mappedData);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readback->allocation, &mappedData);

if (res != VK_SUCCESS) {
ERROR_LOG(G3D, "CopyReadbackBuffer: vkMapMemory failed! result=%d", (int)res);
return;
return false;
}

if (!readbackBufferIsCoherent_) {
vmaInvalidateAllocation(vulkan_->Allocator(), readbackAllocation_, 0, width * height * srcPixelSize);
if (!readback->isCoherent) {
vmaInvalidateAllocation(vulkan_->Allocator(), readback->allocation, 0, width * height * srcPixelSize);
}

// TODO: Perform these conversions in a compute shader on the GPU.
Expand All @@ -2116,5 +2150,6 @@ void VulkanQueueRunner::CopyReadbackBuffer(VKRFramebuffer *src, int width, int h
_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);
}

vmaUnmapMemory(vulkan_->Allocator(), readbackAllocation_);
vmaUnmapMemory(vulkan_->Allocator(), readback->allocation);
return true;
}
13 changes: 5 additions & 8 deletions Common/GPU/Vulkan/VulkanQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ struct VKRStep {
int aspectMask;
VKRFramebuffer *src;
VkRect2D srcRect;
bool delayed;
} readback;
struct {
VkImage image;
Expand Down Expand Up @@ -253,7 +254,7 @@ class VulkanQueueRunner {
}

// src == 0 means to copy from the sync readback buffer.
void CopyReadbackBuffer(VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);
bool CopyReadbackBuffer(FrameData &frameData, VKRFramebuffer *src, int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels);

VKRRenderPass *GetRenderPass(const RPKey &key);

Expand Down Expand Up @@ -289,7 +290,7 @@ class VulkanQueueRunner {
void PerformRenderPass(const VKRStep &pass, VkCommandBuffer cmd);
void PerformCopy(const VKRStep &pass, VkCommandBuffer cmd);
void PerformBlit(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd);
void PerformReadback(const VKRStep &pass, VkCommandBuffer cmd, FrameData &frameData);
void PerformReadbackImage(const VKRStep &pass, VkCommandBuffer cmd);

void LogRenderPass(const VKRStep &pass, bool verbose);
Expand All @@ -298,8 +299,7 @@ class VulkanQueueRunner {
void LogReadback(const VKRStep &pass);
void LogReadbackImage(const VKRStep &pass);

void ResizeReadbackBuffer(VkDeviceSize requiredSize);
void DestroyReadbackBuffer();
void ResizeReadbackBuffer(CachedReadback *readback, VkDeviceSize requiredSize);

void ApplyMGSHack(std::vector<VKRStep *> &steps);
void ApplySonicHack(std::vector<VKRStep *> &steps);
Expand All @@ -325,10 +325,7 @@ class VulkanQueueRunner {

// Readback buffer. Currently we only support synchronous readback, so we only really need one.
// We size it generously.
VmaAllocation readbackAllocation_ = VK_NULL_HANDLE;
VkBuffer readbackBuffer_ = VK_NULL_HANDLE;
VkDeviceSize readbackBufferSize_ = 0;
bool readbackBufferIsCoherent_ = false;
CachedReadback syncReadback_{};

// TODO: Enable based on compat.ini.
uint32_t hacksEnabled_ = 0;
Expand Down
11 changes: 7 additions & 4 deletions Common/GPU/Vulkan/VulkanRenderManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -929,11 +929,14 @@ bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAs
step->readback.src = src;
step->readback.srcRect.offset = { x, y };
step->readback.srcRect.extent = { (uint32_t)w, (uint32_t)h };
step->readback.delayed = mode == Draw::ReadbackMode::OLD_DATA_OK;
step->dependencies.insert(src);
step->tag = tag;
steps_.push_back(step);

FlushSync();
if (mode == Draw::ReadbackMode::BLOCK) {
FlushSync();
}

Draw::DataFormat srcFormat = Draw::DataFormat::UNDEFINED;
if (aspectBits & VK_IMAGE_ASPECT_COLOR_BIT) {
Expand Down Expand Up @@ -972,8 +975,8 @@ bool VulkanRenderManager::CopyFramebufferToMemory(VKRFramebuffer *src, VkImageAs
}

// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
queueRunner_.CopyReadbackBuffer(src, w, h, srcFormat, destFormat, pixelStride, pixels);
return true;
return queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()],
mode == Draw::ReadbackMode::OLD_DATA_OK ? src : nullptr, w, h, srcFormat, destFormat, pixelStride, pixels);
}

void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int x, int y, int w, int h, Draw::DataFormat destFormat, uint8_t *pixels, int pixelStride, const char *tag) {
Expand All @@ -992,7 +995,7 @@ void VulkanRenderManager::CopyImageToMemorySync(VkImage image, int mipLevel, int
FlushSync();

// Need to call this after FlushSync so the pixels are guaranteed to be ready in CPU-accessible VRAM.
queueRunner_.CopyReadbackBuffer(nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
queueRunner_.CopyReadbackBuffer(frameData_[vulkan_->GetCurFrame()], nullptr, w, h, destFormat, destFormat, pixelStride, pixels);
}

static void RemoveDrawCommands(std::vector<VkRenderData> *cmds) {
Expand Down
8 changes: 6 additions & 2 deletions GPU/Common/DepthBufferCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,14 +218,18 @@ bool FramebufferManagerCommon::ReadbackDepthbuffer(Draw::Framebuffer *fbo, int x

DepthUB ub{};

// Setting this to 0.95f eliminates flickering lights with delayed readback in Syphon Filter.
// That's pretty ugly though! But we'll need to do that if we're gonna enable delayed readback in those games.
const float fudgeFactor = 1.0f;

if (!gstate_c.Use(GPU_USE_ACCURATE_DEPTH)) {
// Don't scale anything, since we're not using factors outside accurate mode.
ub.u_depthFactor[0] = 0.0f;
ub.u_depthFactor[1] = 1.0f;
ub.u_depthFactor[1] = fudgeFactor;
} else {
const float factor = DepthSliceFactor();
ub.u_depthFactor[0] = -0.5f * (factor - 1.0f) * (1.0f / factor);
ub.u_depthFactor[1] = factor;
ub.u_depthFactor[1] = factor * fudgeFactor;
}
static constexpr float shifts[] = { 16777215.0f, 16777215.0f / 256.0f, 16777215.0f / 65536.0f, 16777215.0f / 16777216.0f };
memcpy(ub.u_depthShift, shifts, sizeof(shifts));
Expand Down
Loading

0 comments on commit 2d0252f

Please sign in to comment.