Skip to content

Commit

Permalink
Merge pull request #16910 from hrydgard/async-readback-prep
Browse files Browse the repository at this point in the history
Prepare for adding async readback (use VMA for readback allocs, add a param)
  • Loading branch information
hrydgard authored Feb 5, 2023
2 parents 910e66a + 31df6bf commit 7ae9d68
Show file tree
Hide file tree
Showing 14 changed files with 86 additions and 106 deletions.
4 changes: 2 additions & 2 deletions Common/GPU/D3D11/thin3d_d3d11.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class D3D11DrawContext : public DrawContext {

void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override;
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) override;
bool CopyFramebufferToMemory(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) override;

// These functions should be self explanatory.
void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override;
Expand Down Expand Up @@ -1525,7 +1525,7 @@ bool D3D11DrawContext::BlitFramebuffer(Framebuffer *srcfb, int srcX1, int srcY1,
return false;
}

bool D3D11DrawContext::CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int bx, int by, int bw, int bh, Draw::DataFormat destFormat, void *pixels, int pixelStride, const char *tag) {
bool D3D11DrawContext::CopyFramebufferToMemory(Framebuffer *src, int channelBits, int bx, int by, int bw, int bh, Draw::DataFormat destFormat, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) {
D3D11Framebuffer *fb = (D3D11Framebuffer *)src;

if (fb) {
Expand Down
4 changes: 2 additions & 2 deletions Common/GPU/D3D9/thin3d_d3d9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ class D3D9Context : public DrawContext {
// Not implemented
}
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) override;
bool CopyFramebufferToMemory(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) override;

// These functions should be self explanatory.
void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override;
Expand Down Expand Up @@ -1426,7 +1426,7 @@ bool D3D9Context::BlitFramebuffer(Framebuffer *srcfb, int srcX1, int srcY1, int
return SUCCEEDED(device_->StretchRect(srcSurf, &srcRect, dstSurf, &dstRect, (filter == FB_BLIT_LINEAR && channelBits == FB_COLOR_BIT) ? D3DTEXF_LINEAR : D3DTEXF_POINT));
}

bool D3D9Context::CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int bx, int by, int bw, int bh, Draw::DataFormat destFormat, void *pixels, int pixelStride, const char *tag) {
bool D3D9Context::CopyFramebufferToMemory(Framebuffer *src, int channelBits, int bx, int by, int bw, int bh, Draw::DataFormat destFormat, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) {
D3D9Framebuffer *fb = (D3D9Framebuffer *)src;

if (fb) {
Expand Down
4 changes: 2 additions & 2 deletions Common/GPU/OpenGL/thin3d_gl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@ class OpenGLContext : public DrawContext {

void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override;
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) override;
bool CopyFramebufferToMemory(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) override;

// These functions should be self explanatory.
void BindFramebufferAsRenderTarget(Framebuffer *fbo, const RenderPassInfo &rp, const char *tag) override;
Expand Down Expand Up @@ -988,7 +988,7 @@ static void LogReadPixelsError(GLenum error) {
}
#endif

bool OpenGLContext::CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat dataFormat, void *pixels, int pixelStride, const char *tag) {
bool OpenGLContext::CopyFramebufferToMemory(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat dataFormat, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) {
if (gl_extensions.IsGLES && (channelBits & FB_COLOR_BIT) == 0) {
// Can't readback depth or stencil on GLES.
return false;
Expand Down
17 changes: 8 additions & 9 deletions Common/GPU/Vulkan/VulkanFrameData.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,6 @@ void FrameData::Init(VulkanContext *vulkan, int index) {
vulkan->SetDebugName(fence, VK_OBJECT_TYPE_FENCE, StringFromFormat("fence%d", index).c_str());
readyForFence = true;

// This fence is used for synchronizing readbacks. Does not need preinitialization.
// TODO: Put this in frameDataShared, only one is needed.
readbackFence = vulkan->CreateFence(false);
vulkan->SetDebugName(fence, VK_OBJECT_TYPE_FENCE, "readbackFence");

VkQueryPoolCreateInfo query_ci{ VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
query_ci.queryCount = MAX_TIMESTAMP_QUERIES;
query_ci.queryType = VK_QUERY_TYPE_TIMESTAMP;
Expand All @@ -52,7 +47,6 @@ void FrameData::Destroy(VulkanContext *vulkan) {
vkDestroyCommandPool(device, cmdPoolInit, nullptr);
vkDestroyCommandPool(device, cmdPoolMain, nullptr);
vkDestroyFence(device, fence, nullptr);
vkDestroyFence(device, readbackFence, nullptr);
vkDestroyQueryPool(device, profile.queryPool, nullptr);
}

Expand Down Expand Up @@ -144,7 +138,7 @@ void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, Frame
}

if ((hasMainCommands || hasPresentCommands) && type == FrameSubmitType::Sync) {
fenceToTrigger = readbackFence;
fenceToTrigger = sharedData.readbackFence;
}

if (hasMainCommands) {
Expand Down Expand Up @@ -206,8 +200,8 @@ void FrameData::SubmitPending(VulkanContext *vulkan, FrameSubmitType type, Frame

if (type == FrameSubmitType::Sync) {
// Hard stall of the GPU, not ideal, but necessary so the CPU has the contents of the readback.
vkWaitForFences(vulkan->GetDevice(), 1, &readbackFence, true, UINT64_MAX);
vkResetFences(vulkan->GetDevice(), 1, &readbackFence);
vkWaitForFences(vulkan->GetDevice(), 1, &sharedData.readbackFence, true, UINT64_MAX);
vkResetFences(vulkan->GetDevice(), 1, &sharedData.readbackFence);
syncDone = true;
}
}
Expand All @@ -219,10 +213,15 @@ void FrameDataShared::Init(VulkanContext *vulkan) {
_dbg_assert_(res == VK_SUCCESS);
res = vkCreateSemaphore(vulkan->GetDevice(), &semaphoreCreateInfo, nullptr, &renderingCompleteSemaphore);
_dbg_assert_(res == VK_SUCCESS);

// This fence is used for synchronizing readbacks. Does not need preinitialization.
readbackFence = vulkan->CreateFence(false);
vulkan->SetDebugName(readbackFence, VK_OBJECT_TYPE_FENCE, "readbackFence");
}

void FrameDataShared::Destroy(VulkanContext *vulkan) {
VkDevice device = vulkan->GetDevice();
vkDestroySemaphore(device, acquireSemaphore, nullptr);
vkDestroySemaphore(device, renderingCompleteSemaphore, nullptr);
vkDestroyFence(device, readbackFence, nullptr);
}
4 changes: 3 additions & 1 deletion Common/GPU/Vulkan/VulkanFrameData.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ struct FrameDataShared {
VkSemaphore acquireSemaphore = VK_NULL_HANDLE;
VkSemaphore renderingCompleteSemaphore = VK_NULL_HANDLE;

// For synchronous readbacks.
VkFence readbackFence = VK_NULL_HANDLE;

void Init(VulkanContext *vulkan);
void Destroy(VulkanContext *vulkan);
};
Expand All @@ -49,7 +52,6 @@ struct FrameData {
bool readyForFence = true;

VkFence fence = VK_NULL_HANDLE;
VkFence readbackFence = VK_NULL_HANDLE; // Strictly speaking we might only need one global of these.

// These are on different threads so need separate pools.
VkCommandPool cmdPoolInit = VK_NULL_HANDLE; // Written to from main thread
Expand Down
113 changes: 40 additions & 73 deletions Common/GPU/Vulkan/VulkanQueueRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,72 +67,9 @@ void VulkanQueueRunner::CreateDeviceObjects() {
#endif
}

void VulkanQueueRunner::ResizeReadbackBuffer(VkDeviceSize requiredSize) {
if (readbackBuffer_ && requiredSize <= readbackBufferSize_) {
return;
}
if (readbackMemory_) {
vulkan_->Delete().QueueDeleteDeviceMemory(readbackMemory_);
}
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBuffer(readbackBuffer_);
}

readbackBufferSize_ = requiredSize;

VkDevice device = vulkan_->GetDevice();

VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
buf.size = readbackBufferSize_;
buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;

VkResult res = vkCreateBuffer(device, &buf, nullptr, &readbackBuffer_);
_assert_(res == VK_SUCCESS);

VkMemoryRequirements reqs{};
vkGetBufferMemoryRequirements(device, readbackBuffer_, &reqs);

VkMemoryAllocateInfo allocInfo{ VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO };
allocInfo.allocationSize = reqs.size;

// For speedy readbacks, we want the CPU cache to be enabled. However on most hardware we then have to
// sacrifice coherency, which means manual flushing. But try to find such memory first! If no cached
// memory type is available we fall back to just coherent.
const VkFlags desiredTypes[] = {
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
};
VkFlags successTypeReqs = 0;
for (VkFlags typeReqs : desiredTypes) {
if (vulkan_->MemoryTypeFromProperties(reqs.memoryTypeBits, typeReqs, &allocInfo.memoryTypeIndex)) {
successTypeReqs = typeReqs;
break;
}
}
_assert_(successTypeReqs != 0);
readbackBufferIsCoherent_ = (successTypeReqs & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;

res = vkAllocateMemory(device, &allocInfo, nullptr, &readbackMemory_);
if (res != VK_SUCCESS) {
readbackMemory_ = VK_NULL_HANDLE;
vkDestroyBuffer(device, readbackBuffer_, nullptr);
readbackBuffer_ = VK_NULL_HANDLE;
return;
}
uint32_t offset = 0;
vkBindBufferMemory(device, readbackBuffer_, readbackMemory_, offset);
}

void VulkanQueueRunner::DestroyDeviceObjects() {
INFO_LOG(G3D, "VulkanQueueRunner::DestroyDeviceObjects");
if (readbackMemory_) {
vulkan_->Delete().QueueDeleteDeviceMemory(readbackMemory_);
}
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBuffer(readbackBuffer_);
}
readbackBufferSize_ = 0;
DestroyReadbackBuffer();

renderPasses_.IterateMut([&](const RPKey &rpkey, VKRRenderPass *rp) {
_assert_(rp);
Expand Down Expand Up @@ -2007,6 +1944,40 @@ void VulkanQueueRunner::SetupTransferDstWriteAfterWrite(VKRImage &img, VkImageAs
);
}

void VulkanQueueRunner::ResizeReadbackBuffer(VkDeviceSize requiredSize) {
if (readbackBuffer_ && requiredSize <= readbackBufferSize_) {
return;
}
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);
}

readbackBufferSize_ = requiredSize;

VkDevice device = vulkan_->GetDevice();

VkBufferCreateInfo buf{ VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
buf.size = readbackBufferSize_;
buf.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;

VmaAllocationCreateInfo allocCreateInfo{};
allocCreateInfo.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
VmaAllocationInfo allocInfo{};

VkResult res = vmaCreateBuffer(vulkan_->Allocator(), &buf, &allocCreateInfo, &readbackBuffer_, &readbackAllocation_, &allocInfo);
_assert_(res == VK_SUCCESS);

const VkMemoryType &memoryType = vulkan_->GetMemoryProperties().memoryTypes[allocInfo.memoryType];
readbackBufferIsCoherent_ = (memoryType.propertyFlags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT) != 0;
}

void VulkanQueueRunner::DestroyReadbackBuffer() {
if (readbackBuffer_) {
vulkan_->Delete().QueueDeleteBufferAllocation(readbackBuffer_, readbackAllocation_);
}
readbackBufferSize_ = 0;
}

void VulkanQueueRunner::PerformReadback(const VKRStep &step, VkCommandBuffer cmd) {
ResizeReadbackBuffer(sizeof(uint32_t) * step.readback.srcRect.extent.width * step.readback.srcRect.extent.height);

Expand Down Expand Up @@ -2104,20 +2075,15 @@ void VulkanQueueRunner::PerformReadbackImage(const VKRStep &step, VkCommandBuffe
}

void VulkanQueueRunner::CopyReadbackBuffer(int width, int height, Draw::DataFormat srcFormat, Draw::DataFormat destFormat, int pixelStride, uint8_t *pixels) {
if (!readbackMemory_)
if (!readbackBuffer_)
return; // Something has gone really wrong.

// Read back to the requested address in ram from buffer.
void *mappedData;
const size_t srcPixelSize = DataFormatSizeInBytes(srcFormat);

VkResult res = vkMapMemory(vulkan_->GetDevice(), readbackMemory_, 0, width * height * srcPixelSize, 0, &mappedData);
VkResult res = vmaMapMemory(vulkan_->Allocator(), readbackAllocation_, &mappedData);
if (!readbackBufferIsCoherent_) {
VkMappedMemoryRange range{};
range.memory = readbackMemory_;
range.offset = 0;
range.size = width * height * srcPixelSize;
vkInvalidateMappedMemoryRanges(vulkan_->GetDevice(), 1, &range);
vmaInvalidateAllocation(vulkan_->Allocator(), readbackAllocation_, 0, width * height * srcPixelSize);
}

if (res != VK_SUCCESS) {
Expand Down Expand Up @@ -2148,5 +2114,6 @@ void VulkanQueueRunner::CopyReadbackBuffer(int width, int height, Draw::DataForm
ERROR_LOG(G3D, "CopyReadbackBuffer: Unknown format");
_assert_msg_(false, "CopyReadbackBuffer: Unknown src format %d", (int)srcFormat);
}
vkUnmapMemory(vulkan_->GetDevice(), readbackMemory_);

vmaUnmapMemory(vulkan_->Allocator(), readbackAllocation_);
}
3 changes: 2 additions & 1 deletion Common/GPU/Vulkan/VulkanQueueRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,7 @@ class VulkanQueueRunner {
void LogReadbackImage(const VKRStep &pass);

void ResizeReadbackBuffer(VkDeviceSize requiredSize);
void DestroyReadbackBuffer();

void ApplyMGSHack(std::vector<VKRStep *> &steps);
void ApplySonicHack(std::vector<VKRStep *> &steps);
Expand All @@ -323,7 +324,7 @@ class VulkanQueueRunner {

// Readback buffer. Currently we only support synchronous readback, so we only really need one.
// We size it generously.
VkDeviceMemory readbackMemory_ = VK_NULL_HANDLE;
VmaAllocation readbackAllocation_ = VK_NULL_HANDLE;
VkBuffer readbackBuffer_ = VK_NULL_HANDLE;
VkDeviceSize readbackBufferSize_ = 0;
bool readbackBufferIsCoherent_ = false;
Expand Down
4 changes: 2 additions & 2 deletions Common/GPU/Vulkan/thin3d_vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ class VKContext : public DrawContext {

void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) override;
bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) override;
bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) override;
bool CopyFramebufferToMemory(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) override;
DataFormat PreferredFramebufferReadbackFormat(Framebuffer *src) override;

// These functions should be self explanatory.
Expand Down Expand Up @@ -1632,7 +1632,7 @@ bool VKContext::BlitFramebuffer(Framebuffer *srcfb, int srcX1, int srcY1, int sr
return true;
}

bool VKContext::CopyFramebufferToMemorySync(Framebuffer *srcfb, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) {
bool VKContext::CopyFramebufferToMemory(Framebuffer *srcfb, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) {
VKFramebuffer *src = (VKFramebuffer *)srcfb;

int aspectMask = 0;
Expand Down
9 changes: 8 additions & 1 deletion Common/GPU/thin3d.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,11 @@ enum class Event {
PRESENTED,
};

enum class ReadbackMode {
BLOCK,
OLD_DATA_OK, // Lets the backend return old results that won't need any waiting to get.
};

constexpr uint32_t MAX_TEXTURE_SLOTS = 3;

struct FramebufferDesc {
Expand Down Expand Up @@ -693,7 +698,9 @@ class DrawContext {

virtual void CopyFramebufferImage(Framebuffer *src, int level, int x, int y, int z, Framebuffer *dst, int dstLevel, int dstX, int dstY, int dstZ, int width, int height, int depth, int channelBits, const char *tag) = 0;
virtual bool BlitFramebuffer(Framebuffer *src, int srcX1, int srcY1, int srcX2, int srcY2, Framebuffer *dst, int dstX1, int dstY1, int dstX2, int dstY2, int channelBits, FBBlitFilter filter, const char *tag) = 0;
virtual bool CopyFramebufferToMemorySync(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, const char *tag) {

// If the backend doesn't support old data, it's "OK" to block.
virtual bool CopyFramebufferToMemory(Framebuffer *src, int channelBits, int x, int y, int w, int h, Draw::DataFormat format, void *pixels, int pixelStride, ReadbackMode mode, const char *tag) {
return false;
}
virtual DataFormat PreferredFramebufferReadbackFormat(Framebuffer *src) {
Expand Down
7 changes: 5 additions & 2 deletions GPU/Common/DepthBufferCommon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,16 +242,19 @@ bool FramebufferManagerCommon::ReadbackDepthbufferSync(Draw::Framebuffer *fbo, i
};
draw_->DrawUP(positions, 3);

draw_->CopyFramebufferToMemorySync(blitFBO, FB_COLOR_BIT, x * scaleX, y * scaleY, w * scaleX, h * scaleY, DataFormat::R8G8B8A8_UNORM, convBuf_, destW, "ReadbackDepthbufferSync");
draw_->CopyFramebufferToMemory(blitFBO, FB_COLOR_BIT,
x * scaleX, y * scaleY, w * scaleX, h * scaleY,
DataFormat::R8G8B8A8_UNORM, convBuf_, destW, ReadbackMode::BLOCK, "ReadbackDepthbufferSync");

textureCache_->ForgetLastTexture();
// TODO: Use 4444 (or better, R16_UNORM) so we can copy lines directly (instead of 32 -> 16 on CPU)?
format16Bit = true;
} else {
draw_->CopyFramebufferToMemorySync(fbo, FB_DEPTH_BIT, x, y, w, h, DataFormat::D32F, convBuf_, w, "ReadbackDepthbufferSync");
draw_->CopyFramebufferToMemory(fbo, FB_DEPTH_BIT, x, y, w, h, DataFormat::D32F, convBuf_, w, ReadbackMode::BLOCK, "ReadbackDepthbufferSync");
format16Bit = false;
}

// TODO: Move this conversion into the backends.
if (format16Bit) {
// In this case, we used the shader to apply depth scale factors.
// This can be SSE'd or NEON'd very efficiently, though ideally we would avoid this conversion by using R16_UNORM for readback.
Expand Down
Loading

0 comments on commit 7ae9d68

Please sign in to comment.