-
Notifications
You must be signed in to change notification settings - Fork 410
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
layers: Allow using uncached buffer for Debug Printf #6128
base: main
Are you sure you want to change the base?
Changes from all commits
090d262
c0c5d42
cb05cb6
4b956cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,87 @@ | |
#include <iostream> | ||
#include "generated/layer_chassis_dispatch.h" | ||
|
||
void DebugPrintf::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *pCreateInfo, | ||
const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_ci) { | ||
GpuAssistedBase::PreCallRecordCreateDevice(gpu, pCreateInfo, pAllocator, pDevice, modified_ci); | ||
|
||
std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer"); | ||
vvl::ToLower(use_uncached_buffer_string); | ||
use_uncached_buffer = !use_uncached_buffer_string.compare("true"); | ||
|
||
if (use_uncached_buffer) | ||
{ | ||
static const std::string dcm_ext{"VK_AMD_device_coherent_memory"}; | ||
{ | ||
bool dcm_supported = false; | ||
uint32_t property_count = 0; | ||
if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, nullptr) == VK_SUCCESS) { | ||
std::vector<VkExtensionProperties> property_list(property_count); | ||
if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, property_list.data()) == | ||
VK_SUCCESS) { | ||
for (const VkExtensionProperties &properties : property_list) { | ||
if (dcm_ext == properties.extensionName) { | ||
dcm_supported = true; | ||
} | ||
} | ||
} | ||
} | ||
if (!dcm_supported) { | ||
ReportSetupProblem( | ||
device, "Debug Printf with uncached buffer requires VK_AMD_device_coherent_memory which is not supported"); | ||
aborted = true; | ||
return; | ||
} | ||
} | ||
|
||
// See CreateDevice() in chassis.cpp. modified_ci is a pointer to a safe struct stored on the stack. | ||
// This code follows the safe struct memory memory management scheme. That is, we must delete any memory | ||
// remove from the safe struct, and any additions must be allocated in a way that is compatible with | ||
// the safe struct destructor. | ||
auto *modified_create_info = static_cast<safe_VkDeviceCreateInfo *>(modified_ci); | ||
|
||
bool found_ext = false; | ||
for (uint32_t i = 0; i < modified_create_info->enabledExtensionCount; i++) { | ||
if (dcm_ext == modified_create_info->ppEnabledExtensionNames[i]) { | ||
found_ext = true; | ||
break; | ||
} | ||
} | ||
if (!found_ext) { | ||
LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", "VK_AMD_device_coherent_memory extension not enabled but use_uncached_buffer is true. Forcing extension."); | ||
const char **ext_names = new const char *[modified_create_info->enabledExtensionCount + 1]; | ||
// Copy the existing pointer table | ||
std::copy(modified_create_info->ppEnabledExtensionNames, | ||
modified_create_info->ppEnabledExtensionNames + modified_create_info->enabledExtensionCount, ext_names); | ||
// Add our new extension | ||
char *dcm_ext_copy = new char[dcm_ext.size() + 1]{}; | ||
dcm_ext.copy(dcm_ext_copy, dcm_ext.size()); | ||
dcm_ext_copy[dcm_ext.size()] = '\0'; | ||
ext_names[modified_create_info->enabledExtensionCount] = dcm_ext_copy; | ||
// Patch up the safe struct | ||
delete[] modified_create_info->ppEnabledExtensionNames; | ||
modified_create_info->ppEnabledExtensionNames = ext_names; | ||
modified_create_info->enabledExtensionCount++; | ||
} | ||
auto *dcm_features = const_cast<VkPhysicalDeviceCoherentMemoryFeaturesAMD *>( | ||
LvlFindInChain<VkPhysicalDeviceCoherentMemoryFeaturesAMD>(modified_create_info)); | ||
if (dcm_features) { | ||
if (dcm_features->deviceCoherentMemory != VK_TRUE) { | ||
LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", | ||
"use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature."); | ||
dcm_features->deviceCoherentMemory = VK_TRUE; | ||
} | ||
} else { | ||
LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", | ||
"use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature."); | ||
auto new_dcm_features = LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>(); | ||
new_dcm_features.deviceCoherentMemory = VK_TRUE; | ||
new_dcm_features.pNext = const_cast<void *>(modified_create_info->pNext); | ||
modified_create_info->pNext = new VkPhysicalDeviceCoherentMemoryFeaturesAMD(new_dcm_features); | ||
} | ||
} | ||
} | ||
|
||
// Perform initializations that can be done at Create Device time. | ||
void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) { | ||
if (enabled[gpu_validation]) { | ||
|
@@ -42,7 +123,16 @@ void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) { | |
use_stdout = !stdout_string.compare("true"); | ||
if (getenv("DEBUG_PRINTF_TO_STDOUT")) use_stdout = true; | ||
|
||
// GpuAssistedBase::CreateDevice will set up bindings | ||
// Need to get this option again, because PreCallRecordCreateDevice was done | ||
// in separate DebugPrintf instance (during VkInstance creation). | ||
std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer"); | ||
vvl::ToLower(use_uncached_buffer_string); | ||
use_uncached_buffer = !use_uncached_buffer_string.compare("true"); | ||
if (use_uncached_buffer) { | ||
force_device_coherent_memory = true; // vma needs to know it. | ||
} | ||
|
||
// GpuAssistedBase::CreateDevice will set up bindings. | ||
VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, | ||
VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT | | ||
VK_SHADER_STAGE_TASK_BIT_EXT | VK_SHADER_STAGE_COMPUTE_BIT | | ||
|
@@ -138,6 +228,49 @@ void DebugPrintf::PreCallRecordCreateShaderModule(VkDevice device, const VkShade | |
} | ||
} | ||
|
||
// Override GpuAssistedBase version to allow processing in case of VK_ERROR_DEVICE_LOST when using uncached buffer. | ||
void DebugPrintf::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence, | ||
VkResult result) { | ||
ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result); | ||
|
||
bool device_lost = (result == VK_ERROR_DEVICE_LOST); | ||
|
||
if (aborted) return; | ||
|
||
if (result != VK_SUCCESS) { | ||
if (!use_uncached_buffer) { | ||
return; | ||
} else if (!device_lost) { | ||
return; // VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY | ||
} | ||
} | ||
|
||
bool buffers_present = false; | ||
// Don't QueueWaitIdle if there's nothing to process | ||
for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) { | ||
const VkSubmitInfo *submit = &pSubmits[submit_idx]; | ||
for (uint32_t i = 0; i < submit->commandBufferCount; i++) { | ||
buffers_present |= CommandBufferNeedsProcessing(submit->pCommandBuffers[i]); | ||
} | ||
} | ||
if (!buffers_present) return; | ||
|
||
if (!device_lost) { | ||
SubmitBarrier(queue); | ||
|
||
DispatchQueueWaitIdle(queue); /// @todo Dispatch wait idle only after SubmitBarrier() succeeded. | ||
} else { | ||
assert(use_uncached_buffer); | ||
} | ||
|
||
for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) { | ||
const VkSubmitInfo *submit = &pSubmits[submit_idx]; | ||
for (uint32_t i = 0; i < submit->commandBufferCount; i++) { | ||
ProcessCommandBuffer(queue, submit->pCommandBuffers[i]); | ||
} | ||
} | ||
} | ||
|
||
vartype vartype_lookup(char intype) { | ||
switch (intype) { | ||
case 'd': | ||
|
@@ -307,11 +440,14 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ | |
// 8 Printf Format String Id | ||
// 9 Printf Values Word 0 (optional) | ||
// 10 Printf Values Word 1 (optional) | ||
uint32_t expect = debug_output_buffer[1]; | ||
if (!expect) return; | ||
uint32_t expect = debug_output_buffer[spvtools::kDebugOutputSizeOffset]; | ||
// Total size of all messages are written by AtomicAdd. Atomics in uncached memory seems to be working in caches anyway | ||
// and are not flushed to uncached memory at the end. In that case, expect will contain zero. | ||
// As a WA just parse messages using individual sizes (written correctly). | ||
if (!expect && !use_uncached_buffer) return; | ||
|
||
uint32_t index = spvtools::kDebugOutputDataOffset; | ||
while (debug_output_buffer[index]) { | ||
while ((index < output_buffer_size) && debug_output_buffer[index]) { | ||
std::stringstream shader_message; | ||
VkShaderModule shader_module_handle = VK_NULL_HANDLE; | ||
VkPipeline pipeline_handle = VK_NULL_HANDLE; | ||
|
@@ -412,11 +548,19 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ | |
} | ||
index += debug_record->size; | ||
} | ||
if ((index - spvtools::kDebugOutputDataOffset) != expect) { | ||
if ((use_uncached_buffer && (index >= output_buffer_size)) || | ||
(!use_uncached_buffer && (index - spvtools::kDebugOutputDataOffset) != expect)) { | ||
LogWarning(device, "UNASSIGNED-DEBUG-PRINTF", | ||
"WARNING - Debug Printf message was truncated, likely due to a buffer size that was too small for the message"); | ||
} | ||
memset(debug_output_buffer, 0, 4 * (debug_output_buffer[spvtools::kDebugOutputSizeOffset] + spvtools::kDebugOutputDataOffset)); | ||
|
||
if (use_uncached_buffer) { | ||
// WA for atomics. | ||
memset(debug_output_buffer, 0, output_buffer_size); | ||
} else { | ||
// Clear only written memory. | ||
memset(debug_output_buffer, 0, sizeof(uint32_t) * (expect + spvtools::kDebugOutputDataOffset)); | ||
} | ||
} | ||
|
||
// For the given command buffer, map its debug data buffers and read their contents for analysis. | ||
|
@@ -429,7 +573,6 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) { | |
uint32_t ray_trace_index = 0; | ||
|
||
for (auto &buffer_info : gpu_buffer_list) { | ||
char *data; | ||
|
||
uint32_t operation_index = 0; | ||
if (buffer_info.pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) { | ||
|
@@ -445,10 +588,16 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) { | |
assert(false); | ||
} | ||
|
||
VkResult result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation, (void **)&data); | ||
VkResult result = VK_SUCCESS; | ||
if (buffer_info.output_mem_block.data == nullptr) { | ||
result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation, | ||
(void **)&buffer_info.output_mem_block.data); | ||
} | ||
if (result == VK_SUCCESS) { | ||
device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index, (uint32_t *)data); | ||
device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index, | ||
(uint32_t *)buffer_info.output_mem_block.data); | ||
vmaUnmapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation); | ||
buffer_info.output_mem_block.data = nullptr; | ||
} | ||
} | ||
} | ||
|
@@ -672,6 +821,9 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer, | |
buffer_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; | ||
VmaAllocationCreateInfo alloc_info = {}; | ||
alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; | ||
if (use_uncached_buffer) { | ||
alloc_info.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD | VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You probably don't want to require UNCACHED here, since theoretically in future you might get COHERENT and CACHED memory. Uncached is meant to be informative, not something you generally rely upon unless you're doing some very specific performance tuning. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't quite understand. Here I want to get device uncached to read it after device is lost. |
||
} | ||
result = vmaCreateBuffer(vmaAllocator, &buffer_info, &alloc_info, &output_block.buffer, &output_block.allocation, nullptr); | ||
if (result != VK_SUCCESS) { | ||
ReportSetupProblem(device, "Unable to allocate device memory. Device could become unstable."); | ||
|
@@ -680,11 +832,15 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer, | |
} | ||
|
||
// Clear the output block to zeros so that only printf values from the gpu will be present | ||
uint32_t *data; | ||
result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&data)); | ||
result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&output_block.data)); | ||
if (result == VK_SUCCESS) { | ||
memset(data, 0, output_buffer_size); | ||
vmaUnmapMemory(vmaAllocator, output_block.allocation); | ||
memset(output_block.data, 0, output_buffer_size); | ||
// Mapping may fail after DEVICE_LOST. Keep it mapped for now in such case. | ||
// Will be unmapped in debug_printf_state::CommandBuffer::Process | ||
if (!use_uncached_buffer) { | ||
vmaUnmapMemory(vmaAllocator, output_block.allocation); | ||
output_block.data = nullptr; | ||
} | ||
} | ||
|
||
auto desc_writes = LvlInitStruct<VkWriteDescriptorSet>(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this loop will work if the output buffer has one message but isn't full. To see this, use uncached buffer memory in the test NegativeDebugPrintf.BasicUsage and run it. This loop doesn't stop when it's supposed to.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you check in which vkQueueSubmit/AnalyzeAndGenerateMessages you observe it? BasicUsage does it 15 times.
Here is how I understand it:
In my first one,
debug_output_buffer == 0x0000028b8e520000
Memory of that buffer is:
Index starts at 0x2. At the end of first loop iteration:
index += debug_record->size; // 0x2 + 0xa = 0xc
At second loop beginning
debug_output_buffer[index]
== 0x0000028b8e520030 {0x0}So the loop ends.
(index < output_buffer_size)
means "while we are still in the buffer"&& debug_output_buffer[index])
means "while next message is not empty"There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't have access to the system I was using to test this today - I'll get back to it tomorrow. The system I do have is getting VK_TIMEOUT instead of VK_ERROR_DEVICE_LOST or VK_SUCCESS at the vkQueueWaitIdle, and I had to add that to the test's expected results. You should add that to your test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I've added VK_TIMEOUT to asserts in test.
I assume you used AMD for that test. it happened to me once too.
VK_TIMEOUT should be not returned from vkQueueSubmit, vkDeviceWaitIdle or vkQueueWaitIdle.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see the problem. Because debug_output_buffer[1] comes back as 0, the memset at the end of message processing doesn't clear the buffer:
And the memset clears 4 * (0 + 2) bytes which doesn't cover all of the records. Then on the next printf, the loop depends on
debug_output_buffer[index]
being zero when the loop is supposed to stop, but previous printfs may have left values that didn't get memset to 0. I suppose one answer would be to memset the whole buffer.But I'm seeing another problem. If I modify your test to do two debug printfs:
I only get the second printf back, as if the first one was overwritten in the shader. Could that be a result of the atomic operation problem and debug_output_buffer[1] not getting written?
(This is all on an AMD RX6600 with driver 23.7.1)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The atomic add is performed by the instrumentation added to the shader via spirv-tools code, which I'm not very familiar with.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://github.com/KhronosGroup/SPIRV-Tools/blob/main/source/opt/instrument_pass.cpp#L823
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok so after a bit of digging, this extension was enabled before we had the MALL cache in hardware, and nobody ever went back and changed the interface to bypass MALL, so there IS caching enabled on this memory type for newer hardware. Whoops. I've filed an internal bug to fix this and to add a more comprehensive test to make sure this actually works on the off chance we make another big change to the caching hierarchy...
You can go look at the public driver source if for some reason you want to see where this is happening 🙃
I'll let you know when it's fixed and subsequently released.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Tobski I assume the fix is released now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A fix has been implemented but I'm not sure on release status, I'll take a look.