KhronosGroup · dorian-apanel-intel · Jul 12, 2023 · Jul 17, 2023 · Jul 19, 2023 · Jul 21, 2023
@@ -749,6 +749,26 @@
                                                 ]
                                             }
                                         },
+                                        {
+                                            "key": "printf_uncached_buffer",
+                                            "label": "Printf using uncached buffer (ALPHA)",
+                                            "description": "Use VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD (from VK_AMD_device_coherent_memory) to allocate the destination buffer. Slower, but useful when the instrumented shader causes VK_ERROR_DEVICE_LOST.",
+                                            "type": "BOOL",
+                                            "default": false,
+                                            "platforms": [
+                                                "WINDOWS",
+                                                "LINUX"
+                                            ],
+                                            "dependence": {
+                                                "mode": "ALL",
+                                                "settings": [
+                                                    {
+                                                        "key": "validate_gpu_based",
+                                                        "value": "GPU_BASED_DEBUG_PRINTF"
+                                                    }
+                                                ]
+                                            }
+                                        },
                                         {
                                             "key": "printf_verbose",
                                             "label": "Printf verbose",

@@ -21,6 +21,87 @@
 #include <iostream>
 #include "generated/layer_chassis_dispatch.h"
 
+void DebugPrintf::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *pCreateInfo,
+                                            const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_ci) {
+    GpuAssistedBase::PreCallRecordCreateDevice(gpu, pCreateInfo, pAllocator, pDevice, modified_ci);
+
+    std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
+    vvl::ToLower(use_uncached_buffer_string);
+    use_uncached_buffer = !use_uncached_buffer_string.compare("true");  // compare() returns 0 on equality, so only the value "true" sets the flag
+
+    if (use_uncached_buffer)  // Everything below only applies when the uncached buffer was requested.
+    {
+        static const std::string dcm_ext{"VK_AMD_device_coherent_memory"};
+        {  // Step 1: check that the physical device reports the extension; give up if it does not.
+            bool dcm_supported = false;
+            uint32_t property_count = 0;
+            if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, nullptr) == VK_SUCCESS) {  // count query only
+                std::vector<VkExtensionProperties> property_list(property_count);
+                if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, property_list.data()) ==
+                    VK_SUCCESS) {
+                    for (const VkExtensionProperties &properties : property_list) {
+                        if (dcm_ext == properties.extensionName) {
+                            dcm_supported = true;
+                        }
+                    }
+                }
+            }
+            if (!dcm_supported) {
+                ReportSetupProblem(
+                    device, "Debug Printf with uncached buffer requires VK_AMD_device_coherent_memory which is not supported");
+                aborted = true;  // NOTE(review): a comment in CreateDevice says this hook runs on a separate DebugPrintf instance - confirm this flag and the 'device' handle above are meaningful here
+                return;
+            }
+        }
+
+        // See CreateDevice() in chassis.cpp. modified_ci is a pointer to a safe struct stored on the stack.
+        // This code follows the safe struct memory management scheme. That is, we must delete any memory
+        // removed from the safe struct, and any additions must be allocated in a way that is compatible with
+        // the safe struct destructor.
+        auto *modified_create_info = static_cast<safe_VkDeviceCreateInfo *>(modified_ci);
+
+        bool found_ext = false;  // Step 2: make sure the extension is in the enabled-extension list.
+        for (uint32_t i = 0; i < modified_create_info->enabledExtensionCount; i++) {
+            if (dcm_ext == modified_create_info->ppEnabledExtensionNames[i]) {
+                found_ext = true;
+                break;
+            }
+        }
+        if (!found_ext) {
+            LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", "VK_AMD_device_coherent_memory extension not enabled but use_uncached_buffer is true. Forcing extension.");
+            const char **ext_names = new const char *[modified_create_info->enabledExtensionCount + 1];
+            // Copy the existing pointer table (only the pointers; the strings themselves are not duplicated)
+            std::copy(modified_create_info->ppEnabledExtensionNames,
+                      modified_create_info->ppEnabledExtensionNames + modified_create_info->enabledExtensionCount, ext_names);
+            // Add our new extension as a heap-allocated, NUL-terminated copy of its name
+            char *dcm_ext_copy = new char[dcm_ext.size() + 1]{};
+            dcm_ext.copy(dcm_ext_copy, dcm_ext.size());
+            dcm_ext_copy[dcm_ext.size()] = '\0';
+            ext_names[modified_create_info->enabledExtensionCount] = dcm_ext_copy;
+            // Patch up the safe struct: free the old pointer table and install the extended one
+            delete[] modified_create_info->ppEnabledExtensionNames;
+            modified_create_info->ppEnabledExtensionNames = ext_names;
+            modified_create_info->enabledExtensionCount++;
+        }
+        auto *dcm_features = const_cast<VkPhysicalDeviceCoherentMemoryFeaturesAMD *>(  // Step 3: make sure the feature itself is enabled.
+            LvlFindInChain<VkPhysicalDeviceCoherentMemoryFeaturesAMD>(modified_create_info));
+        if (dcm_features) {  // A feature struct is already in the pNext chain: just switch the feature on if needed.
+            if (dcm_features->deviceCoherentMemory != VK_TRUE) {
+                LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
+                        "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
+                dcm_features->deviceCoherentMemory = VK_TRUE;
+            }
+        } else {  // Not in the chain: prepend a new heap-allocated struct with the feature enabled.
+            LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
+                    "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
+            auto new_dcm_features = LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>();
+            new_dcm_features.deviceCoherentMemory = VK_TRUE;
+            new_dcm_features.pNext = const_cast<void *>(modified_create_info->pNext);
+            modified_create_info->pNext = new VkPhysicalDeviceCoherentMemoryFeaturesAMD(new_dcm_features);
+        }
+    }
+}
+
 // Perform initializations that can be done at Create Device time.
 void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
     if (enabled[gpu_validation]) {
@@ -42,7 +123,16 @@ void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
     use_stdout = !stdout_string.compare("true");
     if (getenv("DEBUG_PRINTF_TO_STDOUT")) use_stdout = true;
 
-    // GpuAssistedBase::CreateDevice will set up bindings
+    // Need to get this option again, because PreCallRecordCreateDevice was run
+    // in a separate DebugPrintf instance (created during VkInstance creation).
+    std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
+    vvl::ToLower(use_uncached_buffer_string);
+    use_uncached_buffer = !use_uncached_buffer_string.compare("true");
+    if (use_uncached_buffer) {
+        force_device_coherent_memory = true; // vma needs to know it.
+    }
+
+    // GpuAssistedBase::CreateDevice will set up bindings.
     VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
                                             VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT |
                                                 VK_SHADER_STAGE_TASK_BIT_EXT | VK_SHADER_STAGE_COMPUTE_BIT |
@@ -138,6 +228,49 @@ void DebugPrintf::PreCallRecordCreateShaderModule(VkDevice device, const VkShade
     }
 }
 
+// Overrides the GpuAssistedBase version so that command buffers are still processed when the submit returned VK_ERROR_DEVICE_LOST and the uncached buffer is in use.
+void DebugPrintf::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence,
+                                            VkResult result) {
+    ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result);  // calls the ValidationStateTracker implementation directly
+
+    bool device_lost = (result == VK_ERROR_DEVICE_LOST);
+
+    if (aborted) return;
+
+    if (result != VK_SUCCESS) {  // Failed submit: continue only for VK_ERROR_DEVICE_LOST with the uncached buffer enabled.
+        if (!use_uncached_buffer) {
+            return;
+        } else if (!device_lost) {
+            return;  // VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY
+        }
+    }
+
+    bool buffers_present = false;
+    // Don't QueueWaitIdle if there's nothing to process
+    for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
+        const VkSubmitInfo *submit = &pSubmits[submit_idx];
+        for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
+            buffers_present |= CommandBufferNeedsProcessing(submit->pCommandBuffers[i]);
+        }
+    }
+    if (!buffers_present) return;
+
+    if (!device_lost) {  // Only reachable with result == VK_SUCCESS: submit a barrier and wait for the queue before reading results.
+        SubmitBarrier(queue);
+
+        DispatchQueueWaitIdle(queue); /// @todo Dispatch wait idle only after SubmitBarrier() succeeded.
+    } else {  // Device lost: no barrier and no wait; go straight to processing.
+        assert(use_uncached_buffer);  // guaranteed by the early return above for failed submits without the uncached buffer
+    }
+
+    for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {  // process every command buffer of every submit
+        const VkSubmitInfo *submit = &pSubmits[submit_idx];
+        for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
+            ProcessCommandBuffer(queue, submit->pCommandBuffers[i]);
+        }
+    }
+}
+
 vartype vartype_lookup(char intype) {
     switch (intype) {
         case 'd':
@@ -307,11 +440,14 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
     //    8         Printf Format String Id
     //    9         Printf Values Word 0 (optional)
     //    10         Printf Values Word 1 (optional)
-    uint32_t expect = debug_output_buffer[1];
-    if (!expect) return;
+    uint32_t expect = debug_output_buffer[spvtools::kDebugOutputSizeOffset];
+    // The total size of all messages is written by AtomicAdd. Atomics in uncached memory seem to operate in caches anyway
+    // and are not flushed to uncached memory at the end. In that case, expect will contain zero.
+    // As a workaround (WA), just parse the messages using their individual sizes (which are written correctly).
+    if (!expect && !use_uncached_buffer) return;
 
     uint32_t index = spvtools::kDebugOutputDataOffset;
-    while (debug_output_buffer[index]) {
+    while ((index < output_buffer_size) && debug_output_buffer[index]) {
         std::stringstream shader_message;
         VkShaderModule shader_module_handle = VK_NULL_HANDLE;
         VkPipeline pipeline_handle = VK_NULL_HANDLE;
@@ -412,11 +548,19 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
         }
         index += debug_record->size;
     }
-    if ((index - spvtools::kDebugOutputDataOffset) != expect) {
+    if ((use_uncached_buffer && (index >= output_buffer_size)) ||
+        (!use_uncached_buffer && (index - spvtools::kDebugOutputDataOffset) != expect)) {
         LogWarning(device, "UNASSIGNED-DEBUG-PRINTF",
                    "WARNING - Debug Printf message was truncated, likely due to a buffer size that was too small for the message");
     }
-    memset(debug_output_buffer, 0, 4 * (debug_output_buffer[spvtools::kDebugOutputSizeOffset] + spvtools::kDebugOutputDataOffset));
+
+    if (use_uncached_buffer) {
+        // Workaround (WA) for atomics: the total-size word cannot be trusted here, so clear the whole buffer.
+        memset(debug_output_buffer, 0, output_buffer_size);
+    } else {
+        // Clear only written memory.
+        memset(debug_output_buffer, 0, sizeof(uint32_t) * (expect + spvtools::kDebugOutputDataOffset));
+    }
 }
 
 // For the given command buffer, map its debug data buffers and read their contents for analysis.
@@ -429,7 +573,6 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
         uint32_t ray_trace_index = 0;
 
         for (auto &buffer_info : gpu_buffer_list) {
-            char *data;
 
             uint32_t operation_index = 0;
             if (buffer_info.pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
@@ -445,10 +588,16 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
                 assert(false);
             }
 
-            VkResult result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation, (void **)&data);
+            VkResult result = VK_SUCCESS;
+            if (buffer_info.output_mem_block.data == nullptr) {
+                result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation,
+                                      (void **)&buffer_info.output_mem_block.data);
+            }
             if (result == VK_SUCCESS) {
-                device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index, (uint32_t *)data);
+                device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index,
+                                                         (uint32_t *)buffer_info.output_mem_block.data);
                 vmaUnmapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation);
+                buffer_info.output_mem_block.data = nullptr;
             }
         }
     }
@@ -672,6 +821,9 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
     buffer_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
     VmaAllocationCreateInfo alloc_info = {};
     alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+    if (use_uncached_buffer) {
+        alloc_info.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD | VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD;
+    }
     result = vmaCreateBuffer(vmaAllocator, &buffer_info, &alloc_info, &output_block.buffer, &output_block.allocation, nullptr);
     if (result != VK_SUCCESS) {
         ReportSetupProblem(device, "Unable to allocate device memory.  Device could become unstable.");
@@ -680,11 +832,15 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
     }
 
     // Clear the output block to zeros so that only printf values from the gpu will be present
-    uint32_t *data;
-    result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&data));
+    result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&output_block.data));
     if (result == VK_SUCCESS) {
-        memset(data, 0, output_buffer_size);
-        vmaUnmapMemory(vmaAllocator, output_block.allocation);
+        memset(output_block.data, 0, output_buffer_size);
+        // Mapping may fail after DEVICE_LOST, so keep the buffer mapped when the uncached buffer is used.
+        // It will be unmapped in debug_printf_state::CommandBuffer::Process.
+        if (!use_uncached_buffer) {
+            vmaUnmapMemory(vmaAllocator, output_block.allocation);
+            output_block.data = nullptr;
+        }
     }
 
     auto desc_writes = LvlInitStruct<VkWriteDescriptorSet>();

@@ -23,6 +23,7 @@ class DebugPrintf;
 struct DPFDeviceMemoryBlock {
     VkBuffer buffer;
     VmaAllocation allocation;
+    uint32_t* data; // Host mapping of the allocation; nullptr while unmapped. Stays mapped from allocation until processing only when the uncached buffer is used, because mapping may fail after the device is lost.
 };
 
 struct DPFBufferInfo {
@@ -86,12 +87,16 @@ class DebugPrintf : public GpuAssistedBase {
         desired_features.fragmentStoresAndAtomics = true;
     }
 
+    void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
+                                   const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_ci) override;
     void CreateDevice(const VkDeviceCreateInfo* pCreateInfo) override;
     bool InstrumentShader(const vvl::span<const uint32_t>& input, std::vector<uint32_t>& new_pgm,
                           uint32_t* unique_shader_id) override;
     void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo,
                                          const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule,
                                          void* csm_state_data) override;
+    void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits,
+                                                    VkFence fence, VkResult result) override;
     std::vector<DPFSubstring> ParseFormatString(const std::string& format_string);
     std::string FindFormatString(vvl::span<const uint32_t> pgm, uint32_t string_id);
     void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, DPFBufferInfo& buffer_info,
@@ -172,6 +177,7 @@ class DebugPrintf : public GpuAssistedBase {
     void DestroyBuffer(DPFBufferInfo& buffer_info);
 
   private:
+    bool use_uncached_buffer = false;
     bool verbose = false;
     bool use_stdout = false;
 };
@@ -192,7 +192,8 @@ static VKAPI_ATTR void VKAPI_CALL gpuVkCmdCopyBuffer(VkCommandBuffer commandBuff
     DispatchCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, regionCount, pRegions);
 }
 
-VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address, VmaAllocator *pAllocator) {
+VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
+                           bool use_device_coherent_memory, VmaAllocator *pAllocator) {
     VmaVulkanFunctions functions;
     VmaAllocatorCreateInfo allocator_info = {};
     allocator_info.instance = instance;
@@ -203,6 +204,10 @@ VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device
         allocator_info.flags |= VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT;
     }
 
+    if (use_device_coherent_memory) {
+        allocator_info.flags |= VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT;
+    }
+
     functions.vkGetInstanceProcAddr = static_cast<PFN_vkGetInstanceProcAddr>(gpuVkGetInstanceProcAddr);
     functions.vkGetDeviceProcAddr = static_cast<PFN_vkGetDeviceProcAddr>(gpuVkGetDeviceProcAddr);
     functions.vkGetPhysicalDeviceProperties = static_cast<PFN_vkGetPhysicalDeviceProperties>(gpuVkGetPhysicalDeviceProperties);
@@ -373,7 +378,8 @@ void GpuAssistedBase::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
     }
     desc_set_bind_index = adjusted_max_desc_sets - 1;
 
-    VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address, &vmaAllocator);
+    VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address,
+                                         force_device_coherent_memory, &vmaAllocator);
     assert(result1 == VK_SUCCESS);
     desc_set_manager = std::make_unique<UtilDescriptorSetManager>(device, static_cast<uint32_t>(bindings_.size()));
 

@@ -79,7 +79,7 @@ VALSTATETRACK_DERIVED_STATE_OBJECT(VkQueue, gpu_utils_state::Queue, QUEUE_STATE)
 VALSTATETRACK_DERIVED_STATE_OBJECT(VkCommandBuffer, gpu_utils_state::CommandBuffer, CMD_BUFFER_STATE)
 
 VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
-                           VmaAllocator *pAllocator);
+                           bool use_device_coherent_memory, VmaAllocator *pAllocator);
 
 void UtilGenerateStageMessage(const uint32_t *debug_record, std::string &msg);
 void UtilGenerateCommonMessage(const debug_report_data *report_data, const VkCommandBuffer commandBuffer,
@@ -216,6 +216,7 @@ class GpuAssistedBase : public ValidationStateTracker {
   public:
     bool aborted = false;
     bool force_buffer_device_address;
+    bool force_device_coherent_memory = false;
     PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData;
     const char *setup_vuid;
     VkPhysicalDeviceFeatures supported_features{};