From 090d262afe5d9b6a5c9d29fe15a4ec8d17ab7eda Mon Sep 17 00:00:00 2001
From: Dorian R Apanel <dorian.r.apanel@intel.com>
Date: Wed, 12 Jul 2023 14:35:34 +0200
Subject: [PATCH 1/4] layers: Allow using uncached buffer for Debug Printf

Adds option to force using AMD_DEVICE_COHERENT_MEMORY
for debug printf buffer to print messages
even if VK_ERROR_DEVICE_LOST is encountered.

* Option added to layer json to be visible in vkconfig,
* Forcing extension and device feature if not enabled by application,
* Added workaround for atomic operations in uncached memory
being in cache anyway,
* Added workaround to failing MapMemory after
DEVICE_LOST (occurs on AMD): When using uncached buffer,
do not unmap buffer until messages are analyzed.
---
 layers/VkLayer_khronos_validation.json.in |  20 +++
 layers/gpu_validation/debug_printf.cpp    | 172 ++++++++++++++++++++--
 layers/gpu_validation/debug_printf.h      |   6 +
 layers/gpu_validation/gpu_utils.cpp       |  10 +-
 layers/gpu_validation/gpu_utils.h         |   3 +-
 layers/layer_options.cpp                  |   9 ++
 tests/framework/layer_validation_tests.h  |   2 +-
 tests/unit/debug_printf.cpp               | 135 ++++++++++++++++-
 8 files changed, 341 insertions(+), 16 deletions(-)
diff --git a/layers/VkLayer_khronos_validation.json.in b/layers/VkLayer_khronos_validation.json.in
index d56708b50f6..ce1ae254aa9 100644
--- a/layers/VkLayer_khronos_validation.json.in
+++ b/layers/VkLayer_khronos_validation.json.in
@@ -749,6 +749,26 @@
                                                 ]
                                             }
                                         },
+                                        {
+                                            "key": "printf_uncached_buffer",
+                                            "label": "Printf using uncached buffer (ALPHA)",
+                                            "description": "Use VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD(from VK_AMD_device_coherent_memory) to allocate destination buffer. Slower, but useful in case of instrumenting shader causing VK_ERROR_DEVICE_LOST.",
+                                            "type": "BOOL",
+                                            "default": false,
+                                            "platforms": [
+                                                "WINDOWS",
+                                                "LINUX"
+                                            ],
+                                            "dependence": {
+                                                "mode": "ALL",
+                                                "settings": [
+                                                    {
+                                                        "key": "validate_gpu_based",
+                                                        "value": "GPU_BASED_DEBUG_PRINTF"
+                                                    }
+                                                ]
+                                            }
+                                        },
                                         {
                                             "key": "printf_verbose",
                                             "label": "Printf verbose",
diff --git a/layers/gpu_validation/debug_printf.cpp b/layers/gpu_validation/debug_printf.cpp
index 66e3f655cac..e1efaef7d2d 100644
--- a/layers/gpu_validation/debug_printf.cpp
+++ b/layers/gpu_validation/debug_printf.cpp
@@ -21,6 +21,87 @@
 #include <iostream>
 #include "generated/layer_chassis_dispatch.h"
 
+void DebugPrintf::PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo *pCreateInfo,
+                                            const VkAllocationCallbacks *pAllocator, VkDevice *pDevice, void *modified_ci) {
+    GpuAssistedBase::PreCallRecordCreateDevice(gpu, pCreateInfo, pAllocator, pDevice, modified_ci);
+
+    std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
+    vvl::ToLower(use_uncached_buffer_string);
+    use_uncached_buffer = !use_uncached_buffer_string.compare("true");
+
+    if (use_uncached_buffer)
+    {
+        static const std::string dcm_ext{"VK_AMD_device_coherent_memory"};
+        {
+            bool dcm_supported = false;
+            uint32_t property_count = 0;
+            if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, nullptr) == VK_SUCCESS) {
+                std::vector<VkExtensionProperties> property_list(property_count);
+                if (DispatchEnumerateDeviceExtensionProperties(gpu, nullptr, &property_count, property_list.data()) ==
+                    VK_SUCCESS) {
+                    for (const VkExtensionProperties &properties : property_list) {
+                        if (dcm_ext == properties.extensionName) {
+                            dcm_supported = true;
+                        }
+                    }
+                }
+            }
+            if (!dcm_supported) {
+                ReportSetupProblem(
+                    device, "Debug Printf with uncached buffer requires VK_AMD_device_coherent_memory which is not supported");
+                aborted = true;
+                return;
+            }
+        }
+
+        // See CreateDevice() in chassis.cpp. modified_ci is a pointer to a safe struct stored on the stack.
+        // This code follows the safe struct memory memory management scheme. That is, we must delete any memory
+        // remove from the safe struct, and any additions must be allocated in a way that is compatible with
+        // the safe struct destructor.
+        auto *modified_create_info = static_cast<safe_VkDeviceCreateInfo *>(modified_ci);
+
+        bool found_ext = false;
+        for (uint32_t i = 0; i < modified_create_info->enabledExtensionCount; i++) {
+            if (dcm_ext == modified_create_info->ppEnabledExtensionNames[i]) {
+                found_ext = true;
+                break;
+            }
+        }
+        if (!found_ext) {
+            LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF", "VK_AMD_device_coherent_memory extension not enabled but use_uncached_buffer is true. Forcing extension.");
+            const char **ext_names = new const char *[modified_create_info->enabledExtensionCount + 1];
+            // Copy the existing pointer table
+            std::copy(modified_create_info->ppEnabledExtensionNames,
+                      modified_create_info->ppEnabledExtensionNames + modified_create_info->enabledExtensionCount, ext_names);
+            // Add our new extension
+            char *dcm_ext_copy = new char[dcm_ext.size() + 1]{};
+            dcm_ext.copy(dcm_ext_copy, dcm_ext.size());
+            dcm_ext_copy[dcm_ext.size()] = '\0';
+            ext_names[modified_create_info->enabledExtensionCount] = dcm_ext_copy;
+            // Patch up the safe struct
+            delete[] modified_create_info->ppEnabledExtensionNames;
+            modified_create_info->ppEnabledExtensionNames = ext_names;
+            modified_create_info->enabledExtensionCount++;
+        }
+        auto *dcm_features = const_cast<VkPhysicalDeviceCoherentMemoryFeaturesAMD *>(
+            LvlFindInChain<VkPhysicalDeviceCoherentMemoryFeaturesAMD>(modified_create_info));
+        if (dcm_features) {
+            if (dcm_features->deviceCoherentMemory != VK_TRUE) {
+                LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
+                        "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
+                dcm_features->deviceCoherentMemory = VK_TRUE;
+            }
+        } else {
+            LogInfo(gpu, "UNASSIGNED-DEBUG-PRINTF",
+                    "use_uncached_buffer is true, but deviceCoherentMemory feature is not enabled. Force enabling feature.");
+            auto new_dcm_features = LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>();
+            new_dcm_features.deviceCoherentMemory = VK_TRUE;
+            new_dcm_features.pNext = const_cast<void *>(modified_create_info->pNext);
+            modified_create_info->pNext = new VkPhysicalDeviceCoherentMemoryFeaturesAMD(new_dcm_features);
+        }
+    }
+}
+
 // Perform initializations that can be done at Create Device time.
 void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
     if (enabled[gpu_validation]) {
@@ -42,7 +123,16 @@ void DebugPrintf::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
     use_stdout = !stdout_string.compare("true");
     if (getenv("DEBUG_PRINTF_TO_STDOUT")) use_stdout = true;
 
-    // GpuAssistedBase::CreateDevice will set up bindings
+    // Need to get this option again, because PreCallRecordCreateDevice was done
+    // in separate DebugPrintf instance (during VkInstance creation).
+    std::string use_uncached_buffer_string = getLayerOption("khronos_validation.printf_uncached_buffer");
+    vvl::ToLower(use_uncached_buffer_string);
+    use_uncached_buffer = !use_uncached_buffer_string.compare("true");
+    if (use_uncached_buffer) {
+        force_device_coherent_memory = true; // vma needs to know it.
+    }
+
+    // GpuAssistedBase::CreateDevice will set up bindings.
     VkDescriptorSetLayoutBinding binding = {3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1,
                                             VK_SHADER_STAGE_ALL_GRAPHICS | VK_SHADER_STAGE_MESH_BIT_EXT |
                                                 VK_SHADER_STAGE_TASK_BIT_EXT | VK_SHADER_STAGE_COMPUTE_BIT |
@@ -138,6 +228,49 @@ void DebugPrintf::PreCallRecordCreateShaderModule(VkDevice device, const VkShade
     }
 }
 
+// Override GpuAssistedBase version to allow processing in case of VK_ERROR_DEVICE_LOST when using uncached buffer.
+void DebugPrintf::PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo *pSubmits, VkFence fence,
+                                            VkResult result) {
+    ValidationStateTracker::PostCallRecordQueueSubmit(queue, submitCount, pSubmits, fence, result);
+
+    bool device_lost = (result == VK_ERROR_DEVICE_LOST);
+
+    if (aborted) return;
+
+    if (result != VK_SUCCESS) {
+        if (!use_uncached_buffer) {
+            return;
+        } else if (!device_lost) {
+            return;  // VK_ERROR_OUT_OF_HOST_MEMORY or VK_ERROR_OUT_OF_DEVICE_MEMORY
+        }
+    }
+
+    bool buffers_present = false;
+    // Don't QueueWaitIdle if there's nothing to process
+    for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
+        const VkSubmitInfo *submit = &pSubmits[submit_idx];
+        for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
+            buffers_present |= CommandBufferNeedsProcessing(submit->pCommandBuffers[i]);
+        }
+    }
+    if (!buffers_present) return;
+
+    if (!device_lost) {
+        SubmitBarrier(queue);
+
+        DispatchQueueWaitIdle(queue); /// @todo Dispatch wait idle only after SubmitBarrier() succeeded.
+    } else {
+        assert(use_uncached_buffer);
+    }
+
+    for (uint32_t submit_idx = 0; submit_idx < submitCount; submit_idx++) {
+        const VkSubmitInfo *submit = &pSubmits[submit_idx];
+        for (uint32_t i = 0; i < submit->commandBufferCount; i++) {
+            ProcessCommandBuffer(queue, submit->pCommandBuffers[i]);
+        }
+    }
+}
+
 vartype vartype_lookup(char intype) {
     switch (intype) {
         case 'd':
@@ -308,10 +441,14 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
     //    9         Printf Values Word 0 (optional)
     //    10         Printf Values Word 1 (optional)
     uint32_t expect = debug_output_buffer[1];
-    if (!expect) return;
+    // Total size of all messages are written by AtomicAdd. Atomics in uncached memory seems to be working in caches anyway
+    // and are not flushed to uncached memory at the end. In that case, expect will contain zero.
+    // As a WA just parse messages using individual sizes (written correctly).
+    // Can the messages be overridden because of those atomics?
+    if (!expect && !use_uncached_buffer) return;
 
     uint32_t index = spvtools::kDebugOutputDataOffset;
-    while (debug_output_buffer[index]) {
+    while ((index < output_buffer_size) && debug_output_buffer[index]) {
         std::stringstream shader_message;
         VkShaderModule shader_module_handle = VK_NULL_HANDLE;
         VkPipeline pipeline_handle = VK_NULL_HANDLE;
@@ -412,7 +549,8 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
         }
         index += debug_record->size;
     }
-    if ((index - spvtools::kDebugOutputDataOffset) != expect) {
+    if ((use_uncached_buffer && (index >= output_buffer_size)) ||
+        (!use_uncached_buffer && (index - spvtools::kDebugOutputDataOffset) != expect)) {
         LogWarning(device, "UNASSIGNED-DEBUG-PRINTF",
                    "WARNING - Debug Printf message was truncated, likely due to a buffer size that was too small for the message");
     }
@@ -429,7 +567,6 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
         uint32_t ray_trace_index = 0;
 
         for (auto &buffer_info : gpu_buffer_list) {
-            char *data;
 
             uint32_t operation_index = 0;
             if (buffer_info.pipeline_bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS) {
@@ -445,10 +582,16 @@ void debug_printf_state::CommandBuffer::Process(VkQueue queue) {
                 assert(false);
             }
 
-            VkResult result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation, (void **)&data);
+            VkResult result = VK_SUCCESS;
+            if (buffer_info.output_mem_block.data == nullptr) {
+                result = vmaMapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation,
+                                      (void **)&buffer_info.output_mem_block.data);
+            }
             if (result == VK_SUCCESS) {
-                device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index, (uint32_t *)data);
+                device_state->AnalyzeAndGenerateMessages(commandBuffer(), queue, buffer_info, operation_index,
+                                                         (uint32_t *)buffer_info.output_mem_block.data);
                 vmaUnmapMemory(device_state->vmaAllocator, buffer_info.output_mem_block.allocation);
+                buffer_info.output_mem_block.data = nullptr;
             }
         }
     }
@@ -672,6 +815,9 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
     buffer_info.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
     VmaAllocationCreateInfo alloc_info = {};
     alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
+    if (use_uncached_buffer) {
+        alloc_info.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_COHERENT_BIT_AMD | VK_MEMORY_PROPERTY_DEVICE_UNCACHED_BIT_AMD;
+    }
     result = vmaCreateBuffer(vmaAllocator, &buffer_info, &alloc_info, &output_block.buffer, &output_block.allocation, nullptr);
     if (result != VK_SUCCESS) {
         ReportSetupProblem(device, "Unable to allocate device memory.  Device could become unstable.");
@@ -680,11 +826,15 @@ void DebugPrintf::AllocateDebugPrintfResources(const VkCommandBuffer cmd_buffer,
     }
 
     // Clear the output block to zeros so that only printf values from the gpu will be present
-    uint32_t *data;
-    result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&data));
+    result = vmaMapMemory(vmaAllocator, output_block.allocation, reinterpret_cast<void **>(&output_block.data));
     if (result == VK_SUCCESS) {
-        memset(data, 0, output_buffer_size);
-        vmaUnmapMemory(vmaAllocator, output_block.allocation);
+        memset(output_block.data, 0, output_buffer_size);
+        // Mapping may fail after DEVICE_LOST. Keep it mapped for now in such case.
+        // Will be unmapped in debug_printf_state::CommandBuffer::Process
+        if (!use_uncached_buffer) {
+            vmaUnmapMemory(vmaAllocator, output_block.allocation);
+            output_block.data = nullptr;
+        }
     }
 
     auto desc_writes = LvlInitStruct<VkWriteDescriptorSet>();
diff --git a/layers/gpu_validation/debug_printf.h b/layers/gpu_validation/debug_printf.h
index 7b0921747ad..6bb8e206e01 100644
--- a/layers/gpu_validation/debug_printf.h
+++ b/layers/gpu_validation/debug_printf.h
@@ -23,6 +23,7 @@ class DebugPrintf;
 struct DPFDeviceMemoryBlock {
     VkBuffer buffer;
     VmaAllocation allocation;
+    uint32_t* data; // only valid if using uncached buffer, because mapping may fail after device is lost.
 };
 
 struct DPFBufferInfo {
@@ -86,12 +87,16 @@ class DebugPrintf : public GpuAssistedBase {
         desired_features.fragmentStoresAndAtomics = true;
     }
 
+    void PreCallRecordCreateDevice(VkPhysicalDevice gpu, const VkDeviceCreateInfo* pCreateInfo,
+                                   const VkAllocationCallbacks* pAllocator, VkDevice* pDevice, void* modified_ci) override;
     void CreateDevice(const VkDeviceCreateInfo* pCreateInfo) override;
     bool InstrumentShader(const vvl::span<const uint32_t>& input, std::vector<uint32_t>& new_pgm,
                           uint32_t* unique_shader_id) override;
     void PreCallRecordCreateShaderModule(VkDevice device, const VkShaderModuleCreateInfo* pCreateInfo,
                                          const VkAllocationCallbacks* pAllocator, VkShaderModule* pShaderModule,
                                          void* csm_state_data) override;
+    void PostCallRecordQueueSubmit(VkQueue queue, uint32_t submitCount, const VkSubmitInfo* pSubmits,
+                                                    VkFence fence, VkResult result) override;
     std::vector<DPFSubstring> ParseFormatString(const std::string& format_string);
     std::string FindFormatString(vvl::span<const uint32_t> pgm, uint32_t string_id);
     void AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQueue queue, DPFBufferInfo& buffer_info,
@@ -172,6 +177,7 @@ class DebugPrintf : public GpuAssistedBase {
     void DestroyBuffer(DPFBufferInfo& buffer_info);
 
   private:
+    bool use_uncached_buffer = false;
     bool verbose = false;
     bool use_stdout = false;
 };
diff --git a/layers/gpu_validation/gpu_utils.cpp b/layers/gpu_validation/gpu_utils.cpp
index 76ef2d2658e..0fde38c8993 100644
--- a/layers/gpu_validation/gpu_utils.cpp
+++ b/layers/gpu_validation/gpu_utils.cpp
@@ -192,7 +192,8 @@ static VKAPI_ATTR void VKAPI_CALL gpuVkCmdCopyBuffer(VkCommandBuffer commandBuff
     DispatchCmdCopyBuffer(commandBuffer, srcBuffer, dstBuffer, regionCount, pRegions);
 }
 
-VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address, VmaAllocator *pAllocator) {
+VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
+                           bool use_device_coherent_memory, VmaAllocator *pAllocator) {
     VmaVulkanFunctions functions;
     VmaAllocatorCreateInfo allocator_info = {};
     allocator_info.instance = instance;
@@ -203,6 +204,10 @@ VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device
         allocator_info.flags |= VMA_ALLOCATOR_CREATE_BUFFER_DEVICE_ADDRESS_BIT;
     }
 
+    if (use_device_coherent_memory) {
+        allocator_info.flags |= VMA_ALLOCATOR_CREATE_AMD_DEVICE_COHERENT_MEMORY_BIT;
+    }
+
     functions.vkGetInstanceProcAddr = static_cast<PFN_vkGetInstanceProcAddr>(gpuVkGetInstanceProcAddr);
     functions.vkGetDeviceProcAddr = static_cast<PFN_vkGetDeviceProcAddr>(gpuVkGetDeviceProcAddr);
     functions.vkGetPhysicalDeviceProperties = static_cast<PFN_vkGetPhysicalDeviceProperties>(gpuVkGetPhysicalDeviceProperties);
@@ -373,7 +378,8 @@ void GpuAssistedBase::CreateDevice(const VkDeviceCreateInfo *pCreateInfo) {
     }
     desc_set_bind_index = adjusted_max_desc_sets - 1;
 
-    VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address, &vmaAllocator);
+    VkResult result1 = UtilInitializeVma(instance, physical_device, device, force_buffer_device_address,
+                                         force_device_coherent_memory, &vmaAllocator);
     assert(result1 == VK_SUCCESS);
     desc_set_manager = std::make_unique<UtilDescriptorSetManager>(device, static_cast<uint32_t>(bindings_.size()));
 
diff --git a/layers/gpu_validation/gpu_utils.h b/layers/gpu_validation/gpu_utils.h
index 78f086f71c6..fbb585085d8 100644
--- a/layers/gpu_validation/gpu_utils.h
+++ b/layers/gpu_validation/gpu_utils.h
@@ -79,7 +79,7 @@ VALSTATETRACK_DERIVED_STATE_OBJECT(VkQueue, gpu_utils_state::Queue, QUEUE_STATE)
 VALSTATETRACK_DERIVED_STATE_OBJECT(VkCommandBuffer, gpu_utils_state::CommandBuffer, CMD_BUFFER_STATE)
 
 VkResult UtilInitializeVma(VkInstance instance, VkPhysicalDevice physical_device, VkDevice device, bool use_buffer_device_address,
-                           VmaAllocator *pAllocator);
+                           bool use_device_coherent_memory, VmaAllocator *pAllocator);
 
 void UtilGenerateStageMessage(const uint32_t *debug_record, std::string &msg);
 void UtilGenerateCommonMessage(const debug_report_data *report_data, const VkCommandBuffer commandBuffer,
@@ -216,6 +216,7 @@ class GpuAssistedBase : public ValidationStateTracker {
   public:
     bool aborted = false;
     bool force_buffer_device_address;
+    bool force_device_coherent_memory = false;
     PFN_vkSetDeviceLoaderData vkSetDeviceLoaderData;
     const char *setup_vuid;
     VkPhysicalDeviceFeatures supported_features{};
diff --git a/layers/layer_options.cpp b/layers/layer_options.cpp
index 5b2b0f85fe3..6f33e8f0a2e 100644
--- a/layers/layer_options.cpp
+++ b/layers/layer_options.cpp
@@ -53,6 +53,8 @@ const char *SETTING_CUSTOM_STYPE_LIST = "custom_stype_list";
 const char *SETTING_DUPLICATE_MESSAGE_LIMIT = "duplicate_message_limit";
 const char *SETTING_FINE_GRAINED_LOCKING = "fine_grained_locking";
 
+const char *SETTING_DEBUG_PRINTF_UNCACHED_BUFFER = "printf_uncached_buffer";
+
 // Set the local disable flag for the appropriate VALIDATION_CHECK_DISABLE enum
 void SetValidationDisable(CHECK_DISABLED &disable_data, const ValidationCheckDisables disable_id) {
     switch (disable_id) {
@@ -373,6 +375,11 @@ static std::string GetConfigValue(const char *setting) {
     return getLayerOption(key.c_str());
 }
 
+static void SetConfigValue(const char *setting, const char* value) {
+    const std::string key(GetSettingKey(setting));
+    return setLayerOption(key.c_str(), value);
+}
+
 static std::string GetEnvVarValue(const char *setting) {
     std::string env_var = setting;
     vvl::ToUpper(env_var);
@@ -438,6 +445,8 @@ void ProcessConfigAndEnvSettings(ConfigAndEnvSettings *settings_data) {
                 CreateFilterMessageIdList(data, ",", settings_data->message_filter_list);
             } else if (name == SETTING_DUPLICATE_MESSAGE_LIMIT) {
                 *settings_data->duplicate_message_limit = cur_setting.data.value32;
+            } else if ((name == SETTING_DEBUG_PRINTF_UNCACHED_BUFFER) && cur_setting.data.valueBool) {
+                SetConfigValue(SETTING_DEBUG_PRINTF_UNCACHED_BUFFER, "true");
             } else if (name == SETTING_CUSTOM_STYPE_LIST) {
                 if (cur_setting.type == VK_LAYER_SETTING_VALUE_TYPE_STRING_ARRAY_EXT) {
                     std::string data(cur_setting.data.arrayString.pCharArray);
diff --git a/tests/framework/layer_validation_tests.h b/tests/framework/layer_validation_tests.h
index b3af136f0be..50febd35dab 100644
--- a/tests/framework/layer_validation_tests.h
+++ b/tests/framework/layer_validation_tests.h
@@ -280,7 +280,7 @@ class VkGpuAssistedLayerTest : public VkLayerTest {
 
 class NegativeDebugPrintf : public VkLayerTest {
   public:
-    void InitDebugPrintfFramework();
+    void InitDebugPrintfFramework(bool use_uncached_buffer = false);
 
   protected:
 };
diff --git a/tests/unit/debug_printf.cpp b/tests/unit/debug_printf.cpp
index 042c9afe7a4..3fdef7e3751 100644
--- a/tests/unit/debug_printf.cpp
+++ b/tests/unit/debug_printf.cpp
@@ -13,7 +13,25 @@
 
 #include "../framework/layer_validation_tests.h"
 
-void NegativeDebugPrintf::InitDebugPrintfFramework() {
+class UncachedBufferSetting {
+  public:
+    UncachedBufferSetting(const bool use_uncached_buffer /*= false*/) {
+        setting_value_data.valueBool = use_uncached_buffer;
+
+        strncpy(setting_val.name, "printf_uncached_buffer", sizeof(setting_val.name));
+        setting_val.type = VK_LAYER_SETTING_VALUE_TYPE_BOOL_EXT;
+        setting_val.data = setting_value_data;
+        setting = {VK_STRUCTURE_TYPE_INSTANCE_LAYER_SETTINGS_EXT, nullptr, 1, &setting_val};
+    }
+    VkLayerSettingsEXT *pnext{&setting};
+
+  private:
+    VkLayerSettingValueDataEXT setting_value_data{};
+    VkLayerSettingValueEXT setting_val;
+    VkLayerSettingsEXT setting;
+};
+
+void NegativeDebugPrintf::InitDebugPrintfFramework(bool use_uncached_buffer) {
     VkValidationFeatureEnableEXT enables[] = {VK_VALIDATION_FEATURE_ENABLE_DEBUG_PRINTF_EXT};
     VkValidationFeatureDisableEXT disables[] = {
         VK_VALIDATION_FEATURE_DISABLE_THREAD_SAFETY_EXT, VK_VALIDATION_FEATURE_DISABLE_API_PARAMETERS_EXT,
@@ -24,6 +42,11 @@ void NegativeDebugPrintf::InitDebugPrintfFramework() {
     features.pEnabledValidationFeatures = enables;
     features.pDisabledValidationFeatures = disables;
 
+    auto uncached_buffer_setting = UncachedBufferSetting(use_uncached_buffer);
+    if (use_uncached_buffer) {
+        features.pNext = uncached_buffer_setting.pnext;
+    }
+
     InitFramework(m_errorMonitor, &features);
 }
 
@@ -1063,3 +1086,113 @@ TEST_F(NegativeDebugPrintf, GPLFragmentIndependentSets) {
     vk::QueueWaitIdle(m_device->m_queue);
     m_errorMonitor->VerifyFound();
 }
+
+TEST_F(NegativeDebugPrintf, UncachedBuffer) {
+    TEST_DESCRIPTION("Verify that calls to debugPrintfEXT are received in debug stream");
+    SetTargetApiVersion(VK_API_VERSION_1_1);
+    AddRequiredExtensions(VK_KHR_SHADER_NON_SEMANTIC_INFO_EXTENSION_NAME);
+    AddRequiredExtensions(VK_AMD_DEVICE_COHERENT_MEMORY_EXTENSION_NAME);  // It must be supported but if not enabled
+                                                                          // Debug Printf can force enable it.
+    InitDebugPrintfFramework(true /*use_uncached_buffer*/);
+
+    if (!AreRequiredExtensionsEnabled()) {
+        GTEST_SKIP() << RequiredExtensionsNotSupported() << " not supported";
+    }
+
+    // deviceCoherentMemory feature needs to be supported, but if not specified or enabled, Debug Printf will force enable it.
+    auto dcm_features = LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>();
+    auto features2 = GetPhysicalDeviceFeatures2(dcm_features);
+    if (dcm_features.deviceCoherentMemory == 0) {
+        GTEST_SKIP() << "deviceCoherentMemory feature not supported";
+    }
+
+    ASSERT_NO_FATAL_FAILURE(InitState(nullptr, &dcm_features, VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT));
+    if (DeviceValidationVersion() < VK_API_VERSION_1_1) {
+        GTEST_SKIP() << "At least Vulkan version 1.1 is required";
+    }
+
+    auto features = m_device->phy().features();
+    if (!features.vertexPipelineStoresAndAtomics || !features.fragmentStoresAndAtomics) {
+        GTEST_SKIP() << "GPU-Assisted printf test requires vertexPipelineStoresAndAtomics and fragmentStoresAndAtomics";
+    }
+    ASSERT_NO_FATAL_FAILURE(InitViewport());
+    ASSERT_NO_FATAL_FAILURE(InitRenderTarget());
+
+    if (IsPlatform(kMockICD)) {
+        GTEST_SKIP() << "Test not supported by MockICD, GPU-Assisted validation test requires a driver that can draw";
+    }
+    // Make a uniform buffer to be passed to the shader that contains the test number
+    uint32_t qfi = 0;
+    VkBufferCreateInfo bci = LvlInitStruct<VkBufferCreateInfo>();
+    bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
+    bci.size = 8;
+    bci.queueFamilyIndexCount = 1;
+    bci.pQueueFamilyIndices = &qfi;
+
+    VkPushConstantRange push_constant_range = {VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(float) * 4};
+
+    const VkPipelineLayoutObj pipeline_layout(m_device, {}, {push_constant_range});
+
+    float push_constants[4] = {0.0, 1.0, 2.0, 3.0};
+
+    char const *shader_source = R"glsl(
+        #version 450
+        #extension GL_EXT_debug_printf : enable
+        layout(push_constant, std430) uniform foo { float x[4]; } constants;
+
+        void main() {
+            float myfloat = 3.1415f;
+            gl_Position = vec4(0.0, 0.0, 0.0, 0.0);
+            if (gl_VertexIndex == 0) {
+                debugPrintfEXT("Here are three float values %f, %f, %f", 1.0, myfloat, gl_Position.x);
+                float x = constants.x[0];
+                while(x > -1.f) { // infinite loop
+                    x += 0.000001f;
+                }
+                debugPrintfEXT("Here is a value that should not be printed %f", x);
+            }
+        }
+    )glsl";
+    char const *message = "Here are three float values 1.000000, 3.141500, 0.000000";
+
+    VkShaderObj vs(this, shader_source, VK_SHADER_STAGE_VERTEX_BIT, SPV_ENV_VULKAN_1_0, SPV_SOURCE_GLSL, nullptr, "main", true);
+
+    VkViewport viewport = m_viewports[0];
+    VkRect2D scissors = m_scissors[0];
+
+    VkSubmitInfo submit_info = LvlInitStruct<VkSubmitInfo>();
+    submit_info.commandBufferCount = 1;
+    submit_info.pCommandBuffers = &m_commandBuffer->handle();
+
+    VkPipelineObj pipe(m_device);
+    pipe.AddShader(&vs);
+    pipe.AddDefaultColorAttachment();
+    pipe.DisableRasterization();
+    VkResult err = pipe.CreateVKPipeline(pipeline_layout.handle(), renderPass());
+    ASSERT_VK_SUCCESS(err);
+
+    VkCommandBufferBeginInfo begin_info = LvlInitStruct<VkCommandBufferBeginInfo>();
+    VkCommandBufferInheritanceInfo hinfo = LvlInitStruct<VkCommandBufferInheritanceInfo>();
+    begin_info.pInheritanceInfo = &hinfo;
+
+    m_commandBuffer->begin(&begin_info);
+    m_commandBuffer->BeginRenderPass(m_renderPassBeginInfo);
+    vk::CmdBindPipeline(m_commandBuffer->handle(), VK_PIPELINE_BIND_POINT_GRAPHICS, pipe.handle());
+    vk::CmdPushConstants(m_commandBuffer->handle(), pipeline_layout.handle(), VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(push_constants),
+                         push_constants);
+    vk::CmdSetViewport(m_commandBuffer->handle(), 0, 1, &viewport);
+    vk::CmdSetScissor(m_commandBuffer->handle(), 0, 1, &scissors);
+    vk::CmdDraw(m_commandBuffer->handle(), 3, 1, 0, 0);
+    vk::CmdEndRenderPass(m_commandBuffer->handle());
+    m_commandBuffer->end();
+
+    m_errorMonitor->SetDesiredFailureMsg(kInformationBit, message);
+
+    err = vk::QueueSubmit(m_device->m_queue, 1, &submit_info, VK_NULL_HANDLE);
+    ASSERT_TRUE((err == VK_SUCCESS) || (err == VK_ERROR_DEVICE_LOST)) << vk_result_string(err);
+    if (err == VK_SUCCESS) {
+        err = vk::QueueWaitIdle(m_device->m_queue);
+        ASSERT_TRUE((err == VK_SUCCESS) || (err == VK_ERROR_DEVICE_LOST)) << vk_result_string(err);
+    }
+    m_errorMonitor->VerifyFound();
+}

From c0c5d42e390c7866978d19c2657ccc7cc5d9a2e0 Mon Sep 17 00:00:00 2001
From: Dorian R Apanel <dorian.r.apanel@intel.com>
Date: Mon, 17 Jul 2023 15:20:33 +0200
Subject: [PATCH 2/4] tests: Removed unused variables

---
 tests/unit/debug_printf.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tests/unit/debug_printf.cpp b/tests/unit/debug_printf.cpp
index 3fdef7e3751..8c56c38e1cd 100644
--- a/tests/unit/debug_printf.cpp
+++ b/tests/unit/debug_printf.cpp
@@ -1101,7 +1101,7 @@ TEST_F(NegativeDebugPrintf, UncachedBuffer) {
 
     // deviceCoherentMemory feature needs to be supported, but if not specified or enabled, Debug Printf will force enable it.
     auto dcm_features = LvlInitStruct<VkPhysicalDeviceCoherentMemoryFeaturesAMD>();
-    auto features2 = GetPhysicalDeviceFeatures2(dcm_features);
+    GetPhysicalDeviceFeatures2(dcm_features);
     if (dcm_features.deviceCoherentMemory == 0) {
         GTEST_SKIP() << "deviceCoherentMemory feature not supported";
     }
@@ -1121,13 +1121,6 @@ TEST_F(NegativeDebugPrintf, UncachedBuffer) {
     if (IsPlatform(kMockICD)) {
         GTEST_SKIP() << "Test not supported by MockICD, GPU-Assisted validation test requires a driver that can draw";
     }
-    // Make a uniform buffer to be passed to the shader that contains the test number
-    uint32_t qfi = 0;
-    VkBufferCreateInfo bci = LvlInitStruct<VkBufferCreateInfo>();
-    bci.usage = VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
-    bci.size = 8;
-    bci.queueFamilyIndexCount = 1;
-    bci.pQueueFamilyIndices = &qfi;
 
     VkPushConstantRange push_constant_range = {VK_SHADER_STAGE_VERTEX_BIT, 0, sizeof(float) * 4};
 

From cb05cb6e66af3ce56c4ada85ccea4436a359fcb9 Mon Sep 17 00:00:00 2001
From: Dorian R Apanel <dorian.r.apanel@intel.com>
Date: Wed, 19 Jul 2023 13:01:40 +0200
Subject: [PATCH 3/4] tests: Added VK_TIMEOUT handling

---
 tests/unit/debug_printf.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/unit/debug_printf.cpp b/tests/unit/debug_printf.cpp
index 8c56c38e1cd..fb5263eb4f2 100644
--- a/tests/unit/debug_printf.cpp
+++ b/tests/unit/debug_printf.cpp
@@ -1182,10 +1182,11 @@ TEST_F(NegativeDebugPrintf, UncachedBuffer) {
     m_errorMonitor->SetDesiredFailureMsg(kInformationBit, message);
 
     err = vk::QueueSubmit(m_device->m_queue, 1, &submit_info, VK_NULL_HANDLE);
-    ASSERT_TRUE((err == VK_SUCCESS) || (err == VK_ERROR_DEVICE_LOST)) << vk_result_string(err);
+    // Some drivers return VK_TIMEOUT (spec does not allow it).
+    ASSERT_TRUE((err == VK_SUCCESS) || (err == VK_ERROR_DEVICE_LOST) || (err == VK_TIMEOUT)) << vk_result_string(err);
     if (err == VK_SUCCESS) {
         err = vk::QueueWaitIdle(m_device->m_queue);
-        ASSERT_TRUE((err == VK_SUCCESS) || (err == VK_ERROR_DEVICE_LOST)) << vk_result_string(err);
+        ASSERT_TRUE((err == VK_SUCCESS) || (err == VK_ERROR_DEVICE_LOST) || (err == VK_TIMEOUT)) << vk_result_string(err);
     }
     m_errorMonitor->VerifyFound();
 }

From 4b956cfbc4e2e8c402b971c8e7cb077f29e2d697 Mon Sep 17 00:00:00 2001
From: Dorian R Apanel <dorian.r.apanel@intel.com>
Date: Fri, 21 Jul 2023 11:47:53 +0200
Subject: [PATCH 4/4] layers: Fix printf debug_output_buffer clear

---
 layers/gpu_validation/debug_printf.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/layers/gpu_validation/debug_printf.cpp b/layers/gpu_validation/debug_printf.cpp
index e1efaef7d2d..1cfbd9bc8f8 100644
--- a/layers/gpu_validation/debug_printf.cpp
+++ b/layers/gpu_validation/debug_printf.cpp
@@ -440,11 +440,10 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
     //    8         Printf Format String Id
     //    9         Printf Values Word 0 (optional)
     //    10         Printf Values Word 1 (optional)
-    uint32_t expect = debug_output_buffer[1];
+    uint32_t expect = debug_output_buffer[spvtools::kDebugOutputSizeOffset];
     // Total size of all messages are written by AtomicAdd. Atomics in uncached memory seems to be working in caches anyway
     // and are not flushed to uncached memory at the end. In that case, expect will contain zero.
     // As a WA just parse messages using individual sizes (written correctly).
-    // Can the messages be overridden because of those atomics?
     if (!expect && !use_uncached_buffer) return;
 
     uint32_t index = spvtools::kDebugOutputDataOffset;
@@ -554,7 +553,14 @@ void DebugPrintf::AnalyzeAndGenerateMessages(VkCommandBuffer command_buffer, VkQ
         LogWarning(device, "UNASSIGNED-DEBUG-PRINTF",
                    "WARNING - Debug Printf message was truncated, likely due to a buffer size that was too small for the message");
     }
-    memset(debug_output_buffer, 0, 4 * (debug_output_buffer[spvtools::kDebugOutputSizeOffset] + spvtools::kDebugOutputDataOffset));
+
+    if (use_uncached_buffer) {
+        // WA for atomics.
+        memset(debug_output_buffer, 0, output_buffer_size);
+    } else {
+        // Clear only written memory.
+        memset(debug_output_buffer, 0, sizeof(uint32_t) * (expect + spvtools::kDebugOutputDataOffset));
+    }
 }
 
 // For the given command buffer, map its debug data buffers and read their contents for analysis.