Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
PatriceVignola committed Sep 25, 2024
1 parent 03b094b commit 5e15b59
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 23 deletions.
9 changes: 9 additions & 0 deletions include/onnxruntime/core/framework/execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,15 @@ class IExecutionProvider {
return default_device_;
};

/**
 * Return the appropriate OrtDevice object given OrtMemType for allocating graph inputs,
 * including initializers. By default it returns the same device as GetOrtDeviceByMemType,
 * but it can be overridden by execution providers that allocate graph inputs from a
 * different memory pool than regular kernel outputs.
 */
virtual OrtDevice GetOrtDeviceByMemTypeForGraphInput(OrtMemType mem_type) const {
  return GetOrtDeviceByMemType(mem_type);
}

Check warning on line 334 in include/onnxruntime/core/framework/execution_provider.h

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 You don't need a ; after a } [readability/braces] [4] Raw Output: include/onnxruntime/core/framework/execution_provider.h:334: You don't need a ; after a } [readability/braces] [4]

/**
* Create Preferred allocators for the current Execution Provider
* This function is a stateless function which creates new instances of Allocator, without storing them in EP.
Expand Down
26 changes: 4 additions & 22 deletions onnxruntime/core/framework/allocation_planner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -767,16 +767,7 @@ class PlannerImpl {

if (!is_implicit_input) {
OrtMemType mem_type = p_kernel_def->InputMemoryType(arg_idx);
auto ort_device = exec_provider->GetOrtDeviceByMemType(mem_type);

#ifdef USE_DML
// DML uses a different allocator for weights and inputs that allocates unpooled memory
if (p_kernel_def->Provider() == onnxruntime::kDmlExecutionProvider && mem_type == OrtMemType::OrtMemTypeDefault) {
ort_device = OrtDevice(ort_device.Type(), OrtDevice::MemType::DML_UNPOOLED, ort_device.Id());
}
#endif

plan_.SetLocation(static_cast<size_t>(index), ort_device);
plan_.SetLocation(static_cast<size_t>(index), exec_provider->GetOrtDeviceByMemTypeForGraphInput(mem_type));

Check warning on line 770 in onnxruntime/core/framework/allocation_planner.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Lines should be <= 120 characters long [whitespace/line_length] [2] Raw Output: onnxruntime/core/framework/allocation_planner.cc:770: Lines should be <= 120 characters long [whitespace/line_length] [2]
set_node_arg_has_explicit_consumer.insert(index);
} else { // implicit input
// Only process an implicit input if there are explicit consumers at this graph level
Expand Down Expand Up @@ -888,23 +879,14 @@ class PlannerImpl {
return Status::OK();
}

OrtDevice GetLocationForNodeInput(size_t input_index, const Node& node, const KernelCreateInfoMap& kernel_create_info_map) {
// Returns the device on which the given weight (initializer) input of `node` should be
// placed. Weights are not produced by any node, so the execution provider's graph-input
// device (which may differ from its regular kernel device, e.g. DML's unpooled heap) is
// the right location; CPU-pinned inputs go to the CPU-input device instead.
OrtDevice GetLocationForNodeWeightInput(size_t input_index, const Node& node,
                                        const KernelCreateInfoMap& kernel_create_info_map) {
  auto* p_provider = execution_providers_.Get(node);
  ORT_ENFORCE(p_provider);

  const KernelCreateInfo& kernel_create_info = GetKernelCreateInfo(kernel_create_info_map, node.Index());

  // Weights are not output from any node, so it's OK to put their location on the CPU
  // provider when the kernel declares the input as CPU-resident.
  const OrtMemType mem_type =
      utils::IsInputOnCpu(node, &kernel_create_info, input_index) ? OrtMemTypeCPUInput : OrtMemTypeDefault;
  return p_provider->GetOrtDeviceByMemTypeForGraphInput(mem_type);
}

std::vector<std::pair<int, int>> GetAliasMap(const Node& node, const KernelCreateInfo& kernel_create_info) {
Expand Down Expand Up @@ -1000,7 +982,7 @@ class PlannerImpl {
// (subgraphs) is okay and utils::CopyInputsAcrossDevices() will take it to
// the right device before subgraph execution.
locations[wt_index].emplace_back(
GetLocationForNodeInput(node_input_index, node, kernel_create_info_map));
GetLocationForNodeWeightInput(node_input_index, node, kernel_create_info_map));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ namespace Dml
bool enableGraphCapture,
bool enableSyncSpinning,
bool disableMemoryArena) :
IExecutionProvider(onnxruntime::kDmlExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0))
IExecutionProvider(onnxruntime::kDmlExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)),
m_unpooledDevice(OrtDevice::GPU, OrtDevice::MemType::DML_UNPOOLED, 0)
{
D3D12_COMMAND_LIST_TYPE queueType = executionContext->GetCommandListTypeForQueue();
if (queueType != D3D12_COMMAND_LIST_TYPE_DIRECT && queueType != D3D12_COMMAND_LIST_TYPE_COMPUTE)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -352,8 +352,19 @@ namespace Dml
return m_impl->ReplayGraph(graph_annotation_id);
}

// DML allocates graph inputs (including initializers) from an unpooled heap rather than
// the pooled default allocator, so default-memory graph inputs are redirected to the
// dedicated unpooled device; all other memory types fall back to the base behavior.
OrtDevice GetOrtDeviceByMemTypeForGraphInput(OrtMemType mem_type) const final
{
    if (mem_type == OrtMemTypeDefault)
    {
        return m_unpooledDevice;
    }

    return GetOrtDeviceByMemType(mem_type);
}

private:
ComPtr<ExecutionProviderImpl> m_impl;
const OrtDevice m_unpooledDevice;
};

} // namespace Dml

0 comments on commit 5e15b59

Please sign in to comment.