Npu allocator #437

Closed
wants to merge 39 commits into from
Changes from all commits (39 commits)
89094a8
Prototype shared memory allocator on Windows using OV-EP
javier-intel Jul 18, 2024
2e4b205
Partially working allocator.
ericcraw Aug 23, 2024
63e8aee
Hard code onnx perf to use RT NPU allocator for inputs
ericcraw Aug 24, 2024
cd88b0c
Fix allocation lookups coming from different level zero contexts
ericcraw Aug 26, 2024
89127f0
Page align OV allocation
ericcraw Aug 26, 2024
d43219f
Allocate input as WC
ericcraw Aug 26, 2024
274e6af
Only set tensors when they have changed.
ericcraw Aug 26, 2024
6feae84
Revert "Allocate input as WC"
ericcraw Aug 26, 2024
c1f3b3e
Hard code onnx perf to use RT NPU for outputs
ericcraw Aug 27, 2024
fea4752
Merge branch 'microsoft:main' into ovep-release-lnl-1.2
sfatimar Aug 27, 2024
e19f326
fix: Fixed model_proto serialized dump in Debug
ankitm3k Aug 27, 2024
524d766
Merge pull request #428 from intel/ankit/debug_fixes_ovep_lnl_1.2
sfatimar Aug 27, 2024
1e3dadd
Revert "Hard code onnx perf to use RT NPU for outputs"
ericcraw Aug 27, 2024
61a2d4a
Hard code onnx perf to use RT NPU for outputs fixed
ericcraw Aug 27, 2024
5800966
Fix onnx_perf_test app crash on tensor destroy
ericcraw Aug 27, 2024
075b14d
Upgrade Openvino version to 2024.3.0
jatinwadhwa921 Aug 28, 2024
59ba9c7
Merge pull request #433 from intel/jatin/upgarde_ov_to_2024_3
sfatimar Aug 28, 2024
5a3c793
Merge remote-tracking branch 'ericcraw/ericcraw/ort_allocator_hacking…
saurabhkale17 Aug 28, 2024
a7f19aa
refactor: remove redundant ort_shape_to_ovshape lambda function
saurabhkale17 Aug 28, 2024
20bca3b
alocate buffer in NPU visible region from perf test application
saurabhkale17 Aug 29, 2024
df617dd
remove redundant code
saurabhkale17 Aug 29, 2024
331679f
fix: Fixed model_proto serialized dump in Debug
ankitm3k Aug 27, 2024
6ed4988
Prototype shared memory allocator on Windows using OV-EP
javier-intel Jul 18, 2024
92652ed
Partially working allocator.
ericcraw Aug 23, 2024
2a06f44
Hard code onnx perf to use RT NPU allocator for inputs
ericcraw Aug 24, 2024
b83f8ac
Fix allocation lookups coming from different level zero contexts
ericcraw Aug 26, 2024
e812ca6
Page align OV allocation
ericcraw Aug 26, 2024
077881a
Allocate input as WC
ericcraw Aug 26, 2024
0060915
Only set tensors when they have changed.
ericcraw Aug 26, 2024
1468d38
Revert "Allocate input as WC"
ericcraw Aug 26, 2024
2334215
Hard code onnx perf to use RT NPU for outputs
ericcraw Aug 27, 2024
97f9e64
Revert "Hard code onnx perf to use RT NPU for outputs"
ericcraw Aug 27, 2024
6517a12
Hard code onnx perf to use RT NPU for outputs fixed
ericcraw Aug 27, 2024
3c2a997
Fix onnx_perf_test app crash on tensor destroy
ericcraw Aug 27, 2024
abe9f67
refactor: remove redundant ort_shape_to_ovshape lambda function
saurabhkale17 Aug 28, 2024
94b55a7
alocate buffer in NPU visible region from perf test application
saurabhkale17 Aug 29, 2024
966c48a
remove redundant code
saurabhkale17 Aug 29, 2024
a6004c5
add command line parameter in perf test for using remote tensors
saurabhkale17 Aug 29, 2024
ef44c87
add command line parameter in perf test for using remote tensors
saurabhkale17 Aug 29, 2024
9 changes: 9 additions & 0 deletions include/onnxruntime/core/framework/allocator.h
@@ -50,6 +50,15 @@ constexpr const char* HIP = "Hip";
constexpr const char* HIP_PINNED = "HipPinned";
constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";
constexpr const char* OpenVINO_NPU = "OpenVINO_RT_NPU";

Reviewer comment: OpenVINO_NPU is a redefinition of OpenVINO_RT_NPU. Remove it if it is not referenced in the code.

Author reply: It is not referenced, so it has been removed in the new PR.


// application
// 1. Allocate with ORT::CreateTensor("<custom_allocator_tag>")
// 2. "Manual" allocation

constexpr const char* OpenVINO_RT = "OpenVINO_RT";
constexpr const char* OpenVINO_RT_NPU = "OpenVINO_RT_NPU";
constexpr const char* WIN32_HANDLE = "WIN32_HANDLE";
constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer";

constexpr size_t kAllocAlignment = 256;
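For context, a minimal application-side sketch of option 1 in the comment above (allocating through ORT with the new allocator name). This is an illustration only: the model path, input name, and shape are placeholders, and it assumes the OpenVINO execution provider has already been appended to the session options so that the EP's preferred NPU allocator (added later in this PR) is registered with the session.

```cpp
// Sketch only: allocate an input tensor from the NPU-visible allocator registered by this PR.
// "model.onnx", "input" and the shape are placeholder values for illustration.
#include <array>
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  // Assumes the OpenVINO EP (device_type "NPU") has been appended via the provider options API.
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);

  // The memory-info name must match the allocator name this PR registers ("OpenVINO_RT_NPU").
  Ort::MemoryInfo mem_info("OpenVINO_RT_NPU", OrtDeviceAllocator, 0, OrtMemTypeCPUInput);
  // Assumption: with CreatePreferredAllocators in place, this resolves to the EP's NPU allocator.
  Ort::Allocator npu_allocator(session, mem_info);

  std::array<int64_t, 4> shape{1, 3, 224, 224};
  Ort::Value input = Ort::Value::CreateTensor<float>(npu_allocator, shape.data(), shape.size());
  // Fill input.GetTensorMutableData<float>() and run the session; because the buffer came from
  // the OpenVINO_RT_NPU allocator, the EP can hand it to OpenVINO without an extra copy.
  return 0;
}
```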
4 changes: 4 additions & 0 deletions onnxruntime/core/framework/allocator.cc
@@ -145,6 +145,10 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA
*out = new OrtMemoryInfo(
name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::OpenVINO_RT_NPU) == 0) {
*out = new OrtMemoryInfo(
name1, type, OrtDevice(OrtDevice::NPU, OrtDevice::MemType::DEFAULT, static_cast<OrtDevice::DeviceId>(id1)), id1,
mem_type1);
} else if (strcmp(name1, onnxruntime::CUDA_PINNED) == 0) {
*out = new OrtMemoryInfo(
onnxruntime::CUDA_PINNED, type, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, static_cast<OrtDevice::DeviceId>(id1)),
124 changes: 101 additions & 23 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -48,14 +48,6 @@
// Set the inference_num_threads property of the CPU
SetNumThreads(device_config);

#ifndef NDEBUG
if (IsDebugEnabled()) {
std::string file_name = subgraph_context.subgraph_name + "_static.onnx";
std::fstream outfile(file_name, std::ios::out | std::ios::trunc | std::ios::binary);
model_proto.SerializeToOstream(outfile);
}
#endif

try {
std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;

@@ -295,16 +287,99 @@
ORT_THROW(msg);
}
} else {
OVTensorPtr graph_input_blob;
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));
auto allocator_name = tensor.GetTensorMemoryInfo().GetAllocatorName();
ov_tensor_data_t ov_tensor_key;
ort_tensor_key_t ort_tensor_key{tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_key = it->second;
} else {
// Does this make sense for both types of allocators?
auto input = ie_cnn_network_->get_parameters().at(input_idx);
ov_tensor_key.tensor_ptr = std::make_shared<ov::Tensor>(input->get_element_type(), input->get_shape(),
(void*)tensor.GetTensorRawData());

[cpplint notice, line 300] Using C-style cast. Use reinterpret_cast<void*>(...) instead [readability/casting] [4]
if (allocator_name == OpenVINO_RT_NPU) {
ov_tensor_key.copy_needed = false;
} else {
ov_tensor_key.copy_needed = true;
}
ort_ov_tensor_map.emplace(ort_tensor_key, ov_tensor_key);

try {
infer_request->SetTensor(input_name, ov_tensor_key.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
}

if (ov_tensor_key.copy_needed) {
const char* ort_tensor_data = tensor.GetTensorData<char>();
size_t tensor_data_size = ov_tensor_key.tensor_ptr->get_byte_size();
auto ort_batch_memory_offset = ort_tensor_data + tensor_data_size * batch_slice_idx;
std::memcpy(ov_tensor_key.tensor_ptr->data(), ort_batch_memory_offset, tensor_data_size);
}
}
input_idx++;
}

// Set the output blob as remote blob
auto graph_output_info = exe_network_.Get().outputs();
auto output_idx = 0;
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
bool output_name_found = false;
// using the output name retrieved from ONNX original to match with the output names returned by OV tensors
for (auto it = subgraph_context_.output_names.begin(); it != subgraph_context_.output_names.end(); ++it) {
onnx_output_name = it->first;
if (output_names.find(onnx_output_name) != output_names.end()) {
// Assigning the output_name
output_name = it->first;
output_name_found = true;
break;
}
}
if (!output_name_found) {
ORT_THROW(
log_tag +
"Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " +
onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names");
}

size_t batch_size = 1;
Ort::UnownedValue tensor = GetOutputTensor(context,
batch_size,
infer_request,
output_name,
subgraph_context_.output_names);
auto allocator_name = tensor.GetTensorMemoryInfo().GetAllocatorName();

ov_tensor_data_t ov_tensor_data;
ort_tensor_key_t ort_tensor_key{tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_data = it->second;
} else {
auto output = ie_cnn_network_->get_results().at(output_idx);
ov_tensor_data.tensor_ptr = std::make_shared<ov::Tensor>(output->get_element_type(), output->get_shape(),
(void*)tensor.GetTensorRawData());

[cpplint notice, line 366] Using C-style cast. Use reinterpret_cast<void*>(...) instead [readability/casting] [4]
if(allocator_name == OpenVINO_RT_NPU) {

[cpplint notice, line 367] Missing space before ( in if( [whitespace/parens] [5]
ov_tensor_data.copy_needed = false;
} else {
ov_tensor_data.copy_needed = true;
}
ort_ov_tensor_map.emplace(ort_tensor_key, ov_tensor_data);

try {
graph_input_blob = infer_request->GetTensor(input_name);
infer_request->SetTensor(output_name, ov_tensor_data.tensor_ptr);
} catch (const char* msg) {
ORT_THROW(msg);
}
FillInputBlob(std::move(graph_input_blob), batch_slice_idx, std::move(input_name), context, subgraph_context_);
}
input_idx++;
output_idx++;
}

// Start Async inference
infer_request->StartAsync();
} catch (const char* msg) {
@@ -430,7 +505,6 @@
auto graph_output_info = exe_network_.Get().outputs();
for (auto output_info_iter = graph_output_info.begin();
output_info_iter != graph_output_info.end(); ++output_info_iter) {
OVTensorPtr graph_output_blob;
auto output_names = output_info_iter->get_names();
std::string onnx_output_name;
std::string output_name;
@@ -454,20 +528,24 @@
" doesn't exist in the "
"list of OpenVINO output tensor names");
}
try {
graph_output_blob = infer_request->GetTensor(output_name);
} catch (const char* msg) {
ORT_THROW(msg);
}

size_t batch_size = 1;
Ort::UnownedValue output_tensor =
GetOutputTensor(context, batch_size, infer_request, std::move(output_name), subgraph_context_.output_names);
auto mem_info = output_tensor.GetTensorMemoryInfo();
if (mem_info.GetAllocatorName() == OpenVINO_GPU) {

Reviewer comment: Check if this has an effect on the OpenVINO_GPU IOBuffer path.

return;
auto allocator_name = output_tensor.GetTensorMemoryInfo().GetAllocatorName();
ov_tensor_data_t ov_tensor_data;

Reviewer comment: Check whether the ov_tensor_data declaration in StartAsyncInference is redundant.

Author reply: The ov_tensor_data is required to create the input/output tensors before inference in StartAsyncInference.

ort_tensor_key_t ort_tensor_key{output_tensor.GetTensorRawData(), allocator_name};
if (const auto& it = ort_ov_tensor_map.find(ort_tensor_key); it != ort_ov_tensor_map.end()) {
ov_tensor_data = it->second;
} else {
size_t batch_slice = 0;
FillOutputBlob(std::move(graph_output_blob), output_tensor, batch_slice);
ORT_THROW(log_tag + "Expected all outputs to have associated OV::Tensor's");
}

if (ov_tensor_data.copy_needed) {
auto ort_tensor_data = output_tensor.GetTensorMutableData<char>();
size_t tensor_data_size = ov_tensor_data.tensor_ptr->get_byte_size();
auto ort_batch_memory_offset = ort_tensor_data /*+ tensor_data_size * batch_size*/;
std::memcpy(ort_batch_memory_offset, ov_tensor_data.tensor_ptr->data(), tensor_data_size);
}
}

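The input and output handling above follows one pattern: an ov::Tensor wrapping the ORT buffer is created once per (raw data pointer, allocator name) key, cached in ort_ov_tensor_map, and bytes are copied only when the buffer did not come from the OpenVINO_RT_NPU allocator. A simplified, self-contained sketch of that bookkeeping follows; OvTensorStub stands in for ov::Tensor, and the helper name is illustrative, not an EP function.

```cpp
// Simplified illustration of the caching scheme used in StartAsyncInference/CompleteAsyncInference.
// OvTensorStub stands in for ov::Tensor; only the lookup and copy_needed decision is shown.
#include <map>
#include <memory>
#include <string>
#include <utility>

struct OvTensorStub { /* would wrap the ORT buffer as an ov::Tensor */ };

struct ov_tensor_data_t {
  std::shared_ptr<OvTensorStub> tensor_ptr;
  bool copy_needed;
};

using ort_tensor_key_t = std::pair<const void*, const std::string>;

ov_tensor_data_t& GetOrCreateOvTensor(std::map<ort_tensor_key_t, ov_tensor_data_t>& cache,
                                      const void* ort_data, const std::string& allocator_name) {
  ort_tensor_key_t key{ort_data, allocator_name};
  auto it = cache.find(key);
  if (it == cache.end()) {
    ov_tensor_data_t entry;
    entry.tensor_ptr = std::make_shared<OvTensorStub>();        // real code wraps ort_data here
    entry.copy_needed = (allocator_name != "OpenVINO_RT_NPU");  // zero-copy only for NPU buffers
    it = cache.emplace(key, std::move(entry)).first;            // SetTensor happens once, on insert
  }
  return it->second;
}
```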
9 changes: 9 additions & 0 deletions onnxruntime/core/providers/openvino/backends/basic_backend.h
@@ -11,6 +11,7 @@
#include <string>
#include <condition_variable>
#include <mutex>
#include <map>

#include "core/session/onnxruntime_cxx_api.h"
#include "core/providers/openvino/contexts.h"
@@ -20,6 +21,11 @@
namespace onnxruntime {
namespace openvino_ep {

struct ov_tensor_data_t {
OVTensorPtr tensor_ptr;
bool copy_needed;
};

class InferRequestsQueue;
class BasicBackend : public IBackend {
public:
@@ -60,6 +66,9 @@
#if defined IO_BUFFER_ENABLED
OVRemoteContextPtr remote_context_;
#endif

using ort_tensor_key_t = std::pair<const void *, const std::string>;

[cpplint notice, line 70] Add #include <utility> for pair<> [build/include_what_you_use] [4]
std::map<ort_tensor_key_t, ov_tensor_data_t> ort_ov_tensor_map;
};

class InferRequestsQueue {
13 changes: 13 additions & 0 deletions onnxruntime/core/providers/openvino/openvino_execution_provider.cc
@@ -10,6 +10,7 @@
#include "core/providers/openvino/onnx_ctx_model_helper.h"
#include "core/providers/openvino/ov_versions/capability.h"
#include "openvino/core/version.hpp"
#include "core/providers/openvino/ov_allocator.h"

#define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz))

@@ -180,4 +181,16 @@
return Status::OK();
}

std::vector<AllocatorPtr> OpenVINOExecutionProvider::CreatePreferredAllocators() {
AllocatorCreationInfo npu_allocator_info {
[this](OrtDevice::DeviceId device_id) {
return std::make_unique<OVRTAllocator>(global_context_->ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU);

[cpplint notice, line 187] Lines should be <= 120 characters long [whitespace/line_length] [2]
},
0,
};

// fill in allocator
return std::vector<AllocatorPtr>{CreateAllocator(npu_allocator_info)};
}

} // namespace onnxruntime
onnxruntime/core/providers/openvino/openvino_execution_provider.h
@@ -190,6 +190,8 @@ class OpenVINOExecutionProvider : public IExecutionProvider {
return nullptr;
}

std::vector<AllocatorPtr> CreatePreferredAllocators() override;

private:
std::unique_ptr<openvino_ep::GlobalContext> global_context_;
openvino_ep::EPCtxHandler ep_ctx_handle_{};
54 changes: 54 additions & 0 deletions onnxruntime/core/providers/openvino/ov_allocator.cc
@@ -0,0 +1,54 @@
// Copyright (C) Intel Corporation
// Licensed under the MIT License

#include "core/providers/openvino/ov_allocator.h"
#include "core/providers/openvino/ov_interface.h"
#include "openvino/runtime/intel_npu/level_zero/level_zero.hpp"
#include "openvino/runtime/intel_npu/properties.hpp"

namespace onnxruntime {

using namespace openvino_ep;

[cpplint notice, line 11] Do not use namespace using-directives. Use using-declarations instead. [build/namespaces] [5]

constexpr size_t default_alignment = 4096;

static inline size_t align_up(size_t size, size_t pow2_alignment) {
return (size + pow2_alignment - 1) & ~(pow2_alignment - 1);
}

OVRTAllocator::OVRTAllocator(ov::Core& core, OrtDevice::DeviceType device_type, OrtDevice::DeviceId device_id, const char* name) : IAllocator(OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(device_type, OrtDevice::MemType::DEFAULT, device_id), device_id, OrtMemTypeCPUInput)), core_(core) {

[cpplint notice, line 19] Lines should be <= 120 characters long [whitespace/line_length] [2]
if (device_type == OrtDevice::NPU) {
remote_ctx_ = core_.get_default_context("NPU").as<ov::intel_npu::level_zero::ZeroContext>();
} else {
ORT_THROW("Invalid device type");
}
}

void* OVRTAllocator::Alloc(size_t size) {
try {
size_t alloc_size = align_up(size + sizeof(ov::Tensor*) + default_alignment, default_alignment);
ov::Tensor* tensor = new ov::Tensor(remote_ctx_.create_host_tensor(ov::element::Type_t::u8,
{ alloc_size }));
uintptr_t data_ptr = reinterpret_cast<uintptr_t>(tensor->data());

ov::Tensor** ptr = reinterpret_cast<ov::Tensor**>(align_up(data_ptr + sizeof(ov::Tensor*), default_alignment));
ptr[-1] = tensor;

return reinterpret_cast<void*>(ptr);

[cpplint notice, line 38] Redundant blank line at the end of a code block should be deleted. [whitespace/blank_line] [3]
} catch (const ov::Exception& e) {
ORT_THROW(std::string("Alloc failed: ") + e.what());
}
return nullptr;
}

void OVRTAllocator::Free(void* p) {
try {
ov::Tensor** ptr = reinterpret_cast<ov::Tensor**>(p);
delete ptr[-1];
} catch (const ov::Exception& e) {
ORT_THROW(std::string("Free failed: ") + e.what());

[cpplint notice, line 50] Add #include <string> for string [build/include_what_you_use] [4]
}
}

} // namespace onnxruntime
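Alloc above over-allocates a host tensor, returns a pointer aligned to default_alignment, and stashes the owning ov::Tensor* in the slot just before the returned pointer so that Free can recover and delete it. The following sketch shows the same header-pointer layout with plain heap allocations standing in for remote_ctx_.create_host_tensor(); it is illustrative only, not the allocator's actual code.

```cpp
// Sketch of the header-pointer layout used by OVRTAllocator, with plain new[]/delete[]
// standing in for the Level Zero host tensor. Function names here are illustrative.
#include <cstddef>
#include <cstdint>

constexpr size_t kAlignment = 4096;  // same role as default_alignment above

static inline size_t align_up(size_t size, size_t pow2_alignment) {
  return (size + pow2_alignment - 1) & ~(pow2_alignment - 1);
}

void* AlignedAllocWithHeader(size_t size) {
  // Reserve room for the stashed owner pointer plus alignment slack, as Alloc() does.
  size_t alloc_size = align_up(size + sizeof(void*) + kAlignment, kAlignment);
  char* raw = new char[alloc_size];  // stand-in for the ov::Tensor's data()
  uintptr_t data_ptr = reinterpret_cast<uintptr_t>(raw);
  // Align past the header slot; the owning allocation is recorded at ptr[-1].
  void** ptr = reinterpret_cast<void**>(align_up(data_ptr + sizeof(void*), kAlignment));
  ptr[-1] = raw;
  return ptr;
}

void AlignedFreeWithHeader(void* p) {
  void** ptr = reinterpret_cast<void**>(p);
  delete[] static_cast<char*>(ptr[-1]);  // recover the owner stashed by AlignedAllocWithHeader
}
```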
24 changes: 24 additions & 0 deletions onnxruntime/core/providers/openvino/ov_allocator.h
@@ -0,0 +1,24 @@
// Copyright (C) Intel Corporation
// Licensed under the MIT License

#pragma once

#include "core/common/inlined_containers.h"
#include "core/framework/allocator.h"
#include "openvino/runtime/remote_context.hpp"


namespace onnxruntime {

class OVRTAllocator : public IAllocator {
public:
OVRTAllocator(ov::Core &core, OrtDevice::DeviceType device_type, OrtDevice::DeviceId device_id, const char* name);
void* Alloc(size_t size) override;
void Free(void* p) override;

private:
ov::Core &core_;
ov::RemoteContext remote_ctx_;
};

} // namespace onnxruntime