
[TensorRT EP] Load precompiled TRT engine file directly #18217

Merged · 53 commits · Jan 12, 2024

Changes from 17 commits

Commits
f8099b1
update
chilo-ms Oct 31, 2023
ee81de5
fix bug
chilo-ms Nov 1, 2023
452a629
update
chilo-ms Nov 1, 2023
6f18d8d
remove redundant check
chilo-ms Nov 1, 2023
9430768
remove unused variable
chilo-ms Nov 1, 2023
e9507e5
add script to generate epcontext node
chilo-ms Nov 1, 2023
35a4d33
fix bug
chilo-ms Nov 1, 2023
44a7cc5
update
chilo-ms Nov 1, 2023
b2fdb06
update
chilo-ms Nov 2, 2023
eeb6552
refactor
chilo-ms Nov 2, 2023
1b5117d
update
chilo-ms Nov 2, 2023
df7ef46
update
chilo-ms Nov 3, 2023
993b2ad
change function name
chilo-ms Nov 3, 2023
34a86d7
Merge branch 'main' into chi/trt_engine_wrapper
chilo-ms Nov 3, 2023
9631f73
update
chilo-ms Nov 4, 2023
93f9fbb
update
chilo-ms Nov 5, 2023
d5974fc
refactor
chilo-ms Nov 6, 2023
7202b73
check compute capability
chilo-ms Nov 6, 2023
60f6e7e
Merge branch 'main' into chi/trt_engine_wrapper
chilo-ms Nov 6, 2023
5838143
update for reading engine byte data
chilo-ms Nov 6, 2023
aea26b1
add script to generate engine wrapper onnx model
chilo-ms Nov 7, 2023
f4b38f7
refactor
chilo-ms Nov 7, 2023
de9f510
fix bug
chilo-ms Nov 7, 2023
65331ed
fix format
chilo-ms Nov 7, 2023
befba02
refactor
chilo-ms Nov 7, 2023
5103dda
add unit test
chilo-ms Nov 7, 2023
f933379
refactor script
chilo-ms Nov 7, 2023
11fd212
fix format
chilo-ms Nov 7, 2023
561b059
update
chilo-ms Nov 9, 2023
789efe8
fix format
chilo-ms Nov 9, 2023
a5843c2
fix format
chilo-ms Nov 9, 2023
11ce3fc
fix gen_trt_engine_wrapper_onnx_model.py
chilo-ms Nov 10, 2023
f9206f3
refactor unit test
chilo-ms Nov 10, 2023
0bd5b8c
update
chilo-ms Nov 10, 2023
e2c3f16
Merge branch 'main' into chi/trt_engine_wrapper
chilo-ms Nov 19, 2023
6605fd4
update script
chilo-ms Nov 19, 2023
699d538
fix format
chilo-ms Nov 19, 2023
1cbe3e9
fix bug for conflict resolve
chilo-ms Nov 20, 2023
f8e775d
refactor script
chilo-ms Nov 20, 2023
8f7c7ac
generate ep context node model from TRT EP
chilo-ms Nov 23, 2023
77a62f2
add trt_dump_ep_context_model, trt_ep_context_embed_mode, trt_ep_cont…
chilo-ms Nov 23, 2023
04baad7
fix format
chilo-ms Nov 24, 2023
c3a028b
swap the position of CreateNodeComputeFromGraph and CreateNodeCompute…
chilo-ms Jan 8, 2024
db46b64
merge PR 18879 and PR 18834
chilo-ms Jan 9, 2024
842cdf0
merge PR 18879 and PR 18834 (continue)
chilo-ms Jan 9, 2024
ca8d49f
Merge branch 'main' into chi/trt_engine_wrapper
chilo-ms Jan 9, 2024
e0d3346
remove kernelcontext_setoutput
chilo-ms Jan 9, 2024
f9231a5
fix bugs after merge main
chilo-ms Jan 9, 2024
cebfcd8
apply lintrunner -a
chilo-ms Jan 10, 2024
b1c4305
Use 'hardware_architecture'
chilo-ms Jan 11, 2024
0435971
merge main and also enforce get compute capability once inside ep con…
chilo-ms Jan 11, 2024
77bf077
Merge branch 'main' into chi/trt_engine_wrapper
chilo-ms Jan 11, 2024
28bdd0a
remove unnecessary code
chilo-ms Jan 11, 2024
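
Several commits above introduce new TensorRT provider options (trt_dump_ep_context_model, trt_ep_context_embed_mode) for dumping a model that carries an EPContext node. Below is a minimal sketch of how these options might be set through the C API; the option keys are taken from the commit messages above, and the surrounding session setup is assumed rather than shown:

// Hedged sketch: enable engine caching and (assumption) EPContext model dumping.
// Error handling omitted for brevity; requires onnxruntime_c_api.h.
const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
OrtTensorRTProviderOptionsV2* trt_options = nullptr;
api->CreateTensorRTProviderOptions(&trt_options);
const char* keys[] = {"trt_engine_cache_enable", "trt_dump_ep_context_model", "trt_ep_context_embed_mode"};
const char* values[] = {"1", "1", "0"};  // embed_mode 0: engine kept as a separate cache file
api->UpdateTensorRTProviderOptions(trt_options, keys, values, 3);
// ... append to session options and create the session as usual ...
api->ReleaseTensorRTProviderOptions(trt_options);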
8 changes: 4 additions & 4 deletions include/onnxruntime/core/framework/op_kernel_context.h
@@ -186,6 +186,10 @@ class OpKernelContext {
*/
AllocatorPtr GetAllocator(const OrtDevice& device) const;

#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
Status SetOutputMLValue(int index, const OrtValue& ort_value);
#endif

protected:
OpKernelContext(concurrency::ThreadPool* threadpool, const logging::Logger& logger, Stream* stream);

@@ -195,10 +199,6 @@
const OrtValue* GetImplicitInputMLValue(int index) const;
OrtValue* GetOutputMLValue(int index);

#ifdef ENABLE_ATEN
Status SetOutputMLValue(int index, const OrtValue& ort_value);
#endif

// Creates the OrtValue* based on the shape, if it does not exist
virtual OrtValue* OutputMLValue(int index, const TensorShape& shape);

9 changes: 9 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -4512,6 +4512,15 @@ struct OrtApi {
* \since Version 1.17.
*/
ORT_API2_STATUS(ReadOpAttr, _In_ const OrtOpAttr* op_attr, _In_ OrtOpAttrType type, _Inout_ void* data, _In_ size_t len, _Out_ size_t* out);

/** \brief Used for custom operators to set an output of a kernel
*
* \see ::OrtCustomOp
*
* \since Version 1.17.
*/
ORT_API2_STATUS(KernelContext_SetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index,
_In_ const OrtValue* ort_value);
};

/*
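A minimal sketch of how a custom operator's compute callback might call the new KernelContext_SetOutput entry point to bind an existing OrtValue as an output without copying; the helper name and the cached value are hypothetical:

// Assumption: "cached_value" is an OrtValue the kernel already owns (e.g., a
// preallocated tensor); the new API binds it directly as output 0.
OrtStatusPtr BindCachedOutput(const OrtApi* api, OrtKernelContext* context, const OrtValue* cached_value) {
  return api->KernelContext_SetOutput(context, /*index=*/0, cached_value);
}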
1 change: 1 addition & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -2052,6 +2052,7 @@ struct KernelContext {
ConstValue GetInput(size_t index) const;
UnownedValue GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const;
UnownedValue GetOutput(size_t index, const std::vector<int64_t>& dims) const;
void SetOutput(size_t index, const OrtValue& ort_value);
void* GetGPUComputeStream() const;
Logger GetLogger() const;
OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const;
4 changes: 4 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -1634,6 +1634,10 @@ inline UnownedValue KernelContext::GetOutput(size_t index, const std::vector<int64_t>& dims) const {
return UnownedValue(out);
}

inline void KernelContext::SetOutput(size_t index, const OrtValue& ort_value) {
Ort::ThrowOnError(GetApi().KernelContext_SetOutput(ctx_, index, &ort_value));
}

inline void* KernelContext::GetGPUComputeStream() const {
void* out = nullptr;
Ort::ThrowOnError(GetApi().KernelContext_GetGPUComputeStream(ctx_, &out));
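And the equivalent through the C++ wrapper added above — a short hedged sketch, assuming a kernel that forwards a value it already holds:

// Hedged sketch: forward an already-materialized OrtValue as output 0 from a
// custom-op kernel; "engine_result" is hypothetical. Requires onnxruntime_cxx_api.h.
void ForwardCachedOutput(OrtKernelContext* raw_ctx, const OrtValue& engine_result) {
  Ort::KernelContext ctx(raw_ctx);
  ctx.SetOutput(0, engine_result);  // wraps the new KernelContext_SetOutput C API; throws on error
}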
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/execution_frame.cc
@@ -50,7 +50,7 @@ IExecutionFrame::IExecutionFrame(const OrtValueNameIdxMap& ort_value_idx_map,

IExecutionFrame::~IExecutionFrame() = default;

#ifdef ENABLE_ATEN
#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
Status IExecutionFrame::SetOutputMLValue(int index, const OrtValue& ort_value) {
int ort_value_idx = GetNodeIdxToMLValueIdx(index);
if (ort_value_idx == NodeIndexInfo::kInvalidEntry || static_cast<size_t>(ort_value_idx) >= all_values_size_) {
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/execution_frame.h
@@ -54,7 +54,7 @@ class IExecutionFrame {
const OrtValue* GetNodeInputOrOutputMLValue(int index) const;
OrtValue* GetMutableNodeInputOrOutputMLValue(int index);

#ifdef ENABLE_ATEN
#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
// Override the index-th output with ort_value
Status SetOutputMLValue(int index, const OrtValue& ort_value);
#endif
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/op_kernel.cc
@@ -186,7 +186,7 @@ AllocatorPtr OpKernelContext::GetAllocator(const OrtDevice& device) const {
return execution_frame_->GetAllocator(device);
}

#ifdef ENABLE_ATEN
#if defined(ENABLE_ATEN) || defined(USE_TENSORRT)
Status OpKernelContext::SetOutputMLValue(int index, const OrtValue& ort_value) {
if (index < 0 || index >= OutputCount()) {
return Status(common::ONNXRUNTIME, common::FAIL,
104 changes: 104 additions & 0 deletions onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc
@@ -0,0 +1,104 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <iostream>
#include <fstream>
#include <filesystem>

#include "onnx_ctx_model_helper.h"

namespace onnxruntime {

/*
 * Check whether the graph has the EP context contrib op.
 * The node can carry the precompiled engine info, which lets the TRT EP load the engine directly.
 *
 * Note: See contrib_defs.cc for more details about the "EPContext" contrib op.
 */
bool GraphHasCtxNode(const GraphViewer& graph_viewer) {
for (int i = 0; i < graph_viewer.MaxNodeIndex(); ++i) {
auto node = graph_viewer.GetNode(i);
if (node != nullptr && node->OpType() == EPCONTEXT_OP) {
return true;
}
}
return false;
}

const onnxruntime::Path& GetModelPath(const GraphViewer& graph_viewer) {
// find the top level graph
const Graph* cur_graph = &graph_viewer.GetGraph();
while (cur_graph->IsSubgraph()) {
cur_graph = cur_graph->ParentGraph();
}

const Graph& main_graph = *cur_graph;
return main_graph.ModelPath();
}

/*
 * Resolve the engine cache path relative to the directory of the ONNX model,
 * e.g. a model at "/models/net.onnx" with engine_cache_path "net.engine"
 * resolves to "/models/net.engine".
 */
std::filesystem::path LocateEngineRelativeToPath(const std::string& engine_cache_path, const onnxruntime::Path& path) {
  std::filesystem::path base_path(path.ToPathString());
  std::filesystem::path parent_path = base_path.parent_path();
  std::filesystem::path engine_path = parent_path.append(engine_cache_path);
  return engine_path;
}

Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) {
assert(graph_viewer.NumberOfNodes() == 1);
assert(graph_viewer.GetNode(0)->OpType() == EPCONTEXT_OP);
auto node = graph_viewer.GetNode(0);
auto& attrs = node->GetAttributes();

const int64_t embed_mode = attrs.at(EMBED_MODE).i();
if (embed_mode) {
  // Engine binary is embedded in the "ep_cache_context" attribute
  const std::string& context_binary = attrs.at(EP_CACHE_CONTEXT).s();
  *(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(context_binary.data(), context_binary.size()));
  if (!(*trt_engine_)) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                           "TensorRT EP could not deserialize engine from binary data");
  }
  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized engine from binary data";
} else {
  // Engine binary lives in a separate cache file pointed to by "ep_cache_context"
  std::ifstream engine_file(engine_cache_path_.string(), std::ios::binary | std::ios::in);
  engine_file.seekg(0, std::ios::end);
  size_t engine_size = engine_file.tellg();
  engine_file.seekg(0, std::ios::beg);
  std::unique_ptr<char[]> engine_buf{new char[engine_size]};
  engine_file.read(engine_buf.get(), engine_size);
  *(trt_engine_) = std::unique_ptr<nvinfer1::ICudaEngine>(trt_runtime_->deserializeCudaEngine(engine_buf.get(), engine_size));
  if (!(*trt_engine_)) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
                           "TensorRT EP could not deserialize engine from cache: " + engine_cache_path_.string());
  }
  LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized " + engine_cache_path_.string();
}
return Status::OK();
}

/*
 * Sanity check for the EP context contrib op.
 */
bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewer) {
assert(graph_viewer.NumberOfNodes() == 1);
assert(graph_viewer.GetNode(0)->OpType() == EPCONTEXT_OP);
auto node = graph_viewer.GetNode(0);
auto& attrs = node->GetAttributes();

// "embed_mode" attr and "ep_cache_context" attr should be present
if (attrs.count(EMBED_MODE) > 0 && attrs.count(EP_CACHE_CONTEXT) > 0) {
// ep_cache_context: payload of the execution provider context if embed_mode=1, or path to the context file if embed_mode=0
const int64_t embed_mode = attrs.at(EMBED_MODE).i();

// engine cache path
if (embed_mode == 0) {
// First assume engine cache path is relatvie to model path,
// If not, then assume the engine cache path is an absolute path.
engine_cache_path_ = LocateEngineRelativeToPath(attrs.at(EP_CACHE_CONTEXT).s(), GetModelPath(graph_viewer));
auto default_engine_cache_path_ = engine_cache_path_;
if (!std::filesystem::exists(engine_cache_path_)) {
engine_cache_path_.assign(attrs.at(EP_CACHE_CONTEXT).s());
if (!std::filesystem::exists(engine_cache_path_)) {
LOGS_DEFAULT(ERROR) << "Can't find " << default_engine_cache_path_.string() << " or " << engine_cache_path_.string() << " TensorRT engine";
return false;
}
}
}
}
return true;
}
} // namespace onnxruntime
42 changes: 42 additions & 0 deletions onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h
@@ -0,0 +1,42 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <string>
#include <filesystem>

#include "NvInfer.h"
#include "core/providers/shared_library/provider_api.h"

namespace onnxruntime {

static const std::string EPCONTEXT_OP = "EPContext";
static const std::string MAIN_CONTEXT = "main_context";
static const std::string EMBED_MODE = "embed_mode";
static const std::string EP_CACHE_CONTEXT = "ep_cache_context";
static const std::string EP_SDK_VER = "ep_sdk_version";
static const std::string PARTITION_NAME = "partition_name";
static const std::string SOURCE = "source";

bool GraphHasCtxNode(const GraphViewer& graph_viewer);
const onnxruntime::Path& GetModelPath(const GraphViewer& graph_viewer);
std::filesystem::path LocateEngineRelativeToPath(const std::string& engine_cache_path, const onnxruntime::Path& path);

class TensorRTCacheModelHandler {
public:
TensorRTCacheModelHandler(std::unique_ptr<nvinfer1::ICudaEngine>* trt_engine,
nvinfer1::IRuntime* trt_runtime) : trt_engine_(trt_engine), trt_runtime_(trt_runtime) {
}
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler);

bool ValidateEPCtxNode(const GraphViewer& graph_viewer);

Status GetEpContextFromGraph(const GraphViewer& graph_viewer);

private:
std::unique_ptr<nvinfer1::ICudaEngine>* trt_engine_;
nvinfer1::IRuntime* trt_runtime_;
std::filesystem::path engine_cache_path_;
};  // TensorRTCacheModelHandler
} // namespace onnxruntime
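
For context, a minimal sketch of how the TRT EP is expected to drive this handler when it encounters a fused graph consisting of an EPContext node; the surrounding provider state (the runtime pointer and the engine slot) and the helper's name are assumptions, not code from this PR:

// Hedged usage sketch: "trt_runtime" is the EP's nvinfer1::IRuntime*, and
// graph_viewer is the subgraph being compiled. GraphHasCtxNode(graph_viewer)
// is checked by the caller before taking this path.
Status LoadEngineFromEpContext(const GraphViewer& graph_viewer, nvinfer1::IRuntime* trt_runtime,
                               std::unique_ptr<nvinfer1::ICudaEngine>& trt_engine) {
  TensorRTCacheModelHandler handler(&trt_engine, trt_runtime);
  if (!handler.ValidateEPCtxNode(graph_viewer)) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "Invalid EPContext node");
  }
  return handler.GetEpContextFromGraph(graph_viewer);  // fills trt_engine on success
}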