diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index d75a7a519254e..33056006410c8 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -3878,13 +3878,17 @@ Node& Graph::CreateFusedSubGraphNode(const IndexedSubGraph& sub_graph, const std
   int cur_idx = 0;
   for (const auto& arg_name : func_meta_def->inputs) {
-    input_args.push_back(GetNodeArg(arg_name));
+    // In some cases the NodeArgs need to be retrieved from ancestor graphs,
+    // e.g. when the subgraph being built here is a subgraph of the original graph
+    // and the NodeArgs of its outer-scope values are defined in the top-level original graph.
+    input_args.push_back(GetNodeArgIncludingParentGraphs(arg_name));
     input_indexes[arg_name] = cur_idx++;
   }

   cur_idx = 0;
   for (const auto& arg_name : func_meta_def->outputs) {
-    output_args.push_back(GetNodeArg(arg_name));
+    // In some cases the NodeArgs need to be retrieved from ancestor graphs.
+    output_args.push_back(GetNodeArgIncludingParentGraphs(arg_name));
     output_indexes[arg_name] = cur_idx++;
   }
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index fa6ad26cc248a..27226005a9c0b 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -658,6 +658,8 @@ struct ProviderHost {
   virtual std::unique_ptr<Node__EdgeIterator> Node__OutputEdgesEnd(const Node* p) noexcept = 0;
   virtual void Node__ForEachDef(const Node* p, std::function<void(const NodeArg&, bool is_input)> func, bool include_missing_optional_defs) = 0;
+  virtual const std::unordered_map<std::string, gsl::not_null<Graph*>>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0;
+  virtual std::unordered_map<std::string, gsl::not_null<const Graph*>> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0;

   // NodeArg
   virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0;
@@ -695,6 +697,8 @@ struct ProviderHost {
   virtual std::unique_ptr<ONNX_NAMESPACE::GraphProto> Graph__ToGraphProto(const Graph* p) = 0;
   virtual NodeArg& Graph__GetOrCreateNodeArg(Graph* p, const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) = 0;
+  virtual void Graph__AddOuterScopeNodeArg(Graph* p, const std::string& name) = 0;
+  virtual void Graph__SetInputs(Graph* p, gsl::span<const NodeArg* const> inputs) = 0;

   virtual Status Graph__Resolve(Graph* p) = 0;
   virtual void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) = 0;
@@ -708,10 +712,15 @@ struct ProviderHost {
   virtual const Node* Graph__ParentNode(const Graph* p) const = 0;
   virtual const Graph* Graph__ParentGraph(const Graph* p) const = 0;
+  virtual Graph* Graph__MutableParentGraph(Graph* p) = 0;
   virtual const std::string& Graph__Name(const Graph* p) const noexcept = 0;
   virtual const Path& Graph__ModelPath(const Graph* p) const = 0;
   virtual const std::vector<const NodeArg*>& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept = 0;
   virtual bool Graph__IsSubgraph(const Graph* p) = 0;
+  virtual int Graph__MaxNodeIndex(const Graph* p) const noexcept = 0;
+  virtual Node* Graph__GetNode(Graph* p, NodeIndex node_index) noexcept = 0;
+  virtual const Node* Graph__GetNode(const Graph* p, NodeIndex node_index) const = 0;
+  virtual const NodeArg* Graph__GetNodeArg(const Graph* p, const std::string& name) const = 0;

   // GraphViewer
   virtual void GraphViewer__operator_delete(GraphViewer* p) = 0;
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index e09c2c495886a..f0ab7869b7d50 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -653,6 +653,8 @@ struct Node final {
   EdgeConstIterator OutputEdgesEnd() const noexcept { return g_host->Node__OutputEdgesEnd(this); }
   void ForEachDef(std::function<void(const NodeArg&, bool is_input)> func, bool include_missing_optional_defs = false) const { g_host->Node__ForEachDef(this, func, std::move(include_missing_optional_defs)); }
+  const std::unordered_map<std::string, gsl::not_null<Graph*>>& GetAttributeNameToMutableSubgraphMap() { return g_host->Node__GetAttributeNameToMutableSubgraphMap(this); }
+  std::unordered_map<std::string, gsl::not_null<const Graph*>> GetAttributeNameToSubgraphMap() const { return g_host->Node__GetAttributeNameToSubgraphMap(this); }

   PROVIDER_DISALLOW_ALL(Node)
 };
@@ -707,6 +709,8 @@ struct Graph final {
   std::unique_ptr<ONNX_NAMESPACE::GraphProto> ToGraphProto() const { return g_host->Graph__ToGraphProto(this); }
   NodeArg& GetOrCreateNodeArg(const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) { return g_host->Graph__GetOrCreateNodeArg(this, name, p_arg_type); }
+  void AddOuterScopeNodeArg(const std::string& name) { g_host->Graph__AddOuterScopeNodeArg(this, name); }
+  void SetInputs(gsl::span<const NodeArg* const> inputs) { g_host->Graph__SetInputs(this, inputs); }

   Status Resolve() { return g_host->Graph__Resolve(this); }
   void AddInitializedTensor(const ONNX_NAMESPACE::TensorProto& tensor) { return g_host->Graph__AddInitializedTensor(this, tensor); }
@@ -721,10 +725,15 @@ struct Graph final {
   const Node* ParentNode() const { return g_host->Graph__ParentNode(this); }
   const Graph* ParentGraph() const { return g_host->Graph__ParentGraph(this); }
+  Graph* MutableParentGraph() { return g_host->Graph__MutableParentGraph(this); }
   const std::string& Name() const noexcept { return g_host->Graph__Name(this); }
   const Path& ModelPath() const { return g_host->Graph__ModelPath(this); }
   const std::vector<const NodeArg*>& GetInputsIncludingInitializers() const noexcept { return g_host->Graph__GetInputsIncludingInitializers(this); }
   bool IsSubgraph() const { return g_host->Graph__IsSubgraph(this); }
+  int MaxNodeIndex() const noexcept { return g_host->Graph__MaxNodeIndex(this); }
+  const Node* GetNode(NodeIndex node_index) const noexcept { return g_host->Graph__GetNode(this, node_index); }
+  Node* GetNode(NodeIndex node_index) noexcept { return g_host->Graph__GetNode(this, node_index); }
+  const NodeArg* GetNodeArg(const std::string& name) const { return g_host->Graph__GetNodeArg(this, name); }

   PROVIDER_DISALLOW_ALL(Graph)
 };
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 6142b6d393d7e..1567628a66829 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1283,6 +1283,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
     } else {
       auto model_build = graph.CreateModel(*GetLogger());
       auto& graph_build = model_build->MainGraph();
+      bool has_control_flow_op = false;

       // Add node and node args
       // If node output is also parent graph output, the output will be added to the
@@ -1321,6 +1322,10 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
           }
         }

+        if (control_flow_op_set_.find(node->OpType()) != control_flow_op_set_.end()) {
+          has_control_flow_op = true;
+        }
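+        // Setting has_control_flow_op here triggers the extra outer-scope-value handling
+        // (BuildSubGraphContext / SetGraphOuterScopeValuesAndInputs / SetAllGraphInputs)
+        // further below, before graph_build.Resolve() is called.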
+
         // If the node has a subgraph, it's possible that the ORT graph of that subgraph and the GraphProto in the node attributes are not in sync because of graph optimization.
         // Therefore, we need to force the GraphProto attributes to be updated in order to get a valid GraphProto.
         if (node->GetAttributes().size() > 0) {
@@ -1345,6 +1350,13 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
         }
       }

+      if (has_control_flow_op) {
+        LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Handle outer scope values for the subgraph " << graph_build.Name();
+        BuildSubGraphContext(graph_build);
+        SetGraphOuterScopeValuesAndInputs(graph_build, graph.GetGraph());
+        SetAllGraphInputs(graph_build);
+      }
+
       ORT_ENFORCE(graph_build.Resolve().IsOK());

       // Add parent graph output to the subgraph
@@ -1657,6 +1669,20 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
       std::iota(std::begin(subgraph_nodes_vector), std::end(subgraph_nodes_vector), 0);
       SubGraphCollection_t parser_subgraph_nodes_vector = {{subgraph_nodes_vector, false}};
       bool subgraph_early_termination = false;
+
+      // Another subgraph of the "If" control flow op has already been parsed by GetCapability, and all of that subgraph's nodes were assigned to the TRT EP.
+      if (AllNodesAssignedToSpecificEP(*sub_graph_veiwer, kTensorrtExecutionProvider)) {
+        all_subgraphs_are_supported = true;
+        break;
+      }
+      // Another subgraph of the "If" control flow op has already been parsed by GetCapability, but not all of that subgraph's nodes were assigned to the TRT EP.
+      // (Note: GetExecutionProviderType() returns "" when a node has not yet been assigned to any EP.)
+      else if (!AllNodesAssignedToSpecificEP(*sub_graph_veiwer, "")) {
+        all_subgraphs_are_supported = false;
+        break;
+      }
+
+      // Another subgraph of the "If" control flow op has not yet been parsed by GetCapability.
       subgraph_supported_nodes_vector = GetSupportedList(parser_subgraph_nodes_vector, 0, max_partition_iterations_, *sub_graph_veiwer, &subgraph_early_termination);
       all_subgraphs_are_supported = IsSubGraphFullySupported(subgraph_supported_nodes_vector, number_of_ort_subgraph_nodes);
       break;
@@ -1677,6 +1703,9 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
         }
       }
       LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";
+
+      // The context map is only used during EP compile time; release it to save memory.
+      subgraph_context_map_.clear();
       return result;
     }
   }
@@ -1700,6 +1729,8 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs;
   }

+  // The context map is only used during EP compile time; release it to save memory.
+  subgraph_context_map_.clear();
   return result;
 }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 56eda7ad83537..13c5eff08bc46 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -138,6 +138,15 @@ struct TensorrtFuncState {
   bool cuda_graph_enable = 0;
 };

+// Holds important information for building a valid ORT graph.
+struct SubGraphContext {
+  std::unordered_set<std::string> output_args;
+  std::unordered_map<std::string, const NodeArg*> inputs_and_initializers;
+  std::unordered_map<std::string, const NodeArg*> manually_added_graph_inputs;
+};
+
+using SubGraphContextMap = std::unordered_map<std::string, std::unique_ptr<SubGraphContext>>;
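+
+// For illustration (hypothetical subgraph): if a graph contains the single node
+// Add(x1, x2) -> y1 and no initializers, its SubGraphContext would hold
+//   output_args = {"y1"}, inputs_and_initializers = {"x1", "x2"},
+// while manually_added_graph_inputs stays empty unless outer-scope values have to be
+// promoted to explicit graph inputs (see SetGraphOuterScopeValuesAndInputs()).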
+
 // Logical device representation.
 class TensorrtExecutionProvider : public IExecutionProvider {
  public:
@@ -224,6 +233,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   const int min_num_runs_before_cuda_graph_capture_ = 1;  // required min regular runs before graph capture for the necessary memory allocations.

   std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
+  mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;
   std::unordered_map<std::string, std::unique_ptr<nvonnxparser::IParser>> parsers_;
   std::unordered_map<std::string, std::unique_ptr<nvinfer1::ICudaEngine>> engines_;
   std::unordered_map<std::string, std::unique_ptr<nvinfer1::IExecutionContext>> contexts_;
@@ -273,6 +283,42 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   /** Check whether all the nodes of the subgraph are supported */
   bool IsSubGraphFullySupported(SubGraphCollection_t supported_nodes_vector, const int number_of_ort_nodes) const;

+  /**
+   * Record the inputs, initializers and outputs of all subgraphs visited during TensorrtExecutionProvider::GetSupportedList()
+   * and save that information in the subgraph context data structure. It's useful for building a valid graph and
+   * making Graph::Resolve() happy, especially when dealing with nested control-flow-op graphs.
+   */
+  void BuildSubGraphContext(const Graph& build_graph) const;
+
+  /**
+   * Set outer-scope values for subgraphs and add those values as top-level graph inputs if needed.
+   */
+  void SetGraphOuterScopeValuesAndInputs(Graph& build_graph, const Graph& graph) const;
+
+  /**
+   * If ORT TRT manually sets graph inputs in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
+   * we have to manually set all the graph inputs in order to pass Graph::Resolve().
+   */
+  void SetAllGraphInputs(Graph& graph) const;
+
+  /**
+   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+   * Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again.
+   */
+  bool IsInputInitializerOrOutput(const Graph& graph, const std::string& name, bool check_ancestors) const;
+
+  /**
+   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+   * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again.
+   */
+  bool IsOuterScopeValue(const Graph& graph, const std::string& name) const;
+
+  /**
+   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+   * Graph::ResolveContext::IsLocalValue(). We have to implement this function again.
+   */
+  bool IsLocalValue(const Graph& graph, const std::string& name) const;
+
   bool IsGraphCaptureAllowed() const;
   void CaptureBegin();
   void CaptureEnd();
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
new file mode 100644
index 0000000000000..ecc72b1c65476
--- /dev/null
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
@@ -0,0 +1,230 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/shared_library/provider_api.h"
+#include "tensorrt_execution_provider.h"
+#include <algorithm>
+
+namespace onnxruntime {
+
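+// Rough call sequence (as wired up in TensorrtExecutionProvider::GetSupportedList() when a
+// control flow op is detected), sketched here for orientation:
+//   1. BuildSubGraphContext(graph_build)                      - record outputs/inputs/initializers per graph
+//   2. SetGraphOuterScopeValuesAndInputs(graph_build, graph)  - resolve outer-scope values; promote the
+//                                                               missing ones to top-level graph inputs
+//   3. SetAllGraphInputs(graph_build)                         - re-declare the full input list so that
+//                                                               Graph::Resolve() accepts the added inputs
+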
+// The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again.
+bool TensorrtExecutionProvider::IsInputInitializerOrOutput(const Graph& graph,
+                                                           const std::string& name,
+                                                           bool check_ancestors) const {
+  const Graph* parent_graph = nullptr;
+  return IsLocalValue(graph, name) ||
+         (check_ancestors && (parent_graph = graph.ParentGraph()) != nullptr &&
+          IsInputInitializerOrOutput(*parent_graph, name, check_ancestors));
+}
+
+// The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+// Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again.
+bool TensorrtExecutionProvider::IsOuterScopeValue(const Graph& graph,
+                                                  const std::string& name) const {
+  const Graph* parent_graph = nullptr;
+  return (parent_graph = graph.ParentGraph()) != nullptr &&
+         IsInputInitializerOrOutput(*parent_graph, name, true);
+}
+
+// The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+// Graph::ResolveContext::IsLocalValue(). We have to implement this function again.
+bool TensorrtExecutionProvider::IsLocalValue(const Graph& graph,
+                                             const std::string& name) const {
+  if (subgraph_context_map_.find(graph.Name()) == subgraph_context_map_.end()) {
+    return false;
+  }
+  SubGraphContext* context = subgraph_context_map_.at(graph.Name()).get();
+  return context->output_args.find(name) != context->output_args.cend() ||
+         context->inputs_and_initializers.find(name) != context->inputs_and_initializers.cend();
+}
+
+/**
+ * Record the inputs, initializers and outputs of all subgraphs visited during TensorrtExecutionProvider::GetSupportedList()
+ * and save that information in the subgraph context data structure. It's useful for building a valid graph and
+ * making Graph::Resolve() happy, especially when dealing with nested control-flow-op graphs.
+ */
+void TensorrtExecutionProvider::BuildSubGraphContext(const Graph& graph) const {
+  // Iterate over all the nodes and recurse into the innermost subgraphs first
+  for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
+    auto node = graph.GetNode(i);
+    if (node == nullptr) {
+      continue;
+    }
+
+    auto subgraph_map = node->GetAttributeNameToSubgraphMap();
+    for (auto& entry : subgraph_map) {
+      const Graph* subgraph = entry.second;
+      BuildSubGraphContext(*subgraph);
+    }
+  }
+
+  // The subgraph context has been built before; no need to do it again
+  if (subgraph_context_map_.find(graph.Name()) != subgraph_context_map_.end()) {
+    return;
+  }
+
+  subgraph_context_map_.emplace(graph.Name(), std::make_unique<SubGraphContext>());
+  SubGraphContext* context = subgraph_context_map_.at(graph.Name()).get();
+
+  // Collect the outputs of all the nodes
+  for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
+    auto node = graph.GetNode(i);
+    if (node == nullptr) {
+      continue;
+    }
+
+    for (const auto& output : node->OutputDefs()) {
+      context->output_args.insert(output->Name());
+    }
+  }
+
+  // Go through all the nodes' inputs
+  for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
+    auto node = graph.GetNode(i);
+    if (node == nullptr) {
+      continue;
+    }
+
+    for (const auto& input : node->InputDefs()) {
+      if (context->output_args.find(input->Name()) != context->output_args.end()) {
+        continue;
+      }
+      // This input arg is not the output of another node, so it must come from either a graph input or an initializer.
+      context->inputs_and_initializers[input->Name()] = input;
+    }
+  }
+}
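+
+// Illustrative walk-through (hypothetical names): for a graph with two nodes,
+// A: {x} -> {y} and B: {y, w} -> {z}, the passes above yield
+// output_args = {y, z} and inputs_and_initializers = {x, w},
+// since x and w are consumed but never produced inside this graph.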
+
+// Set outer-scope values for subgraphs and add those values as top-level graph inputs if needed.
+void TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(Graph& graph_build,
+                                                                  const Graph& graph) const {
+  // Iterate over all the nodes and recurse into the innermost subgraphs first, for both the newly-built graph and the original graph
+  for (int i = 0; i < graph_build.MaxNodeIndex(); ++i) {
+    auto graph_build_node = graph_build.GetNode(i);
+    if (graph_build_node == nullptr) {
+      continue;
+    }
+
+    auto graph_build_map = graph_build_node->GetAttributeNameToMutableSubgraphMap();
+    std::unordered_map<std::string, gsl::not_null<const Graph*>> subgraph_map;
+    const Node* graph_node = nullptr;
+
+    // Find the corresponding original graph node's subgraphs
+    for (int j = 0; j < graph.MaxNodeIndex(); ++j) {
+      if (graph.GetNode(j) && graph.GetNode(j)->Name() == graph_build_node->Name()) {
+        graph_node = graph.GetNode(j);
+        subgraph_map = graph_node->GetAttributeNameToSubgraphMap();
+        break;
+      }
+    }
+
+    for (auto& entry : graph_build_map) {
+      auto attr_name = entry.first;
+      Graph* subgraph_build = entry.second;
+      if (subgraph_map.find(attr_name) != subgraph_map.end()) {
+        // Recurse into the subgraph
+        const Graph* subgraph = subgraph_map.at(attr_name);
+        SetGraphOuterScopeValuesAndInputs(*subgraph_build, *subgraph);
+      }
+    }
+  }
+
+  // Start from the innermost subgraph and check whether its outer-scope values exist in the
+  // newly-built graph. If not, we need to add those outer-scope values as explicit inputs to the
+  // top level of the newly-built graph.
+  if (graph_build.ParentNode()) {
+    auto top_level_graph = &graph_build;
+    while (top_level_graph->MutableParentGraph()) {
+      top_level_graph = top_level_graph->MutableParentGraph();
+    }
+    if (subgraph_context_map_.find(top_level_graph->Name()) == subgraph_context_map_.end()) {
+      LOGS_DEFAULT(ERROR) << "[TensorRT EP] Can't find the top-level graph context. "
+                          << "Please check that BuildSubGraphContext() has built the graph context correctly.";
+      return;
+    }
+
+    SubGraphContext* context = subgraph_context_map_.at(top_level_graph->Name()).get();
+
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Subgraph name is " << graph_build.Name();
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Its parent node is " << graph.ParentNode()->Name();
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Its parent node's implicit inputs:";
+
+    // Iterate over all the implicit inputs to set outer-scope values for the newly-built subgraph
+    for (const auto& input : graph.ParentNode()->ImplicitInputDefs()) {
+      LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name();
+
+      // A node arg among the parent node's implicit inputs could be used by one of the parent node's other subgraphs;
+      // for example, the "If" op has two subgraphs. So we need to make sure that the node arg is used in the current subgraph only.
+      // (GetNodeArg searches for the specific node arg among all node args in the graph)
+      if (graph_build.GetNodeArg(input->Name())) {
+        graph_build.AddOuterScopeNodeArg(input->Name());
+        LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is used in this subgraph";
+
+        if (context &&
+            (context->manually_added_graph_inputs.find(input->Name()) != context->manually_added_graph_inputs.end())) {
+          LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " has already been added as an explicit input to the graph";
+          continue;
+        }
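+
+        // For illustration: in the nested-"If" test model added in this PR, x2 is a top-level
+        // graph input consumed inside the innermost else-branch subgraph. While that subgraph
+        // is being rebuilt, x2 has no outer-scope definition in the new graph and therefore
+        // has to be promoted to an explicit input of the newly-built graph, as done below.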
+        // Handle the case where this outer-scope value doesn't exist at any outer scope level of the
+        // newly-built graph (the newly-built graph is a subgraph of the original graph). We need to add
+        // the outer-scope value as an explicit input to the top level of the newly-built graph.
+        if (!IsOuterScopeValue(graph_build, input->Name())) {
+          const auto& name = input->Name();
+          auto graph_inputs_including_initializers = top_level_graph->GetInputsIncludingInitializers();
+          auto added_graph_input = std::find_if(graph_inputs_including_initializers.begin(),
+                                                graph_inputs_including_initializers.end(),
+                                                [&name](const NodeArg* entry) { return entry->Name() == name; });
+
+          if (added_graph_input == graph_inputs_including_initializers.end()) {
+            if (context) {
+              auto type_proto = ONNX_NAMESPACE::TypeProto::Create();
+              type_proto->copy_from(input->TypeAsProto());
+              auto& n_input = top_level_graph->GetOrCreateNodeArg(name, type_proto.get());
+              context->manually_added_graph_inputs[n_input.Name()] = &n_input;
+              LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << n_input.Name() << " is added as an explicit input to the newly-built graph";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// If ORT TRT manually sets graph inputs in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
+// we have to manually set all the graph inputs in order to pass Graph::Resolve().
+void TensorrtExecutionProvider::SetAllGraphInputs(Graph& graph) const {
+  // If ORT TRT didn't manually set graph inputs in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
+  // Graph::Resolve() will set the graph inputs in Graph::SetGraphInputsOutputs(), so there is no need to set the graph inputs here.
+  if (subgraph_context_map_.find(graph.Name()) == subgraph_context_map_.end() ||
+      subgraph_context_map_[graph.Name()]->manually_added_graph_inputs.empty()) {
+    return;
+  }
+
+  SubGraphContext* context = subgraph_context_map_[graph.Name()].get();
+  std::vector<const NodeArg*> graph_inputs_including_initializers;
+  std::unordered_set<std::string> graph_inputs_including_initializers_set;
+
+  for (const auto& entry : context->inputs_and_initializers) {
+    graph_inputs_including_initializers.push_back(entry.second);
+    graph_inputs_including_initializers_set.insert(entry.first);
+  }
+
+  for (const auto& entry : context->manually_added_graph_inputs) {
+    if (graph_inputs_including_initializers_set.find(entry.first) == graph_inputs_including_initializers_set.end()) {
+      graph_inputs_including_initializers.push_back(entry.second);
+      graph_inputs_including_initializers_set.insert(entry.first);
+    }
+  }
+
+  for (const auto& node_arg : graph.GetInputsIncludingInitializers()) {
+    if (graph_inputs_including_initializers_set.find(node_arg->Name()) == graph_inputs_including_initializers_set.end()) {
+      graph_inputs_including_initializers.push_back(node_arg);
+      graph_inputs_including_initializers_set.insert(node_arg->Name());
+    }
+  }
+
+  graph.SetInputs(graph_inputs_including_initializers);
+}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 4003f984beda7..be549f5e665a0 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -763,6 +763,8 @@ struct ProviderHostImpl : ProviderHost {
   std::unique_ptr<Node__EdgeIterator> Node__OutputEdgesEnd(const Node* p) noexcept override { return std::make_unique<Node__EdgeIterator_Impl>(p->OutputEdgesEnd()); }
   void Node__ForEachDef(const Node* p, std::function<void(const NodeArg&, bool is_input)> func, bool include_missing_optional_defs) override { p->ForEachDef(func, std::move(include_missing_optional_defs)); }
+  const std::unordered_map<std::string, gsl::not_null<Graph*>>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) noexcept override { return p->GetAttributeNameToMutableSubgraphMap(); }
+  std::unordered_map<std::string, gsl::not_null<const Graph*>> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); }

   // NodeArg (wrapped)
   const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); }
@@ -803,8 +805,10 @@ struct ProviderHostImpl : ProviderHost {
   // Graph (wrapped)
   std::unique_ptr<GraphViewer> Graph__CreateGraphViewer(const Graph* p) override { return std::make_unique<GraphViewer>(*p); }
   std::unique_ptr<ONNX_NAMESPACE::GraphProto> Graph__ToGraphProto(const Graph* p) override { return std::make_unique<ONNX_NAMESPACE::GraphProto>(p->ToGraphProto()); }
+  void Graph__SetInputs(Graph* p, gsl::span<const NodeArg* const> inputs) override { p->SetInputs(inputs); }
   NodeArg& Graph__GetOrCreateNodeArg(Graph* p, const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) override { return p->GetOrCreateNodeArg(name, p_arg_type); }
+  void Graph__AddOuterScopeNodeArg(Graph* p, const std::string& name) override { p->AddOuterScopeNodeArg(name); }

   Status Graph__Resolve(Graph* p) override { return p->Resolve(); }
   void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); }
@@ -820,10 +824,15 @@ struct ProviderHostImpl : ProviderHost {
   const Node* Graph__ParentNode(const Graph* p) const override { return p->ParentNode(); }
   const Graph* Graph__ParentGraph(const Graph* p) const override { return p->ParentGraph(); }
+  Graph* Graph__MutableParentGraph(Graph* p) override { return p->MutableParentGraph(); }
   const std::string& Graph__Name(const Graph* p) const noexcept override { return p->Name(); }
   const Path& Graph__ModelPath(const Graph* p) const override { return p->ModelPath(); }
   const std::vector<const NodeArg*>& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept override { return p->GetInputsIncludingInitializers(); }
   bool Graph__IsSubgraph(const Graph* p) override { return p->IsSubgraph(); }
+  int Graph__MaxNodeIndex(const Graph* p) const noexcept override { return p->MaxNodeIndex(); }
+  Node* Graph__GetNode(Graph* p, NodeIndex node_index) noexcept override { return p->GetNode(node_index); }
+  const Node* Graph__GetNode(const Graph* p, NodeIndex node_index) const override { return p->GetNode(node_index); }
+  const NodeArg* Graph__GetNodeArg(const Graph* p, const std::string& name) const override { return p->GetNodeArg(name); }

   // GraphViewer (wrapped)
   void GraphViewer__operator_delete(GraphViewer* p) override { delete p; }
diff --git a/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py
new file mode 100644
index 0000000000000..bf354ad9f9e10
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py
@@ -0,0 +1,259 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import unittest
+from copy import deepcopy
+from typing import Optional, Sequence, Tuple
+
+import numpy as np
+from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, checker, helper
+
+import onnxruntime as ort
+
+
+def make_vi_like(vi: ValueInfoProto, name: str) -> ValueInfoProto:
+    """Makes a copy of `vi` with a new name."""
+    new_vi = deepcopy(vi)
+    new_vi.name = name
+    return new_vi
+
+
+def make_optional_tensor_value_info(name: str, elem_type: int, shape: Sequence[int]) -> ValueInfoProto:
+    """Makes a `ValueInfoProto` with optional type."""
+    tensor_type_proto = helper.make_tensor_type_proto(
+        elem_type=elem_type,
+        shape=shape,
+    )
+    opt_type_proto = helper.make_optional_type_proto(tensor_type_proto)
+
+    vi = helper.make_tensor_value_info(name, elem_type, shape)
+    vi.type.CopyFrom(opt_type_proto)
+    return vi
+
+
+def make_optional_vi(vi: ValueInfoProto, name: Optional[str] = None) -> ValueInfoProto:
+    """Makes a copy of `vi` with optional type."""
+    name = name or vi.name + ".opt"
+    vi_type = vi.type.tensor_type
+    vi_shape = [d.dim_param if d.dim_param else d.dim_value for d in vi_type.shape.dim]
+    opt_vi = make_optional_tensor_value_info(name, vi_type.elem_type, vi_shape)
+    return opt_vi
+
+
+def make_const(vi: ValueInfoProto, name: str, value: int = 0) -> Tuple[ValueInfoProto, NodeProto, TensorProto]:
+    """Creates a constant 1D tensor from `vi`."""
+    const_vi = make_vi_like(vi, name)
+    const_shape = [d.dim_value for d in vi.type.tensor_type.shape.dim]
+    const_shape_tensor = helper.make_tensor(f"{name}.shape", TensorProto.INT64, [len(const_shape)], const_shape)
+    const_fill = helper.make_tensor(f"{name}.const.value", const_vi.type.tensor_type.elem_type, [1], [value])
+    const_node = helper.make_node(
+        "ConstantOfShape",
+        inputs=[const_shape_tensor.name],
+        outputs=[const_vi.name],
+        name=f"ConstantOfShape.{name}",
+        value=const_fill,
+    )
+    return const_vi, const_node, const_shape_tensor
+
+
+# This model contains three levels of nested control flow ops.
+# The innermost subgraphs consume outer-scope values (x2 and x3) that are inputs of the top-level graph.
+def make_opt_nested_greater_or_equal() -> ModelProto:
+    """
+    Creates a nested graph with (`optional(x1)`, `x2`, `x3`) tensor inputs.
+
+    `x3` is similar to an optional input with a default value of -1.
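+
+    Rough nesting structure of the graphs built below (sketch for orientation):
+
+        If.OptionalHasElement.x1
+        |-- then: add-x1-x2-subgraph                 (y = x1 + x2)
+        `-- else: x3-subgraph
+                  If.OptionalHasElement.x3           (condition: x3 >= 0)
+                  |-- then: add-x3-const-subgraph    (y = Expand(x3 + 2))
+                  `-- else: add-x2-const-subgraph    (y = x2 + 1)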
+ """ + # Inputs/outputs + x1_vi = helper.make_tensor_value_info("x1", TensorProto.FLOAT, [1, 2]) + x2_vi = helper.make_tensor_value_info("x2", TensorProto.FLOAT, [1, 2]) + x3_vi = helper.make_tensor_value_info("x3", TensorProto.FLOAT, [1]) + y_vi = helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 2]) + opt_x1_vi = make_optional_vi(x1_vi, name="x1.opt") + + # Add `x1` and `x2` subgraph + y1_vi = make_vi_like(y_vi, "x1.add.x2") + input_get_elem_node = helper.make_node( + "OptionalGetElement", + inputs=[opt_x1_vi.name], + outputs=[x1_vi.name], + name="OptionalGetElement.Input", + ) + add_node = helper.make_node("Add", inputs=[x1_vi.name, x2_vi.name], outputs=[y1_vi.name], name="Add_Op") + add_x1_x2_subgraph = helper.make_graph( + [input_get_elem_node, add_node], + name="add-x1-x2-subgraph", + inputs=[], + outputs=[y1_vi], + value_info=[opt_x1_vi, x1_vi, x2_vi], + ) + + # Add `x2` and const subgraph + y2_vi = make_vi_like(y_vi, "x2.add.const") + const_vi, const_node, const_shape_tensor = make_const(x1_vi, "x1.const", value=1) + add_const_node = helper.make_node( + "Add", + inputs=[const_vi.name, x2_vi.name], + outputs=[y2_vi.name], + name="Add_Const", + ) + add_x2_const_subgraph = helper.make_graph( + [const_node, add_const_node], + name="add-x2-const-subgraph", + inputs=[], + outputs=[y2_vi], + value_info=[const_vi, x2_vi], + initializer=[const_shape_tensor], + ) + + # Add `x3` and const subgraph + add_const_out_vi = make_vi_like(x3_vi, "out.1") + y3_vi = make_vi_like(y_vi, "x3.add.const") + const_vi, const_node, const_shape_tensor = make_const(x3_vi, "x3.const", value=2) + add_const_node = helper.make_node( + "Add", + inputs=[const_vi.name, x3_vi.name], + outputs=[add_const_out_vi.name], + name="Add_Const.1", + ) + expand_shape = helper.make_tensor(f"{add_const_out_vi}.shape", TensorProto.INT64, [2], [1, 2]) + expand_node = helper.make_node( + "Expand", + inputs=[add_const_out_vi.name, expand_shape.name], + outputs=[y3_vi.name], + name="Expand.out", + ) + add_x3_const_subgraph = helper.make_graph( + [const_node, add_const_node, expand_node], + name="add-x3-const-subgraph", + inputs=[], + outputs=[y3_vi], + value_info=[x3_vi, const_vi, add_const_out_vi], + initializer=[const_shape_tensor, expand_shape], + ) + + # Subgraph flow based on `x3` value + y3_if_vi = make_vi_like(y_vi, "x3.if.out") + x3_eq_vi, x3_const_node, x3_const_shape_tensor = make_const(x3_vi, "x3.equal", value=0) + x3_ge_vi = helper.make_tensor_value_info( + "x3_ge", + TensorProto.BOOL, + shape=[1], + ) + x3_ge_node = helper.make_node( + "GreaterOrEqual", + inputs=[x3_vi.name, x3_eq_vi.name], + outputs=[x3_ge_vi.name], + name="GreaterOrEqual.Target", + ) + x3_has_elem_vi = helper.make_tensor_value_info( + "x3_has_elem", + TensorProto.BOOL, + shape=[], # scalar + ) + x3_has_elem_node = helper.make_node( + "Squeeze", + inputs=[x3_ge_vi.name], + outputs=[x3_has_elem_vi.name], + name="Squeeze.x3", + ) + if_input_node = helper.make_node( + "If", + inputs=[x3_has_elem_vi.name], # condition + outputs=[y3_if_vi.name], + name="If.OptionalHasElement.x3", + then_branch=add_x3_const_subgraph, + else_branch=add_x2_const_subgraph, + ) + x3_subgraph = helper.make_graph( + [x3_const_node, x3_ge_node, x3_has_elem_node, if_input_node], + name="x3-subgraph", + inputs=[], + outputs=[y3_if_vi], + value_info=[x3_vi, x3_eq_vi, x3_ge_vi, x3_has_elem_vi], + initializer=[x3_const_shape_tensor], + ) + + # Construct main graph + x1_has_elem_vi = helper.make_tensor_value_info( + "x1_has_elem", + TensorProto.BOOL, + shape=[], # scalar + ) + 
+
+    # Construct main graph
+    x1_has_elem_vi = helper.make_tensor_value_info(
+        "x1_has_elem",
+        TensorProto.BOOL,
+        shape=[],  # scalar
+    )
+    x1_has_elem_node = helper.make_node(
+        "OptionalHasElement",
+        inputs=[opt_x1_vi.name],
+        outputs=[x1_has_elem_vi.name],
+        name="OptionalHasElement.x1",
+    )
+    if_input_node = helper.make_node(
+        "If",
+        inputs=[x1_has_elem_vi.name],  # condition
+        outputs=[y_vi.name],
+        name="If.OptionalHasElement.x1",
+        then_branch=add_x1_x2_subgraph,
+        else_branch=x3_subgraph,
+    )
+    graph = helper.make_graph(
+        [x1_has_elem_node, if_input_node],
+        "opt-graph",
+        [opt_x1_vi, x2_vi, x3_vi],
+        [y_vi],
+        value_info=[x1_has_elem_vi],
+    )
+
+    m = helper.make_model(
+        graph,
+        opset_imports=[
+            helper.make_opsetid("", 15),
+        ],
+    )
+
+    checker.check_model(m, full_check=True)
+
+    return m
+
+
+def test_nested_optional_greater_or_equal(use_trt: bool = False) -> None:
+    m = make_opt_nested_greater_or_equal()
+
+    providers = ["CUDAExecutionProvider"]
+    if use_trt:
+        providers.insert(0, "TensorrtExecutionProvider")
+    session = ort.InferenceSession(
+        m.SerializeToString(),
+        providers=providers,
+    )
+
+    x1_name, x2_name, x3_name = (i.name for i in m.graph.input)
+    session.run(
+        [m.graph.output[0].name],
+        {
+            x1_name: None,
+            x2_name: np.ones((1, 2), dtype=np.float32),
+            x3_name: np.array([-1], dtype=np.float32),
+        },
+    )
+
+    return
+
+
+# ORT has a similar unit test, Test3LayerNestedSubgraph, whose 3-level nested graph consumes the same initializer in different subgraphs.
+# This unit test is slightly different: it also uses a 3-level nested graph, but it consumes outer-scope values
+# (the inputs of the top-level graph) in different subgraphs.
+class TestNestedControlFlowOpsGraph(unittest.TestCase):
+    # We currently only test the CUDA/TRT EPs because this issue has only been reported when using the CUDA/TRT EPs.
+    @unittest.skipIf(
+        "TensorrtExecutionProvider" not in ort.get_available_providers()
+        and "CUDAExecutionProvider" not in ort.get_available_providers(),
+        reason="Test CUDA/TRT EP only",
+    )
+    def test_3_level_control_flow_ops_graph(self):
+        if "CUDAExecutionProvider" in ort.get_available_providers():
+            test_nested_optional_greater_or_equal(use_trt=False)
+        if "TensorrtExecutionProvider" in ort.get_available_providers():
+            test_nested_optional_greater_or_equal(use_trt=True)
+
+
+if __name__ == "__main__":
+    unittest.main(module=__name__, buffer=True)
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 401c791bae6ac..85a3488fa8b60 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1788,6 +1788,11 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
         if not args.disable_ml_ops and not args.use_tensorrt:
             run_subprocess([sys.executable, "onnxruntime_test_python_mlops.py"], cwd=cwd, dll_path=dll_path)

+        if args.use_tensorrt:
+            run_subprocess(
+                [sys.executable, "onnxruntime_test_python_nested_control_flow_op.py"], cwd=cwd, dll_path=dll_path
+            )
+
         try:
             import onnx  # noqa: F401