diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index d75a7a519254e..33056006410c8 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -3878,13 +3878,17 @@ Node& Graph::CreateFusedSubGraphNode(const IndexedSubGraph& sub_graph, const std
   int cur_idx = 0;
   for (const auto& arg_name : func_meta_def->inputs) {
-    input_args.push_back(GetNodeArg(arg_name));
+    // In some cases the NodeArgs need to be retrieved from ancestor graphs,
+    // e.g. when the subgraph being built here is a subgraph of the original graph
+    // and the NodeArgs of its outer-scope values are defined in the top-level original graph.
+    input_args.push_back(GetNodeArgIncludingParentGraphs(arg_name));
     input_indexes[arg_name] = cur_idx++;
   }

   cur_idx = 0;
   for (const auto& arg_name : func_meta_def->outputs) {
-    output_args.push_back(GetNodeArg(arg_name));
+    // In some cases the NodeArgs need to be retrieved from ancestor graphs.
+    output_args.push_back(GetNodeArgIncludingParentGraphs(arg_name));
     output_indexes[arg_name] = cur_idx++;
   }
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index fa6ad26cc248a..27226005a9c0b 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -658,6 +658,8 @@ struct ProviderHost {
   virtual std::unique_ptr<Node__EdgeIterator> Node__OutputEdgesEnd(const Node* p) noexcept = 0;
   virtual void Node__ForEachDef(const Node* p, std::function<void(const NodeArg&, bool is_input)> func, bool include_missing_optional_defs) = 0;
+  virtual const std::unordered_map<std::string, gsl::not_null<Graph*>>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0;
+  virtual std::unordered_map<std::string, gsl::not_null<const Graph*>> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0;

   // NodeArg
   virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0;
@@ -695,6 +697,8 @@ struct ProviderHost {
   virtual std::unique_ptr<ONNX_NAMESPACE::GraphProto> Graph__ToGraphProto(const Graph* p) = 0;
   virtual NodeArg& Graph__GetOrCreateNodeArg(Graph* p, const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) = 0;
+  virtual void Graph__AddOuterScopeNodeArg(Graph* p, const std::string& name) = 0;
+  virtual void Graph__SetInputs(Graph* p, gsl::span<const NodeArg* const> inputs) = 0;

   virtual Status Graph__Resolve(Graph* p) = 0;
   virtual void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) = 0;
@@ -708,10 +712,15 @@ struct ProviderHost {
   virtual const Node* Graph__ParentNode(const Graph* p) const = 0;
   virtual const Graph* Graph__ParentGraph(const Graph* p) const = 0;
+  virtual Graph* Graph__MutableParentGraph(Graph* p) = 0;
   virtual const std::string& Graph__Name(const Graph* p) const noexcept = 0;
   virtual const Path& Graph__ModelPath(const Graph* p) const = 0;
   virtual const std::vector<const NodeArg*>& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept = 0;
   virtual bool Graph__IsSubgraph(const Graph* p) = 0;
+  virtual int Graph__MaxNodeIndex(const Graph* p) const noexcept = 0;
+  virtual Node* Graph__GetNode(Graph* p, NodeIndex node_index) noexcept = 0;
+  virtual const Node* Graph__GetNode(const Graph* p, NodeIndex node_index) const = 0;
+  virtual const NodeArg* Graph__GetNodeArg(const Graph* p, const std::string& name) const = 0;

   // GraphViewer
   virtual void GraphViewer__operator_delete(GraphViewer* p) = 0;
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index e09c2c495886a..f0ab7869b7d50 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -653,6 +653,8 @@ struct Node final {
   EdgeConstIterator OutputEdgesEnd() const noexcept { return g_host->Node__OutputEdgesEnd(this); }
   void ForEachDef(std::function<void(const NodeArg&, bool is_input)> func, bool include_missing_optional_defs = false) const { g_host->Node__ForEachDef(this, func, std::move(include_missing_optional_defs)); }
+  const std::unordered_map<std::string, gsl::not_null<Graph*>>& GetAttributeNameToMutableSubgraphMap() { return g_host->Node__GetAttributeNameToMutableSubgraphMap(this); }
+  std::unordered_map<std::string, gsl::not_null<const Graph*>> GetAttributeNameToSubgraphMap() const { return g_host->Node__GetAttributeNameToSubgraphMap(this); }

   PROVIDER_DISALLOW_ALL(Node)
 };
@@ -707,6 +709,8 @@ struct Graph final {
   std::unique_ptr<ONNX_NAMESPACE::GraphProto> ToGraphProto() const { return g_host->Graph__ToGraphProto(this); }
   NodeArg& GetOrCreateNodeArg(const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) { return g_host->Graph__GetOrCreateNodeArg(this, name, p_arg_type); }
+  void AddOuterScopeNodeArg(const std::string& name) { g_host->Graph__AddOuterScopeNodeArg(this, name); }
+  void SetInputs(gsl::span<const NodeArg* const> inputs) { g_host->Graph__SetInputs(this, inputs); }

   Status Resolve() { return g_host->Graph__Resolve(this); }
   void AddInitializedTensor(const ONNX_NAMESPACE::TensorProto& tensor) { return g_host->Graph__AddInitializedTensor(this, tensor); }
@@ -721,10 +725,15 @@ struct Graph final {
   const Node* ParentNode() const { return g_host->Graph__ParentNode(this); }
   const Graph* ParentGraph() const { return g_host->Graph__ParentGraph(this); }
+  Graph* MutableParentGraph() { return g_host->Graph__MutableParentGraph(this); }
   const std::string& Name() const noexcept { return g_host->Graph__Name(this); }
   const Path& ModelPath() const { return g_host->Graph__ModelPath(this); }
   const std::vector<const NodeArg*>& GetInputsIncludingInitializers() const noexcept { return g_host->Graph__GetInputsIncludingInitializers(this); }
   bool IsSubgraph() const { return g_host->Graph__IsSubgraph(this); }
+  int MaxNodeIndex() const noexcept { return g_host->Graph__MaxNodeIndex(this); }
+  const Node* GetNode(NodeIndex node_index) const noexcept { return g_host->Graph__GetNode(this, node_index); }
+  Node* GetNode(NodeIndex node_index) noexcept { return g_host->Graph__GetNode(this, node_index); }
+  const NodeArg* GetNodeArg(const std::string& name) const { return g_host->Graph__GetNodeArg(this, name); }

   PROVIDER_DISALLOW_ALL(Graph)
 };
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 6142b6d393d7e..1567628a66829 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1283,6 +1283,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
     } else {
       auto model_build = graph.CreateModel(*GetLogger());
       auto& graph_build = model_build->MainGraph();
+      bool has_control_flow_op = false;

       // Add node and node args
       // If node output is also parent graph output, the output will be added to the
@@ -1321,6 +1322,10 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
           }
         }

+        if (control_flow_op_set_.find(node->OpType()) != control_flow_op_set_.end()) {
+          has_control_flow_op = true;
+        }
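+        // Setting has_control_flow_op here triggers the extra outer-scope-value handling
+        // (BuildSubGraphContext / SetGraphOuterScopeValuesAndInputs / SetAllGraphInputs)
+        // further below, before graph_build.Resolve() is called.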
+
         // If the node has a subgraph, it's possible that the ORT graph of that subgraph and the GraphProto in the node attributes are not in sync because of graph optimization.
         // Therefore, we need to force the GraphProto attributes to be updated in order to get a valid GraphProto.
         if (node->GetAttributes().size() > 0) {
@@ -1345,6 +1350,13 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect
         }
       }

+      if (has_control_flow_op) {
+        LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Handle outer scope values for the subgraph " << graph_build.Name();
+        BuildSubGraphContext(graph_build);
+        SetGraphOuterScopeValuesAndInputs(graph_build, graph.GetGraph());
+        SetAllGraphInputs(graph_build);
+      }
+
       ORT_ENFORCE(graph_build.Resolve().IsOK());

       // Add parent graph output to the subgraph
@@ -1657,6 +1669,20 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
       std::iota(std::begin(subgraph_nodes_vector), std::end(subgraph_nodes_vector), 0);
       SubGraphCollection_t parser_subgraph_nodes_vector = {{subgraph_nodes_vector, false}};
       bool subgraph_early_termination = false;
+
+      // Another subgraph of the "If" control flow op has already been parsed by GetCapability, and all of that subgraph's nodes were assigned to the TRT EP.
+      if (AllNodesAssignedToSpecificEP(*sub_graph_veiwer, kTensorrtExecutionProvider)) {
+        all_subgraphs_are_supported = true;
+        break;
+      }
+      // Another subgraph of the "If" control flow op has already been parsed by GetCapability, but not all of that subgraph's nodes were assigned to the TRT EP.
+      // (Note: GetExecutionProviderType() returns "" when a node has not yet been assigned to any EP.)
+      else if (!AllNodesAssignedToSpecificEP(*sub_graph_veiwer, "")) {
+        all_subgraphs_are_supported = false;
+        break;
+      }
+
+      // Another subgraph of the "If" control flow op has not yet been parsed by GetCapability.
       subgraph_supported_nodes_vector = GetSupportedList(parser_subgraph_nodes_vector, 0, max_partition_iterations_, *sub_graph_veiwer, &subgraph_early_termination);
       all_subgraphs_are_supported = IsSubGraphFullySupported(subgraph_supported_nodes_vector, number_of_ort_subgraph_nodes);
       break;
@@ -1677,6 +1703,9 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
         }
       }
       LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";
+
+      // The context map is only used during EP compile time; release it to save memory.
+      subgraph_context_map_.clear();
       return result;
     }
   }
@@ -1700,6 +1729,8 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs;
   }

+  // The context map is only used during EP compile time; release it to save memory.
+  subgraph_context_map_.clear();
   return result;
 }
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 56eda7ad83537..13c5eff08bc46 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -138,6 +138,15 @@ struct TensorrtFuncState {
   bool cuda_graph_enable = 0;
 };

+// Holds important information for building a valid ORT graph.
+struct SubGraphContext {
+  std::unordered_set<std::string> output_args;
+  std::unordered_map<std::string, const NodeArg*> inputs_and_initializers;
+  std::unordered_map<std::string, const NodeArg*> manually_added_graph_inputs;
+};
+
+using SubGraphContextMap = std::unordered_map<std::string, std::unique_ptr<SubGraphContext>>;
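+
+// For illustration (hypothetical subgraph): if a graph contains the single node
+// Add(x1, x2) -> y1 and no initializers, its SubGraphContext would hold
+//   output_args = {"y1"}, inputs_and_initializers = {"x1", "x2"},
+// while manually_added_graph_inputs stays empty unless outer-scope values have to be
+// promoted to explicit graph inputs (see SetGraphOuterScopeValuesAndInputs()).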
+
 // Logical device representation.
 class TensorrtExecutionProvider : public IExecutionProvider {
  public:
@@ -224,6 +233,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   const int min_num_runs_before_cuda_graph_capture_ = 1;  // required min regular runs before graph capture for the necessary memory allocations.

   std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
+  mutable std::unordered_map<std::string, std::unique_ptr<SubGraphContext>> subgraph_context_map_;
   std::unordered_map<std::string, std::unique_ptr<nvonnxparser::IParser>> parsers_;
   std::unordered_map<std::string, std::unique_ptr<nvinfer1::ICudaEngine>> engines_;
   std::unordered_map<std::string, std::unique_ptr<nvinfer1::IExecutionContext>> contexts_;
@@ -273,6 +283,42 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   /** Check whether all the nodes of the subgraph are supported */
   bool IsSubGraphFullySupported(SubGraphCollection_t supported_nodes_vector, const int number_of_ort_nodes) const;

+  /**
+   * Record the inputs, initializers and outputs of all subgraphs visited during TensorrtExecutionProvider::GetSupportedList()
+   * and save that information in the subgraph context data structure. It's useful for building a valid graph and
+   * making Graph::Resolve() happy, especially when dealing with nested control-flow-op graphs.
+   */
+  void BuildSubGraphContext(const Graph& build_graph) const;
+
+  /**
+   * Set outer-scope values for subgraphs and add those values as top-level graph inputs if needed.
+   */
+  void SetGraphOuterScopeValuesAndInputs(Graph& build_graph, const Graph& graph) const;
+
+  /**
+   * If ORT TRT manually sets graph inputs in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
+   * we have to manually set all the graph inputs in order to pass Graph::Resolve().
+   */
+  void SetAllGraphInputs(Graph& graph) const;
+
+  /**
+   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+   * Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again.
+   */
+  bool IsInputInitializerOrOutput(const Graph& graph, const std::string& name, bool check_ancestors) const;
+
+  /**
+   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+   * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again.
+   */
+  bool IsOuterScopeValue(const Graph& graph, const std::string& name) const;
+
+  /**
+   * The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+   * Graph::ResolveContext::IsLocalValue(). We have to implement this function again.
+   */
+  bool IsLocalValue(const Graph& graph, const std::string& name) const;
+
   bool IsGraphCaptureAllowed() const;
   void CaptureBegin();
   void CaptureEnd();
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
new file mode 100644
index 0000000000000..ecc72b1c65476
--- /dev/null
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc
@@ -0,0 +1,230 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/shared_library/provider_api.h"
+#include "tensorrt_execution_provider.h"
+#include <algorithm>
+
+namespace onnxruntime {
+
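+// Rough call sequence (as wired up in TensorrtExecutionProvider::GetSupportedList() when a
+// control flow op is detected), sketched here for orientation:
+//   1. BuildSubGraphContext(graph_build)                      - record outputs/inputs/initializers per graph
+//   2. SetGraphOuterScopeValuesAndInputs(graph_build, graph)  - resolve outer-scope values; promote the
+//                                                               missing ones to top-level graph inputs
+//   3. SetAllGraphInputs(graph_build)                         - re-declare the full input list so that
+//                                                               Graph::Resolve() accepts the added inputs
+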
+// The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this function again.
+bool TensorrtExecutionProvider::IsInputInitializerOrOutput(const Graph& graph,
+                                                           const std::string& name,
+                                                           bool check_ancestors) const {
+  const Graph* parent_graph = nullptr;
+  return IsLocalValue(graph, name) ||
+         (check_ancestors && (parent_graph = graph.ParentGraph()) != nullptr &&
+          IsInputInitializerOrOutput(*parent_graph, name, check_ancestors));
+}
+
+// The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+// Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again.
+bool TensorrtExecutionProvider::IsOuterScopeValue(const Graph& graph,
+                                                  const std::string& name) const {
+  const Graph* parent_graph = nullptr;
+  return (parent_graph = graph.ParentGraph()) != nullptr &&
+         IsInputInitializerOrOutput(*parent_graph, name, true);
+}
+
+// The newly-built graph has not yet been resolved by Graph::Resolve(), so we can't leverage
+// Graph::ResolveContext::IsLocalValue(). We have to implement this function again.
+bool TensorrtExecutionProvider::IsLocalValue(const Graph& graph,
+                                             const std::string& name) const {
+  if (subgraph_context_map_.find(graph.Name()) == subgraph_context_map_.end()) {
+    return false;
+  }
+  SubGraphContext* context = subgraph_context_map_.at(graph.Name()).get();
+  return context->output_args.find(name) != context->output_args.cend() ||
+         context->inputs_and_initializers.find(name) != context->inputs_and_initializers.cend();
+}
+
+/**
+ * Record the inputs, initializers and outputs of all subgraphs visited during TensorrtExecutionProvider::GetSupportedList()
+ * and save that information in the subgraph context data structure. It's useful for building a valid graph and
+ * making Graph::Resolve() happy, especially when dealing with nested control-flow-op graphs.
+ */
+void TensorrtExecutionProvider::BuildSubGraphContext(const Graph& graph) const {
+  // Iterate over all the nodes and recurse into the innermost subgraphs first
+  for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
+    auto node = graph.GetNode(i);
+    if (node == nullptr) {
+      continue;
+    }
+
+    auto subgraph_map = node->GetAttributeNameToSubgraphMap();
+    for (auto& entry : subgraph_map) {
+      const Graph* subgraph = entry.second;
+      BuildSubGraphContext(*subgraph);
+    }
+  }
+
+  // The subgraph context has been built before; no need to do it again
+  if (subgraph_context_map_.find(graph.Name()) != subgraph_context_map_.end()) {
+    return;
+  }
+
+  subgraph_context_map_.emplace(graph.Name(), std::make_unique<SubGraphContext>());
+  SubGraphContext* context = subgraph_context_map_.at(graph.Name()).get();
+
+  // Collect the outputs of all the nodes
+  for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
+    auto node = graph.GetNode(i);
+    if (node == nullptr) {
+      continue;
+    }
+
+    for (const auto& output : node->OutputDefs()) {
+      context->output_args.insert(output->Name());
+    }
+  }
+
+  // Go through all the nodes' inputs
+  for (int i = 0; i < graph.MaxNodeIndex(); ++i) {
+    auto node = graph.GetNode(i);
+    if (node == nullptr) {
+      continue;
+    }
+
+    for (const auto& input : node->InputDefs()) {
+      if (context->output_args.find(input->Name()) != context->output_args.end()) {
+        continue;
+      }
+      // This input arg is not the output of another node, so it must come from either a graph input or an initializer.
+      context->inputs_and_initializers[input->Name()] = input;
+    }
+  }
+}
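+
+// Illustrative walk-through (hypothetical names): for a graph with two nodes,
+// A: {x} -> {y} and B: {y, w} -> {z}, the passes above yield
+// output_args = {y, z} and inputs_and_initializers = {x, w},
+// since x and w are consumed but never produced inside this graph.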
+
+// Set outer-scope values for subgraphs and add those values as top-level graph inputs if needed.
+void TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(Graph& graph_build,
+                                                                  const Graph& graph) const {
+  // Iterate over all the nodes and recurse into the innermost subgraphs first, for both the newly-built graph and the original graph
+  for (int i = 0; i < graph_build.MaxNodeIndex(); ++i) {
+    auto graph_build_node = graph_build.GetNode(i);
+    if (graph_build_node == nullptr) {
+      continue;
+    }
+
+    auto graph_build_map = graph_build_node->GetAttributeNameToMutableSubgraphMap();
+    std::unordered_map<std::string, gsl::not_null<const Graph*>> subgraph_map;
+    const Node* graph_node = nullptr;
+
+    // Find the corresponding original graph node's subgraphs
+    for (int j = 0; j < graph.MaxNodeIndex(); ++j) {
+      if (graph.GetNode(j) && graph.GetNode(j)->Name() == graph_build_node->Name()) {
+        graph_node = graph.GetNode(j);
+        subgraph_map = graph_node->GetAttributeNameToSubgraphMap();
+        break;
+      }
+    }
+
+    for (auto& entry : graph_build_map) {
+      auto attr_name = entry.first;
+      Graph* subgraph_build = entry.second;
+      if (subgraph_map.find(attr_name) != subgraph_map.end()) {
+        // Recurse into the subgraph
+        const Graph* subgraph = subgraph_map.at(attr_name);
+        SetGraphOuterScopeValuesAndInputs(*subgraph_build, *subgraph);
+      }
+    }
+  }
+
+  // Start from the innermost subgraph and check whether its outer-scope values exist in the
+  // newly-built graph. If not, we need to add those outer-scope values as explicit inputs to the
+  // top level of the newly-built graph.
+  if (graph_build.ParentNode()) {
+    auto top_level_graph = &graph_build;
+    while (top_level_graph->MutableParentGraph()) {
+      top_level_graph = top_level_graph->MutableParentGraph();
+    }
+    if (subgraph_context_map_.find(top_level_graph->Name()) == subgraph_context_map_.end()) {
+      LOGS_DEFAULT(ERROR) << "[TensorRT EP] Can't find the top-level graph context. "
+                          << "Please check that BuildSubGraphContext() has built the graph context correctly.";
+      return;
+    }
+
+    SubGraphContext* context = subgraph_context_map_.at(top_level_graph->Name()).get();
+
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Subgraph name is " << graph_build.Name();
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Its parent node is " << graph.ParentNode()->Name();
+    LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Its parent node's implicit inputs:";
+
+    // Iterate over all the implicit inputs to set outer-scope values for the newly-built subgraph
+    for (const auto& input : graph.ParentNode()->ImplicitInputDefs()) {
+      LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name();
+
+      // A node arg among the parent node's implicit inputs could be used by one of the parent node's other subgraphs;
+      // for example, the "If" op has two subgraphs. So we need to make sure that the node arg is used in the current subgraph only.
+      // (GetNodeArg searches for the specific node arg among all node args in the graph)
+      if (graph_build.GetNodeArg(input->Name())) {
+        graph_build.AddOuterScopeNodeArg(input->Name());
+        LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is used in this subgraph";
+
+        if (context &&
+            (context->manually_added_graph_inputs.find(input->Name()) != context->manually_added_graph_inputs.end())) {
+          LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " has already been added as an explicit input to the graph";
+          continue;
+        }
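+
+        // For illustration: in the nested-"If" test model added in this PR, x2 is a top-level
+        // graph input consumed inside the innermost else-branch subgraph. While that subgraph
+        // is being rebuilt, x2 has no outer-scope definition in the new graph and therefore
+        // has to be promoted to an explicit input of the newly-built graph, as done below.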
+        // Handle the case where this outer-scope value doesn't exist at any outer scope level of the
+        // newly-built graph (the newly-built graph is a subgraph of the original graph). We need to add
+        // the outer-scope value as an explicit input to the top level of the newly-built graph.
+        if (!IsOuterScopeValue(graph_build, input->Name())) {
+          const auto& name = input->Name();
+          auto graph_inputs_including_initializers = top_level_graph->GetInputsIncludingInitializers();
+          auto added_graph_input = std::find_if(graph_inputs_including_initializers.begin(),
+                                                graph_inputs_including_initializers.end(),
+                                                [&name](const NodeArg* entry) { return entry->Name() == name; });
+
+          if (added_graph_input == graph_inputs_including_initializers.end()) {
+            if (context) {
+              auto type_proto = ONNX_NAMESPACE::TypeProto::Create();
+              type_proto->copy_from(input->TypeAsProto());
+              auto& n_input = top_level_graph->GetOrCreateNodeArg(name, type_proto.get());
+              context->manually_added_graph_inputs[n_input.Name()] = &n_input;
+              LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << n_input.Name() << " is added as an explicit input to the newly-built graph";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// If ORT TRT manually sets graph inputs in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
+// we have to manually set all the graph inputs in order to pass Graph::Resolve().
+void TensorrtExecutionProvider::SetAllGraphInputs(Graph& graph) const {
+  // If ORT TRT didn't manually set graph inputs in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(),
+  // Graph::Resolve() will set the graph inputs in Graph::SetGraphInputsOutputs(), so there is no need to set the graph inputs here.
+  if (subgraph_context_map_.find(graph.Name()) == subgraph_context_map_.end() ||
+      subgraph_context_map_[graph.Name()]->manually_added_graph_inputs.empty()) {
+    return;
+  }
+
+  SubGraphContext* context = subgraph_context_map_[graph.Name()].get();
+  std::vector<const NodeArg*> graph_inputs_including_initializers;
+  std::unordered_set<std::string> graph_inputs_including_initializers_set;
+
+  for (const auto& entry : context->inputs_and_initializers) {
+    graph_inputs_including_initializers.push_back(entry.second);
+    graph_inputs_including_initializers_set.insert(entry.first);
+  }
+
+  for (const auto& entry : context->manually_added_graph_inputs) {
+    if (graph_inputs_including_initializers_set.find(entry.first) == graph_inputs_including_initializers_set.end()) {
+      graph_inputs_including_initializers.push_back(entry.second);
+      graph_inputs_including_initializers_set.insert(entry.first);
+    }
+  }
+
+  for (const auto& node_arg : graph.GetInputsIncludingInitializers()) {
+    if (graph_inputs_including_initializers_set.find(node_arg->Name()) == graph_inputs_including_initializers_set.end()) {
+      graph_inputs_including_initializers.push_back(node_arg);
+      graph_inputs_including_initializers_set.insert(node_arg->Name());
+    }
+  }
+
+  graph.SetInputs(graph_inputs_including_initializers);
+}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 4003f984beda7..be549f5e665a0 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -763,6 +763,8 @@ struct ProviderHostImpl : ProviderHost {
   std::unique_ptr<Node__EdgeIterator> Node__OutputEdgesEnd(const Node* p) noexcept override { return std::make_unique<Node__EdgeIterator_Impl>(p->OutputEdgesEnd()); }
   void Node__ForEachDef(const Node* p, std::function<void(const NodeArg&, bool is_input)> func, bool include_missing_optional_defs) override { p->ForEachDef(func, std::move(include_missing_optional_defs)); }
+  const std::unordered_map<std::string, gsl::not_null<Graph*>>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) noexcept override { return p->GetAttributeNameToMutableSubgraphMap(); }
+  std::unordered_map<std::string, gsl::not_null<const Graph*>> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); }

   // NodeArg (wrapped)
   const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); }
@@ -803,8 +805,10 @@ struct ProviderHostImpl : ProviderHost {
   // Graph (wrapped)
   std::unique_ptr<GraphViewer> Graph__CreateGraphViewer(const Graph* p) override { return std::make_unique<GraphViewer>(*p); }
   std::unique_ptr<ONNX_NAMESPACE::GraphProto> Graph__ToGraphProto(const Graph* p) override { return std::make_unique<ONNX_NAMESPACE::GraphProto>(p->ToGraphProto()); }
+  void Graph__SetInputs(Graph* p, gsl::span<const NodeArg* const> inputs) override { p->SetInputs(inputs); }
   NodeArg& Graph__GetOrCreateNodeArg(Graph* p, const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) override { return p->GetOrCreateNodeArg(name, p_arg_type); }
+  void Graph__AddOuterScopeNodeArg(Graph* p, const std::string& name) override { p->AddOuterScopeNodeArg(name); }

   Status Graph__Resolve(Graph* p) override { return p->Resolve(); }
   void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); }
@@ -820,10 +824,15 @@ struct ProviderHostImpl : ProviderHost {
   const Node* Graph__ParentNode(const Graph* p) const override { return p->ParentNode(); }
   const Graph* Graph__ParentGraph(const Graph* p) const override { return p->ParentGraph(); }
+  Graph* Graph__MutableParentGraph(Graph* p) override { return p->MutableParentGraph(); }
   const std::string& Graph__Name(const Graph* p) const noexcept override { return p->Name(); }
   const Path& Graph__ModelPath(const Graph* p) const override { return p->ModelPath(); }
   const std::vector<const NodeArg*>& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept override { return p->GetInputsIncludingInitializers(); }
   bool Graph__IsSubgraph(const Graph* p) override { return p->IsSubgraph(); }
+  int Graph__MaxNodeIndex(const Graph* p) const noexcept override { return p->MaxNodeIndex(); }
+  Node* Graph__GetNode(Graph* p, NodeIndex node_index) noexcept override { return p->GetNode(node_index); }
+  const Node* Graph__GetNode(const Graph* p, NodeIndex node_index) const override { return p->GetNode(node_index); }
+  const NodeArg* Graph__GetNodeArg(const Graph* p, const std::string& name) const override { return p->GetNodeArg(name); }

   // GraphViewer (wrapped)
   void GraphViewer__operator_delete(GraphViewer* p) override { delete p; }
diff --git a/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py
new file mode 100644
index 0000000000000..bf354ad9f9e10
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py
@@ -0,0 +1,259 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import unittest
+from copy import deepcopy
+from typing import Optional, Sequence, Tuple
+
+import numpy as np
+from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, checker, helper
+
+import onnxruntime as ort
+
+
+def make_vi_like(vi: ValueInfoProto, name: str) -> ValueInfoProto:
+    """Makes a copy of `vi` with a new name."""
+    new_vi = deepcopy(vi)
+    new_vi.name = name
+    return new_vi
+
+
+def make_optional_tensor_value_info(name: str, elem_type: int, shape: Sequence[int]) -> ValueInfoProto:
+    """Makes a `ValueInfoProto` with optional type."""
+    tensor_type_proto = helper.make_tensor_type_proto(
+        elem_type=elem_type,
+        shape=shape,
+    )
+    opt_type_proto = helper.make_optional_type_proto(tensor_type_proto)
+
+    vi = helper.make_tensor_value_info(name, elem_type, shape)
+    vi.type.CopyFrom(opt_type_proto)
+    return vi
+
+
+def make_optional_vi(vi: ValueInfoProto, name: Optional[str] = None) -> ValueInfoProto:
+    """Makes a copy of `vi` with optional type."""
+    name = name or vi.name + ".opt"
+    vi_type = vi.type.tensor_type
+    vi_shape = [d.dim_param if d.dim_param else d.dim_value for d in vi_type.shape.dim]
+    opt_vi = make_optional_tensor_value_info(name, vi_type.elem_type, vi_shape)
+    return opt_vi
+
+
+def make_const(vi: ValueInfoProto, name: str, value: int = 0) -> Tuple[ValueInfoProto, NodeProto, TensorProto]:
+    """Creates a constant 1D tensor from `vi`."""
+    const_vi = make_vi_like(vi, name)
+    const_shape = [d.dim_value for d in vi.type.tensor_type.shape.dim]
+    const_shape_tensor = helper.make_tensor(f"{name}.shape", TensorProto.INT64, [len(const_shape)], const_shape)
+    const_fill = helper.make_tensor(f"{name}.const.value", const_vi.type.tensor_type.elem_type, [1], [value])
+    const_node = helper.make_node(
+        "ConstantOfShape",
+        inputs=[const_shape_tensor.name],
+        outputs=[const_vi.name],
+        name=f"ConstantOfShape.{name}",
+        value=const_fill,
+    )
+    return const_vi, const_node, const_shape_tensor
+
+
+# This model contains three levels of nested control flow ops.
+# The innermost subgraphs consume outer-scope values (x2 and x3) that are inputs of the top-level graph.
+def make_opt_nested_greater_or_equal() -> ModelProto:
+    """
+    Creates a nested graph with (`optional(x1)`, `x2`, `x3`) tensor inputs.
+
+    `x3` is similar to an optional input with a default value of -1.
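+
+    Rough nesting structure of the graphs built below (sketch for orientation):
+
+        If.OptionalHasElement.x1
+        |-- then: add-x1-x2-subgraph                 (y = x1 + x2)
+        `-- else: x3-subgraph
+                  If.OptionalHasElement.x3           (condition: x3 >= 0)
+                  |-- then: add-x3-const-subgraph    (y = Expand(x3 + 2))
+                  `-- else: add-x2-const-subgraph    (y = x2 + 1)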
+ """ + # Inputs/outputs + x1_vi = helper.make_tensor_value_info("x1", TensorProto.FLOAT, [1, 2]) + x2_vi = helper.make_tensor_value_info("x2", TensorProto.FLOAT, [1, 2]) + x3_vi = helper.make_tensor_value_info("x3", TensorProto.FLOAT, [1]) + y_vi = helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 2]) + opt_x1_vi = make_optional_vi(x1_vi, name="x1.opt") + + # Add `x1` and `x2` subgraph + y1_vi = make_vi_like(y_vi, "x1.add.x2") + input_get_elem_node = helper.make_node( + "OptionalGetElement", + inputs=[opt_x1_vi.name], + outputs=[x1_vi.name], + name="OptionalGetElement.Input", + ) + add_node = helper.make_node("Add", inputs=[x1_vi.name, x2_vi.name], outputs=[y1_vi.name], name="Add_Op") + add_x1_x2_subgraph = helper.make_graph( + [input_get_elem_node, add_node], + name="add-x1-x2-subgraph", + inputs=[], + outputs=[y1_vi], + value_info=[opt_x1_vi, x1_vi, x2_vi], + ) + + # Add `x2` and const subgraph + y2_vi = make_vi_like(y_vi, "x2.add.const") + const_vi, const_node, const_shape_tensor = make_const(x1_vi, "x1.const", value=1) + add_const_node = helper.make_node( + "Add", + inputs=[const_vi.name, x2_vi.name], + outputs=[y2_vi.name], + name="Add_Const", + ) + add_x2_const_subgraph = helper.make_graph( + [const_node, add_const_node], + name="add-x2-const-subgraph", + inputs=[], + outputs=[y2_vi], + value_info=[const_vi, x2_vi], + initializer=[const_shape_tensor], + ) + + # Add `x3` and const subgraph + add_const_out_vi = make_vi_like(x3_vi, "out.1") + y3_vi = make_vi_like(y_vi, "x3.add.const") + const_vi, const_node, const_shape_tensor = make_const(x3_vi, "x3.const", value=2) + add_const_node = helper.make_node( + "Add", + inputs=[const_vi.name, x3_vi.name], + outputs=[add_const_out_vi.name], + name="Add_Const.1", + ) + expand_shape = helper.make_tensor(f"{add_const_out_vi}.shape", TensorProto.INT64, [2], [1, 2]) + expand_node = helper.make_node( + "Expand", + inputs=[add_const_out_vi.name, expand_shape.name], + outputs=[y3_vi.name], + name="Expand.out", + ) + add_x3_const_subgraph = helper.make_graph( + [const_node, add_const_node, expand_node], + name="add-x3-const-subgraph", + inputs=[], + outputs=[y3_vi], + value_info=[x3_vi, const_vi, add_const_out_vi], + initializer=[const_shape_tensor, expand_shape], + ) + + # Subgraph flow based on `x3` value + y3_if_vi = make_vi_like(y_vi, "x3.if.out") + x3_eq_vi, x3_const_node, x3_const_shape_tensor = make_const(x3_vi, "x3.equal", value=0) + x3_ge_vi = helper.make_tensor_value_info( + "x3_ge", + TensorProto.BOOL, + shape=[1], + ) + x3_ge_node = helper.make_node( + "GreaterOrEqual", + inputs=[x3_vi.name, x3_eq_vi.name], + outputs=[x3_ge_vi.name], + name="GreaterOrEqual.Target", + ) + x3_has_elem_vi = helper.make_tensor_value_info( + "x3_has_elem", + TensorProto.BOOL, + shape=[], # scalar + ) + x3_has_elem_node = helper.make_node( + "Squeeze", + inputs=[x3_ge_vi.name], + outputs=[x3_has_elem_vi.name], + name="Squeeze.x3", + ) + if_input_node = helper.make_node( + "If", + inputs=[x3_has_elem_vi.name], # condition + outputs=[y3_if_vi.name], + name="If.OptionalHasElement.x3", + then_branch=add_x3_const_subgraph, + else_branch=add_x2_const_subgraph, + ) + x3_subgraph = helper.make_graph( + [x3_const_node, x3_ge_node, x3_has_elem_node, if_input_node], + name="x3-subgraph", + inputs=[], + outputs=[y3_if_vi], + value_info=[x3_vi, x3_eq_vi, x3_ge_vi, x3_has_elem_vi], + initializer=[x3_const_shape_tensor], + ) + + # Construct main graph + x1_has_elem_vi = helper.make_tensor_value_info( + "x1_has_elem", + TensorProto.BOOL, + shape=[], # scalar + ) + 
+
+    # Construct main graph
+    x1_has_elem_vi = helper.make_tensor_value_info(
+        "x1_has_elem",
+        TensorProto.BOOL,
+        shape=[],  # scalar
+    )
+    x1_has_elem_node = helper.make_node(
+        "OptionalHasElement",
+        inputs=[opt_x1_vi.name],
+        outputs=[x1_has_elem_vi.name],
+        name="OptionalHasElement.x1",
+    )
+    if_input_node = helper.make_node(
+        "If",
+        inputs=[x1_has_elem_vi.name],  # condition
+        outputs=[y_vi.name],
+        name="If.OptionalHasElement.x1",
+        then_branch=add_x1_x2_subgraph,
+        else_branch=x3_subgraph,
+    )
+    graph = helper.make_graph(
+        [x1_has_elem_node, if_input_node],
+        "opt-graph",
+        [opt_x1_vi, x2_vi, x3_vi],
+        [y_vi],
+        value_info=[x1_has_elem_vi],
+    )
+
+    m = helper.make_model(
+        graph,
+        opset_imports=[
+            helper.make_opsetid("", 15),
+        ],
+    )
+
+    checker.check_model(m, full_check=True)
+
+    return m
+
+
+def test_nested_optional_greater_or_equal(use_trt: bool = False) -> None:
+    m = make_opt_nested_greater_or_equal()
+
+    providers = ["CUDAExecutionProvider"]
+    if use_trt:
+        providers.insert(0, "TensorrtExecutionProvider")
+    session = ort.InferenceSession(
+        m.SerializeToString(),
+        providers=providers,
+    )
+
+    x1_name, x2_name, x3_name = (i.name for i in m.graph.input)
+    session.run(
+        [m.graph.output[0].name],
+        {
+            x1_name: None,
+            x2_name: np.ones((1, 2), dtype=np.float32),
+            x3_name: np.array([-1], dtype=np.float32),
+        },
+    )
+
+    return
+
+
+# ORT has a similar unit test, Test3LayerNestedSubgraph, whose 3-level nested graph consumes the same initializer in different subgraphs.
+# This unit test is slightly different: it also uses a 3-level nested graph, but it consumes outer-scope values
+# (the inputs of the top-level graph) in different subgraphs.
+class TestNestedControlFlowOpsGraph(unittest.TestCase):
+    # We currently only test the CUDA/TRT EPs because this issue has only been reported when using the CUDA/TRT EPs.
+    @unittest.skipIf(
+        "TensorrtExecutionProvider" not in ort.get_available_providers()
+        and "CUDAExecutionProvider" not in ort.get_available_providers(),
+        reason="Test CUDA/TRT EP only",
+    )
+    def test_3_level_control_flow_ops_graph(self):
+        if "CUDAExecutionProvider" in ort.get_available_providers():
+            test_nested_optional_greater_or_equal(use_trt=False)
+        if "TensorrtExecutionProvider" in ort.get_available_providers():
+            test_nested_optional_greater_or_equal(use_trt=True)
+
+
+if __name__ == "__main__":
+    unittest.main(module=__name__, buffer=True)
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 401c791bae6ac..85a3488fa8b60 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1788,6 +1788,11 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs):
         if not args.disable_ml_ops and not args.use_tensorrt:
             run_subprocess([sys.executable, "onnxruntime_test_python_mlops.py"], cwd=cwd, dll_path=dll_path)

+        if args.use_tensorrt:
+            run_subprocess(
+                [sys.executable, "onnxruntime_test_python_nested_control_flow_op.py"], cwd=cwd, dll_path=dll_path
+            )
+
         try:
             import onnx  # noqa: F401