From f92e47e95b13a240e37caf7b36577983544f98fc Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 10 Feb 2022 19:17:08 -0800
Subject: [PATCH] Remove onnxruntime_util dependency on onnxruntime_framework
 (#10512)

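onnxruntime_util and onnxruntime_framework depend on each other, which forms a
circular dependency between the two libraries. Break the cycle by removing
onnxruntime_util's dependency on onnxruntime_framework:

- Move the Tensor-based EigenMap() helpers from core/util/math_cpuonly.h into a
  new header, core/framework/math.h, and update their users to include it.
- Drop the Tensor* scratch_ptr parameter from math::Sum(); the CPU
  specialization never used it.
- Move core/profile/context.h to core/providers/cuda/nvtx_profile_context.h and
  stop globbing core/profile into onnxruntime_util.
- Set _SCL_SECURE_NO_WARNINGS on onnxruntime_framework from
  onnxruntime_framework.cmake instead of from onnxruntime_util.cmake.
- List onnxruntime_util after onnxruntime_framework and onnxruntime_graph in
  onnxruntime_INTERNAL_LIBRARIES to reflect the new dependency order.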
---
 cmake/onnxruntime.cmake                       |  6 +++--
 cmake/onnxruntime_framework.cmake             |  6 ++++-
 cmake/onnxruntime_util.cmake                  |  6 -----
 .../contrib_ops/cpu/element_wise_ops.cc       |  4 +++-
 onnxruntime/core/framework/math.h             | 23 +++++++++++++++++++
 .../framework/orttraining_partial_executor.cc |  4 ++--
 .../core/framework/sequential_executor.cc     |  4 ++--
 .../providers/cpu/math/element_wise_ops.cc    |  4 +++-
 onnxruntime/core/providers/cpu/math/sign.cc   |  2 +-
 .../core/providers/cpu/math/softmax_shared.cc |  5 +++-
 onnxruntime/core/providers/cpu/nn/shrink.cc   |  2 +-
 .../core/providers/cpu/rnn/rnn_helpers.h      |  1 +
 .../core/providers/cpu/tensor/isinf.cc        |  2 +-
 .../core/providers/cpu/tensor/isnan.cc        |  5 ++--
 .../cuda/nvtx_profile_context.h}              |  0
 onnxruntime/core/util/distance.h              |  2 +-
 onnxruntime/core/util/math.h                  |  6 ++---
 onnxruntime/core/util/math_cpu.cc             | 14 ++++++-----
 onnxruntime/core/util/math_cpuonly.h          | 12 ----------
 onnxruntime/core/util/thread_utils.cc         |  6 ++++-
 .../test/common/tensor_op_test_utils.h        |  3 +++
 .../core/graph/gradient_builder_base.h        |  3 ++-
 .../core/session/training_session.cc          |  2 +-
 .../models/runner/training_runner.cc          |  2 +-
 .../training_ops/cpu/math/scale.cc            |  2 +-
 .../training_ops/cpu/nn/dropout_7.cc          |  2 +-
 .../training_ops/cpu/op_gradients.cc          |  2 +-
 .../cuda/communication/nccl_service.cc        |  2 +-
 .../training_ops/cuda/communication/recv.cc   |  2 +-
 .../training_ops/cuda/communication/send.cc   |  4 ++--
 .../training_ops/cuda/controlflow/record.cc   |  4 ++--
 .../training_ops/cuda/controlflow/wait.cc     |  4 ++--
 32 files changed, 88 insertions(+), 58 deletions(-)
 create mode 100644 onnxruntime/core/framework/math.h
 rename onnxruntime/core/{profile/context.h => providers/cuda/nvtx_profile_context.h} (100%)

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 5f2312216493e..6fe4c6497a9ca 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -7,7 +7,7 @@ if(UNIX)
     set(OUTPUT_STYLE xcode)
   else()
     set(OUTPUT_STYLE gcc)
-  endif()  
+  endif()
 else()
   set(SYMBOL_FILE ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_dll.def)
   set(OUTPUT_STYLE vc)
@@ -157,6 +157,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_BUILD_JAVA)
   endforeach()
 endif()
 
+# This list is a reversed topological ordering of library dependencies.
+# Earlier entries may depend on later ones. Later ones should not depend on earlier ones.
 set(onnxruntime_INTERNAL_LIBRARIES
   onnxruntime_session
   ${onnxruntime_libs}
@@ -174,10 +176,10 @@ set(onnxruntime_INTERNAL_LIBRARIES
   ${onnxruntime_winml}
   onnxruntime_optimizer
   onnxruntime_providers
-  onnxruntime_util
   ${onnxruntime_tvm_libs}
   onnxruntime_framework
   onnxruntime_graph
+  onnxruntime_util
   ${ONNXRUNTIME_MLAS_LIBS}
   onnxruntime_common
   onnxruntime_flatbuffers
diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake
index 82f1e75dbc3c1..5b6681fd20daa 100644
--- a/cmake/onnxruntime_framework.cmake
+++ b/cmake/onnxruntime_framework.cmake
@@ -53,7 +53,7 @@ if (onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_TRAINING_OPS)
     onnxruntime_add_include_to_target(onnxruntime_framework Python::Module)
     target_include_directories(onnxruntime_framework PRIVATE ${PROJECT_SOURCE_DIR}/external/dlpack/include)
   endif()
-  if (onnxruntime_USE_NCCL OR onnxruntime_USE_MPI)  
+  if (onnxruntime_USE_NCCL OR onnxruntime_USE_MPI)
     target_include_directories(onnxruntime_framework PUBLIC ${MPI_CXX_INCLUDE_DIRS})
   endif()
 endif()
@@ -95,4 +95,8 @@ if (onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
   target_compile_definitions(onnxruntime_framework PRIVATE DEBUG_NODE_INPUTS_OUTPUTS_ENABLE_DUMP_TO_SQLDB)
 endif()
 
+if (WIN32)
+  target_compile_definitions(onnxruntime_framework PRIVATE _SCL_SECURE_NO_WARNINGS)
+endif()
+
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core)
diff --git a/cmake/onnxruntime_util.cmake b/cmake/onnxruntime_util.cmake
index 8449cf2e0f4e9..d25bd386ec9f2 100644
--- a/cmake/onnxruntime_util.cmake
+++ b/cmake/onnxruntime_util.cmake
@@ -4,17 +4,12 @@
 file(GLOB_RECURSE onnxruntime_util_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/util/*.h"
     "${ONNXRUNTIME_ROOT}/core/util/*.cc"
-    "${ONNXRUNTIME_ROOT}/core/profile/*.h"
-    "${ONNXRUNTIME_ROOT}/core/profile/*.cc"
 )
 
 source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_util_srcs})
 
 onnxruntime_add_static_library(onnxruntime_util ${onnxruntime_util_srcs})
 target_include_directories(onnxruntime_util PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC ${eigen_INCLUDE_DIRS})
-if (onnxruntime_USE_CUDA)
- target_include_directories(onnxruntime_util PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
-endif()
 onnxruntime_add_include_to_target(onnxruntime_util onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB})
 if(UNIX)
     target_compile_options(onnxruntime_util PUBLIC "-Wno-error=comment")
@@ -24,5 +19,4 @@ set_target_properties(onnxruntime_util PROPERTIES FOLDER "ONNXRuntime")
 add_dependencies(onnxruntime_util ${onnxruntime_EXTERNAL_DEPENDENCIES})
 if (WIN32)
     target_compile_definitions(onnxruntime_util PRIVATE _SCL_SECURE_NO_WARNINGS)
-    target_compile_definitions(onnxruntime_framework PRIVATE _SCL_SECURE_NO_WARNINGS)
 endif()
diff --git a/onnxruntime/contrib_ops/cpu/element_wise_ops.cc b/onnxruntime/contrib_ops/cpu/element_wise_ops.cc
index ab14ce0c45119..633748c0b445b 100644
--- a/onnxruntime/contrib_ops/cpu/element_wise_ops.cc
+++ b/onnxruntime/contrib_ops/cpu/element_wise_ops.cc
@@ -1,7 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "element_wise_ops.h"
+#include "contrib_ops/cpu/element_wise_ops.h"
+
+#include "core/framework/math.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/framework/math.h b/onnxruntime/core/framework/math.h
new file mode 100644
index 0000000000000..609c2de118d63
--- /dev/null
+++ b/onnxruntime/core/framework/math.h
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <gsl/gsl>
+
+#include "core/framework/tensor.h"
+#include "core/util/math_cpuonly.h"
+
+namespace onnxruntime {
+
+template <typename T>
+auto EigenMap(Tensor& t) -> EigenVectorMap<T> {
+  return EigenVectorMap<T>(t.template MutableData<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+}
+
+template <typename T>
+auto EigenMap(const Tensor& t) -> ConstEigenVectorMap<T> {
+  return ConstEigenVectorMap<T>(t.template Data<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/orttraining_partial_executor.cc b/onnxruntime/core/framework/orttraining_partial_executor.cc
index bfec6d6703574..aee060e2c403b 100644
--- a/onnxruntime/core/framework/orttraining_partial_executor.cc
+++ b/onnxruntime/core/framework/orttraining_partial_executor.cc
@@ -22,7 +22,7 @@
 #ifdef ENABLE_NVTX_PROFILE
 // This header is for profile using Nvidia's visual profilier.
 #include "core/providers/cuda/nvtx_profile.h"
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #endif
 
 // #define TRACE_EXECUTION
@@ -292,7 +292,7 @@ Status PartialExecutor::Execute(const SessionState& session_state, const std::ve
       }
     }
 #ifdef DEBUG_NODE_INPUTS_OUTPUTS
-    dump_context.program_counter = program_counter; 
+    dump_context.program_counter = program_counter;
     utils::DumpNodeInputs(dump_context, op_kernel_context, p_op_kernel->Node(), session_state);
 #endif
 
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 2391c2ab57aba..e100fc2aa524d 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -21,8 +21,8 @@
 
 #ifdef ENABLE_NVTX_PROFILE
 // This header is for profile using Nvidia's visual profilier.
-#include "core/providers/cuda/nvtx_profile.h" 
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #endif
 
 // #define TRACE_EXECUTION
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
index 5224c7fef5735..73978d61a1885 100644
--- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
+++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@@ -1,8 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/framework/data_types_internal.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
+
+#include "core/framework/data_types_internal.h"
+#include "core/framework/math.h"
 #include "core/providers/cpu/tensor/utils.h"
 #include "core/providers/op_kernel_type_control.h"
 #include <unsupported/Eigen/SpecialFunctions>
diff --git a/onnxruntime/core/providers/cpu/math/sign.cc b/onnxruntime/core/providers/cpu/math/sign.cc
index c7ddda013c904..afeff1073384e 100644
--- a/onnxruntime/core/providers/cpu/math/sign.cc
+++ b/onnxruntime/core/providers/cpu/math/sign.cc
@@ -8,10 +8,10 @@
 #include "core/common/common.h"
 #include "core/framework/data_types.h"
 #include "core/framework/element_type_lists.h"
+#include "core/framework/math.h"
 #include "core/framework/op_kernel.h"
 #include "core/providers/op_kernel_type_control.h"
 #include "core/util/math.h"
-#include "core/util/math_cpuonly.h"
 
 using namespace ::onnxruntime::common;
 using namespace ONNX_NAMESPACE;
diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
index f6e246e1b4ebd..b8eb8dae0977b 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
@@ -18,9 +18,12 @@
 * limitations under the License.
 */
 
+#include "core/providers/cpu/math/softmax_shared.h"
+
 #include <algorithm>
 #include <cmath>
-#include "core/providers/cpu/math/softmax_shared.h"
+#include <gsl/gsl>
+
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 #include "core/mlas/inc/mlas.h"
diff --git a/onnxruntime/core/providers/cpu/nn/shrink.cc b/onnxruntime/core/providers/cpu/nn/shrink.cc
index 0c336da99e24e..7a18e450b06c4 100644
--- a/onnxruntime/core/providers/cpu/nn/shrink.cc
+++ b/onnxruntime/core/providers/cpu/nn/shrink.cc
@@ -4,9 +4,9 @@
 #include "core/providers/cpu/nn/shrink.h"
 
 #include "core/framework/element_type_lists.h"
+#include "core/framework/math.h"
 #include "core/framework/utils.h"
 #include "core/providers/op_kernel_type_control.h"
-#include "core/util/math_cpuonly.h"
 #include "core/util/math.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
index 5551ad1cad0d5..95a8d87cc7665 100644
--- a/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
+++ b/onnxruntime/core/providers/cpu/rnn/rnn_helpers.h
@@ -10,6 +10,7 @@
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
 #include "core/framework/allocator.h"
+#include "core/framework/tensor.h"
 #include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
 #include "core/util/qmath.h"
diff --git a/onnxruntime/core/providers/cpu/tensor/isinf.cc b/onnxruntime/core/providers/cpu/tensor/isinf.cc
index 782cfeb9bee92..9b455ac7a9587 100644
--- a/onnxruntime/core/providers/cpu/tensor/isinf.cc
+++ b/onnxruntime/core/providers/cpu/tensor/isinf.cc
@@ -5,10 +5,10 @@
 
 #include "core/common/common.h"
 #include "core/framework/data_types_internal.h"
+#include "core/framework/math.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/tensor.h"
 #include "core/providers/op_kernel_type_control.h"
-#include "core/util/math_cpuonly.h"
 
 namespace onnxruntime {
 // https://github.com/onnx/onnx/blob/master/docs/Operators.md#IsInf
diff --git a/onnxruntime/core/providers/cpu/tensor/isnan.cc b/onnxruntime/core/providers/cpu/tensor/isnan.cc
index a4ac251e54fe3..9fc784ed30823 100644
--- a/onnxruntime/core/providers/cpu/tensor/isnan.cc
+++ b/onnxruntime/core/providers/cpu/tensor/isnan.cc
@@ -1,9 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "isnan.h"
-#include "core/util/math_cpuonly.h"
+#include "core/providers/cpu/tensor/isnan.h"
+
 #include "core/common/common.h"
+#include "core/framework/math.h"
 #include "core/framework/tensor.h"
 #include "Eigen/src/Core/arch/Default/Half.h"
 
diff --git a/onnxruntime/core/profile/context.h b/onnxruntime/core/providers/cuda/nvtx_profile_context.h
similarity index 100%
rename from onnxruntime/core/profile/context.h
rename to onnxruntime/core/providers/cuda/nvtx_profile_context.h
diff --git a/onnxruntime/core/util/distance.h b/onnxruntime/core/util/distance.h
index 02d6147df5024..1a40d2142cb81 100644
--- a/onnxruntime/core/util/distance.h
+++ b/onnxruntime/core/util/distance.h
@@ -3,7 +3,7 @@
 
 #pragma once
 #include <cmath>
-#include "math_cpuonly.h"
+#include "core/util/math_cpuonly.h"
 
 namespace onnxruntime {
 
diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h
index 393340ffc0938..97d2e7bd4bb4d 100644
--- a/onnxruntime/core/util/math.h
+++ b/onnxruntime/core/util/math.h
@@ -16,9 +16,10 @@
 
 #pragma once
 
+#include <cassert>
+
 #ifndef SHARED_PROVIDER
 #include "core/common/common.h"
-#include "core/framework/tensor.h"
 #endif
 
 #ifndef CBLAS_ENUM_DEFINED_H
@@ -89,8 +90,7 @@ void RowwiseSum(int N, int D, const T* x, T* y,
 
 // Sum of vector x, and writes the result to a single value y.
 template <typename T, class Provider>
-void Sum(int N, const T* x, T* y, Provider* provider,
-         Tensor* scratch_ptr = nullptr);
+void Sum(int N, const T* x, T* y, Provider* provider);
 
 template <typename T, class Provider>
 void Scale(int N, float alpha, const T* x, T* y, Provider* provider);
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index c09d885a23d81..164e88573c4cb 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -15,9 +15,11 @@
 */
 // Modifications Copyright (c) Microsoft.
 
-#include <algorithm>
-#include "core/util/math.h"
 #include "core/util/math_cpuonly.h"
+#include "core/util/math.h"
+
+#include <algorithm>
+#include <gsl/gsl>
 #include "core/mlas/inc/mlas.h"
 #if defined(__GNUC__)
 #pragma GCC diagnostic push
@@ -859,10 +861,10 @@ SPECIALIZED_ROWWISESUM(int64_t)
 SPECIALIZED_ROWWISESUM(double)
 #undef SPECIALIZED_ROWWISESUM
 
-#define SPECIALIZED_SUM(T)                                                                             \
-  template <>                                                                                          \
-  void Sum<T, CPUMathUtil>(int N, const T* x, T* y, CPUMathUtil* /* unused */, Tensor* /* unused */) { \
-    *y = ConstEigenVectorMap<T>(x, N).sum();                                                           \
+#define SPECIALIZED_SUM(T)                                                       \
+  template <>                                                                    \
+  void Sum<T, CPUMathUtil>(int N, const T* x, T* y, CPUMathUtil* /* unused */) { \
+    *y = ConstEigenVectorMap<T>(x, N).sum();                                     \
   }
 
 SPECIALIZED_SUM(float);
diff --git a/onnxruntime/core/util/math_cpuonly.h b/onnxruntime/core/util/math_cpuonly.h
index d9214b16c0b12..7e70bfc99be7d 100644
--- a/onnxruntime/core/util/math_cpuonly.h
+++ b/onnxruntime/core/util/math_cpuonly.h
@@ -62,9 +62,6 @@
 #pragma warning(pop)
 #endif
 
-#ifndef SHARED_PROVIDER
-#include "core/framework/tensor.h"
-#endif
 namespace onnxruntime {
 
 // common Eigen types that we will often use
@@ -109,15 +106,6 @@ template <typename T>
 using ConstEigenMatrixMapRowMajorOuterStride =
     Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>, 0, Eigen::OuterStride<>>;
 
-template <typename T>
-auto EigenMap(Tensor& t) -> EigenVectorMap<T> {
-  return EigenVectorMap<T>(t.template MutableData<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
-}
-template <typename T>
-auto EigenMap(const Tensor& t) -> ConstEigenVectorMap<T> {
-  return ConstEigenVectorMap<T>(t.template Data<T>(), gsl::narrow<ptrdiff_t>(t.Shape().Size()));
-}
-
 class CPUMathUtil {
  public:
   /*CPUMathUtil contains some help method like generate a
diff --git a/onnxruntime/core/util/thread_utils.cc b/onnxruntime/core/util/thread_utils.cc
index f47791baac4b3..94218ee7c9e0f 100644
--- a/onnxruntime/core/util/thread_utils.cc
+++ b/onnxruntime/core/util/thread_utils.cc
@@ -1,4 +1,8 @@
-#include "thread_utils.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/util/thread_utils.h"
+
 #include <algorithm>
 
 #ifdef _WIN32
diff --git a/onnxruntime/test/common/tensor_op_test_utils.h b/onnxruntime/test/common/tensor_op_test_utils.h
index 7a7c9b512b3c0..85371f21dc10b 100644
--- a/onnxruntime/test/common/tensor_op_test_utils.h
+++ b/onnxruntime/test/common/tensor_op_test_utils.h
@@ -6,11 +6,14 @@
 #include <random>
 #include <type_traits>
 
+#include <gsl/gsl>
+
 #include "gtest/gtest.h"
 
 #include "core/common/common.h"
 #include "core/common/optional.h"
 #include "core/common/type_utils.h"
+#include "core/framework/tensor.h"
 #include "core/util/math.h"
 
 namespace onnxruntime {
diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.h b/orttraining/orttraining/core/graph/gradient_builder_base.h
index bd615dc0484d1..b2156660b8c1e 100644
--- a/orttraining/orttraining/core/graph/gradient_builder_base.h
+++ b/orttraining/orttraining/core/graph/gradient_builder_base.h
@@ -5,8 +5,9 @@
 
 #include <vector>
 #include <string>
-#include "core/util/math.h"
+#include "core/framework/float16.h"
 #include "core/graph/graph.h"
+#include "core/util/math.h"
 #include "orttraining/core/graph/graph_augmenter.h"
 #include "orttraining/core/graph/gradient_config.h"
 #include "orttraining/core/graph/recompute_graph_utils.h"
diff --git a/orttraining/orttraining/core/session/training_session.cc b/orttraining/orttraining/core/session/training_session.cc
index a5b63d412165b..14c1afccc7aa8 100644
--- a/orttraining/orttraining/core/session/training_session.cc
+++ b/orttraining/orttraining/core/session/training_session.cc
@@ -36,7 +36,7 @@
 #ifdef ENABLE_NVTX_PROFILE
 #include <set>
 #include <thread>
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #endif
 
 namespace onnxruntime {
diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc
index d4789d825729f..664b489c62348 100644
--- a/orttraining/orttraining/models/runner/training_runner.cc
+++ b/orttraining/orttraining/models/runner/training_runner.cc
@@ -13,7 +13,7 @@
 #include "core/platform/env.h"
 #include "core/platform/path_lib.h"
 #ifdef ENABLE_NVTX_PROFILE
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #endif
 #include "core/session/environment.h"
 #include "orttraining/core/framework/checkpointing.h"
diff --git a/orttraining/orttraining/training_ops/cpu/math/scale.cc b/orttraining/orttraining/training_ops/cpu/math/scale.cc
index 42e3efe894b67..552fbc67540c3 100644
--- a/orttraining/orttraining/training_ops/cpu/math/scale.cc
+++ b/orttraining/orttraining/training_ops/cpu/math/scale.cc
@@ -2,8 +2,8 @@
 // Licensed under the MIT License.
 
 #include "orttraining/training_ops/cpu/math/scale.h"
+#include "core/framework/math.h"
 #include "core/providers/common.h"
-#include "core/util/math_cpuonly.h"
 
 namespace onnxruntime {
 namespace contrib {
diff --git a/orttraining/orttraining/training_ops/cpu/nn/dropout_7.cc b/orttraining/orttraining/training_ops/cpu/nn/dropout_7.cc
index ed126eb2607dd..4aea989c9f003 100644
--- a/orttraining/orttraining/training_ops/cpu/nn/dropout_7.cc
+++ b/orttraining/orttraining/training_ops/cpu/nn/dropout_7.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include "orttraining/training_ops/cpu/nn/dropout_7.h"
-#include "core/util/math_cpuonly.h"
+#include "core/framework/math.h"
 
 namespace onnxruntime {
 
diff --git a/orttraining/orttraining/training_ops/cpu/op_gradients.cc b/orttraining/orttraining/training_ops/cpu/op_gradients.cc
index a5ef415374907..e25acd6ea0e7d 100644
--- a/orttraining/orttraining/training_ops/cpu/op_gradients.cc
+++ b/orttraining/orttraining/training_ops/cpu/op_gradients.cc
@@ -143,7 +143,7 @@ Status SoftmaxGrad<T>::Compute(OpKernelContext* context) const {
     math::Exp<float, CPUMathUtil>(nd, Ydata, eYdata, nullptr);
     for (size_t i = 0; i < N; ++i) {
       float sdY;
-      math::Sum<float, CPUMathUtil>(d, dYdata + i * d, &sdY, nullptr, nullptr);
+      math::Sum<float, CPUMathUtil>(d, dYdata + i * d, &sdY, nullptr);
       math::Axpy<float, CPUMathUtil>(d, -sdY, eYdata + i * d, dXdata + i * d, nullptr);
     }
   } else {
diff --git a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc
index 2628e47c5ec3a..fb9c0e49a3b6a 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc
+++ b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc
@@ -5,7 +5,7 @@
 
 #include "orttraining/training_ops/cuda/communication/nccl_service.h"
 #include "core/common/common.h"
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #include "core/providers/cuda/cuda_check_memory.h"
 #include "core/providers/cuda/cuda_common.h"
 #include "orttraining/core/framework/communication/mpi/mpi_context.h"
diff --git a/orttraining/orttraining/training_ops/cuda/communication/recv.cc b/orttraining/orttraining/training_ops/cuda/communication/recv.cc
index 4f230e23154f8..60ab05d9d2e83 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/recv.cc
+++ b/orttraining/orttraining/training_ops/cuda/communication/recv.cc
@@ -7,7 +7,7 @@
 #include "orttraining/training_ops/communication_common.h"
 #include "orttraining/training_ops/cuda/communication/nccl_service.h"
 #include "core/providers/cuda/nvtx_profile.h"
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #include "core/providers/cuda/cuda_check_memory.h"
 #include "core/providers/cuda/cuda_common.h"
 #include <mpi.h>
diff --git a/orttraining/orttraining/training_ops/cuda/communication/send.cc b/orttraining/orttraining/training_ops/cuda/communication/send.cc
index 832472013fd29..d36ef09419905 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/send.cc
+++ b/orttraining/orttraining/training_ops/cuda/communication/send.cc
@@ -6,8 +6,8 @@
 #include "orttraining/training_ops/cuda/communication/send.h"
 #include "orttraining/training_ops/communication_common.h"
 #include "orttraining/training_ops/cuda/communication/nccl_service.h"
-#include "core/providers/cuda/nvtx_profile.h" 
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 #include "core/providers/cuda/cuda_check_memory.h"
 #include "core/providers/cuda/cuda_common.h"
 #include <mpi.h>
diff --git a/orttraining/orttraining/training_ops/cuda/controlflow/record.cc b/orttraining/orttraining/training_ops/cuda/controlflow/record.cc
index b491128111a45..f29dcd13f50cb 100644
--- a/orttraining/orttraining/training_ops/cuda/controlflow/record.cc
+++ b/orttraining/orttraining/training_ops/cuda/controlflow/record.cc
@@ -6,8 +6,8 @@
 // Include event mechanism shared by CPU and GPU implementations.
 #include "orttraining/training_ops/cpu/controlflow/event_pool.h"
 #include "orttraining/training_ops/cpu/controlflow/record.h"
-#include "core/providers/cuda/nvtx_profile.h" 
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 
 namespace onnxruntime {
 namespace cuda {
diff --git a/orttraining/orttraining/training_ops/cuda/controlflow/wait.cc b/orttraining/orttraining/training_ops/cuda/controlflow/wait.cc
index 75f44ffba7e85..1e466dd8d111a 100644
--- a/orttraining/orttraining/training_ops/cuda/controlflow/wait.cc
+++ b/orttraining/orttraining/training_ops/cuda/controlflow/wait.cc
@@ -6,8 +6,8 @@
 // Include event mechanism shared by CPU and GPU implementations.
 #include "orttraining/training_ops/cpu/controlflow/event_pool.h"
 #include "orttraining/training_ops/cpu/controlflow/wait.h"
-#include "core/providers/cuda/nvtx_profile.h" 
-#include "core/profile/context.h"
+#include "core/providers/cuda/nvtx_profile.h"
+#include "core/providers/cuda/nvtx_profile_context.h"
 
 namespace onnxruntime {
 namespace cuda {