Merge branch 'microsoft:main' into main
luoyu-intel authored Nov 30, 2023
2 parents 66f14e5 + e1d1033 commit 9669dfc
Showing 144 changed files with 6,654 additions and 1,247 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -13,6 +13,7 @@
"editor.codeActionsOnSave": {
"source.organizeImports": true
},
"editor.defaultFormatter": "ms-python.black-formatter"
},
// Enable Python linting and Pylance type checking
"python.analysis.typeCheckingMode": "basic",
4 changes: 3 additions & 1 deletion cmake/CMakeLists.txt
@@ -115,7 +115,9 @@ option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF)
option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)
option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF)
option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF)

#It's preferred to turn it OFF when onnxruntime is dynamically linked to PROTOBUF, but TensorRT always requires the full version of protobuf.
cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF "NOT onnxruntime_USE_TENSORRT" ON)
option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
6 changes: 1 addition & 5 deletions cmake/onnxruntime.cmake
@@ -282,11 +282,7 @@ endif()

# Assemble the Apple static framework (iOS and macOS)
if(onnxruntime_BUILD_APPLE_FRAMEWORK)
if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
else() # macOS
set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
endif()
set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})

# Setup the various directories required. Remove any existing ones so we start with a clean directory.
set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries)
6 changes: 2 additions & 4 deletions cmake/onnxruntime_providers_cuda.cmake
@@ -172,10 +172,8 @@
target_link_libraries(${target} PRIVATE cuda)
endif()

if (onnxruntime_USE_FLASH_ATTENTION OR onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION)
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples)
endif()
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples)

target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
2 changes: 1 addition & 1 deletion cmake/onnxruntime_unittests.cmake
@@ -783,7 +783,7 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock)
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut)
endif()

17 changes: 17 additions & 0 deletions cmake/patches/abseil/absl_gh_issue_1435_workaround.patch
@@ -0,0 +1,17 @@
--- absl/container/internal/layout.h 2023-11-28 09:35:48
+++ absl/container/internal/layout.updated.h 2023-11-28 10:13:14
@@ -181,9 +181,11 @@
#include <sanitizer/asan_interface.h>
#endif

-#if defined(__GXX_RTTI)
-#define ABSL_INTERNAL_HAS_CXA_DEMANGLE
-#endif
+// Comment out ABSL_INTERNAL_HAS_CXA_DEMANGLE definition to work around this issue:
+// https://github.com/abseil/abseil-cpp/issues/1435
+// #if defined(__GXX_RTTI)
+// #define ABSL_INTERNAL_HAS_CXA_DEMANGLE
+// #endif

#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE
#include <cxxabi.h>
2 changes: 2 additions & 0 deletions cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.h
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
24 changes: 24 additions & 0 deletions docs/ORTModule_Training_Guidelines.md
@@ -379,6 +379,30 @@ Check [FP16_Optimizer implementation](../orttraining/orttraining/python/training
export ORTMODULE_USE_TRITON=1
```

#### ORTMODULE_TRITON_CONFIG_FILE

- **Feature Area**: *ORTMODULE/TritonOp*
- **Description**: Triton codegen currently supports a subset of ops, such as certain elementwise and reduction ops. When Triton optimization is enabled, all supported ops are optimized by default where possible. Users can provide a custom JSON config file to control which ops to optimize and how to optimize them. Below is a sample config. Each op entry requires an opset version list, plus a domain when it is not the default. The "conditions" field can currently be used to constrain an axis/axes attribute or input: specify the actual value, use "single" to require a single dimension, or "constant" to require a constant tensor. Save the JSON to a file and assign its path to the environment variable below to enable the custom config.

```json
{
"ops": {
"Add": {"versions": [13, 14]},
"Sub": {"versions": [13, 14]},
"Identity": {"versions": [13], "is_no_op": True},
"ReduceSum": {"versions": [13], "conditions": {"axes": "[-1]"}},
"Softmax": {"versions": [13]},
"SoftmaxGrad_13": {"domain": "com.microsoft", "versions": [1]}
},
"initializer": "scalar",
"min_nodes": 2
}
```

```bash
export ORTMODULE_TRITON_CONFIG_FILE=triton_config.json
```

#### ORTMODULE_ENABLE_TUNING

- **Feature Area**: *ORTMODULE/TritonOp*
2 changes: 1 addition & 1 deletion docs/OperatorKernels.md
@@ -373,7 +373,7 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[2, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|SplitToSequence|*in* input:**T**<br> *in* split:**I**<br> *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(string)|
|SplitToSequence|*in* input:**T**<br> *in* split:**I**<br> *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(string)|
|Sqrt|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Squeeze|*in* data:**T**<br> *in* axes:**tensor(int64)**<br> *out* squeezed:**T**<br><br>or<br><br>*in* data:**T**<br> *out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
2 changes: 1 addition & 1 deletion include/onnxruntime/core/graph/graph.h
@@ -668,7 +668,7 @@ class Node {
The Graph representation containing the graph inputs and outputs, the Node instances,
and the edges connecting the nodes.
*/
class Graph {
class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve existing data member order for readability
public:
/** Gets the Graph name. */
const std::string& Name() const noexcept;
4 changes: 2 additions & 2 deletions js/README.md
@@ -344,13 +344,13 @@ From ORT v1.13 onwards the 'full' ONNX Runtime package is used. It supports both
Full build:

```sh
python tools/ci_build/github/apple/build_ios_framework.py tools/ci_build/github/apple/default_full_ios_framework_build_settings.json --config Release
python tools/ci_build/github/apple/build_apple_framework.py tools/ci_build/github/apple/default_full_apple_framework_build_settings.json --config Release
```

Reduced size build:

```sh
python tools/ci_build/github/apple/build_ios_framework.py tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json --config MinSizeRel --include_ops_by_config <required_ops_and_types_for_your_models.config> --enable_reduced_operator_type_support
python tools/ci_build/github/apple/build_apple_framework.py tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json --config MinSizeRel --include_ops_by_config <required_ops_and_types_for_your_models.config> --enable_reduced_operator_type_support
```

The build creates `Headers`, `LICENSE`, and `onnxruntime.xcframework` in `build/iOS_framework/framework_out` directory. From `framework_out` directory, create an archive file named `onnxruntime-c.zip` for a full build or `onnxruntime-mobile-c.zip` for a reduced size build and copy to `<ORT_ROOT>/js/react_native/local_pods` directory.
3 changes: 2 additions & 1 deletion js/common/lib/backend.ts
@@ -49,8 +49,9 @@ export interface TrainingSessionHandler extends SessionHandler {
feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType,
options: InferenceSession.RunOptions): Promise<SessionHandler.ReturnType>;

getParametersSize(trainableOnly: boolean): Promise<number>;
loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>;
getContiguousParameters(trainableOnly: boolean): Promise<Uint8Array>;
getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
}

/**
20 changes: 16 additions & 4 deletions js/common/lib/training-session-impl.ts
@@ -176,12 +176,24 @@ export class TrainingSession implements TrainingSessionInterface {
return this.convertHandlerReturnTypeToMapOfTensors(results);
}

async loadParametersBuffer(_array: Uint8Array, _trainableOnly: boolean): Promise<void> {
throw new Error('Method not implemented.');
async getParametersSize(trainableOnly = true): Promise<number> {
return this.handler.getParametersSize(trainableOnly);
}

async getContiguousParameters(_trainableOnly: boolean): Promise<Uint8Array> {
throw new Error('Method not implemented.');
async loadParametersBuffer(array: Uint8Array, trainableOnly = true): Promise<void> {
const paramsSize = await this.getParametersSize(trainableOnly);
// checking that the size of the Uint8Array is equivalent to the byte length of a Float32Array of the number
// of parameters
if (array.length !== 4 * paramsSize) {
throw new Error(
'Size of the buffer passed into loadParametersBuffer must match the number of parameters in ' +
'the model. Please use getParametersSize method to check.');
}
return this.handler.loadParametersBuffer(array, trainableOnly);
}

async getContiguousParameters(trainableOnly = true): Promise<OnnxValue> {
return this.handler.getContiguousParameters(trainableOnly);
}

async release(): Promise<void> {
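The size check in the new `loadParametersBuffer` implementation assumes Float32 parameters, so a valid buffer holds exactly 4 bytes per parameter. A minimal sketch of that invariant (the parameter values here are illustrative only):

```ts
// A Float32Array of N elements occupies 4 * N bytes when viewed as a
// Uint8Array, which is what the `array.length !== 4 * paramsSize` check verifies.
const params = new Float32Array([0.1, 0.2, 0.3]);  // pretend: 3 model parameters
const asBytes = new Uint8Array(params.buffer);
console.log(asBytes.length);  // 12, i.e. 4 * params.length
```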
27 changes: 20 additions & 7 deletions js/common/lib/training-session.ts
@@ -2,6 +2,7 @@
// Licensed under the MIT License.

import {InferenceSession} from './inference-session.js';
import {OnnxValue} from './onnx-value.js';
import {TrainingSession as TrainingSessionImpl} from './training-session-impl.js';

/* eslint-disable @typescript-eslint/no-redeclare */
@@ -49,21 +50,33 @@ export interface TrainingSession {
// #endregion

// #region copy parameters

/**
* Retrieves the size of all parameters for the training state, calculated as the total number of primitive
* elements (of the parameters' datatype) across all parameters in the training state.
*
* @param trainableOnly - When set to true, the size is calculated for trainable params only. Default value is true.
*/
getParametersSize(trainableOnly: boolean): Promise<number>;

/**
* Copies from a buffer containing parameters to the TrainingSession parameters.
* Copies parameter values from the given array to the training state. Currently, only supporting models with
* parameters of type Float32.
*
* @param buffer - buffer containing parameters
* @param trainableOnly - True if trainable parameters only to be modified, false otherwise.
* @param buffer - Float32 buffer containing parameters converted to a Uint8Array.
* @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true.
*/
loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>;

/**
* Copies from the TrainingSession parameters to a buffer.
* Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning.
* Currently, only supporting models with parameters of type Float32.
*
* @param trainableOnly - True if trainable parameters only to be copied, false otherwise.
* @returns A promise that resolves to a buffer of the requested parameters.
* @param trainableOnly - When set to true, only trainable parameters are copied. Trainable parameters are parameters
* for which requires_grad is set to true. Default value is true.
* @returns A promise that resolves to a Float32 OnnxValue of the requested parameters.
*/
getContiguousParameters(trainableOnly: boolean): Promise<Uint8Array>;
getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
// #endregion

// #region release()
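Taken together, the reworked copy-parameters API can be exercised as below; a minimal sketch assuming the `onnxruntime-web/training` bundle and hypothetical artifact paths (none of these file names come from this diff):

```ts
import {TrainingSession} from 'onnxruntime-web/training';

// Hypothetical training artifacts; substitute paths from your own setup.
const session = await TrainingSession.create({
  checkpointState: 'checkpoint',
  trainModel: 'training_model.onnx',
});

// Total number of trainable parameter elements in the training state.
const size = await session.getParametersSize(true);

// Parameters are returned as a Float32 OnnxValue ...
const params = await session.getContiguousParameters(true);

// ... and are loaded back in as a Uint8Array of exactly 4 * size bytes.
const buffer = new Uint8Array(new Float32Array(size).buffer);
await session.loadParametersBuffer(buffer, true);
```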
1 change: 1 addition & 0 deletions js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -413,6 +413,7 @@ export class WebGpuBackend {
if (!artifact) {
artifact = this.programManager.build(program, normalizedDispatchGroup);
this.programManager.setArtifact(key, artifact);
LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`);
}

LOG_DEBUG(
5 changes: 3 additions & 2 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -180,7 +180,7 @@ export const createConv2DMatMulProgramInfo =

LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`);

const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0];
const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1;

const tileAOuter = workGroupSize[1] * elementsPerThread[1];
const tileBOuter = workGroupSize[0] * elementsPerThread[0];
@@ -197,7 +197,8 @@
const components = isVec4 ? 4 : 1;
const programUniforms: ProgramUniform[] =
[{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components);
const x =
inputVariable('x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 1 : innerElementSize);
const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components);
const inputVariables = [x, w];

26 changes: 9 additions & 17 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
@@ -22,7 +22,7 @@
import {TensorView} from '../../../tensor-view';
import {ShapeUtil} from '../../../util';
import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types';
import {createTensorShapeVariables, enableShapesUniforms, getBroadcastDims, IndicesHelper, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common';
import {createTensorShapeVariables, enableShapesUniforms, getBroadcastDims, IndicesHelper, inputVariable, internalVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common';
import {getActivationSnippet, InternalActivationAttributes} from '../fuse-utils';

import {typeSnippet} from './activation_util';
@@ -341,13 +341,8 @@ fn main(@builtin(local_invocation_id) localId : vec3<u32>,
const matMulReadWriteFnSource =
(component: number, hasBias: boolean, applyActivation: string, variables: IndicesHelper[],
batchShapes: Array<readonly number[]>, isChannelsLast = false): string => {
const batchAShape = batchShapes[0];
const batchBShape = batchShapes[1];
const batchShape = batchShapes[2];
const batchVariable = variables[0];
const aVariable = variables[1];
const bVariable = variables[2];
const outputVariable = variables[3];
const [batchAShape, batchBShape, batchShape] = batchShapes;
const [batchVariable, aVariable, bVariable, outputVariable] = variables;
const broadCastADims = getBroadcastDims(batchAShape, batchShape);
const broadCastBDims = getBroadcastDims(batchBShape, batchShape);
const dataType = tensorTypeToWsglStorageType(variables[0].type.tensor);
@@ -434,9 +429,7 @@ export const createMatmulProgramInfo =
const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2);
const enableBatchUniforms = enableShapesUniforms(outerDims.length);
const batchShapeOrRank = enableBatchUniforms ? outerDims.length : outerDims;
const batchDims = inputVariable('batchDims', inputs[0].dataType, batchShapeOrRank, 1, true);
const variables = [batchDims];
const batchShapes = [outerDimsA, outerDimsB, outerDims];
const batchDims = internalVariable('batchDims', inputs[0].dataType, batchShapeOrRank, 1);
const batchSize = ShapeUtil.size(outerDims);

const dimAOuter = aShape[aShape.length - 2];
@@ -469,10 +462,7 @@
const A = inputVariable('a', inputs[0].dataType, aShapeOrRank, components);
const B = inputVariable('b', inputs[1].dataType, bShapeOrRank, components);
const output = outputVariable('result', inputs[0].dataType, outputShapeTemp.length, components);
variables.push(A);
variables.push(B);
variables.push(output);
const inputVariables = [batchDims, A, B];
const inputVariables = [A, B];
const programUniforms: ProgramUniform[] =
[{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
if (enableBatchUniforms) {
@@ -490,8 +480,9 @@

const hasBias = inputs.length > 2;
const {activationFunction, applyActivation} = getActivationSnippet(activationAttributes, output.type.value);
const declareFunctions =
matMulReadWriteFnSource(components, hasBias, applyActivation, variables, batchShapes, isChannelsLast);
const declareFunctions = matMulReadWriteFnSource(
components, hasBias, applyActivation, [batchDims, A, B, output], [outerDimsA, outerDimsB, outerDims],
isChannelsLast);
if (hasBias) {
const biasComponents = isChannelsLast ? components : 1;
inputVariables.push(inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, biasComponents));
Expand All @@ -506,6 +497,7 @@ export const createMatmulProgramInfo =
shaderHelper.registerUniform('dimAOuter', 'i32')
.registerUniform('dimBOuter', 'i32')
.registerUniform('dimInner', 'i32')
.registerInternalVariables(batchDims)
.declareVariables(...inputVariables, output)}
${activationFunction}
${declareFunctions}