Merge branch 'microsoft:main' into main
luoyu-intel authored Nov 30, 2023
2 parents 66f14e5 + e1d1033 commit 9669dfc
Showing 144 changed files with 6,654 additions and 1,247 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
@@ -13,6 +13,7 @@
"editor.codeActionsOnSave": {
"source.organizeImports": true
},
"editor.defaultFormatter": "ms-python.black-formatter"
},
// Enable Python linting and Pylance type checking
"python.analysis.typeCheckingMode": "basic",
4 changes: 3 additions & 1 deletion cmake/CMakeLists.txt
@@ -115,7 +115,9 @@ option(onnxruntime_ENABLE_LTO "Enable link time optimization" OFF)
option(onnxruntime_CROSS_COMPILING "Cross compiling onnx runtime" OFF)
option(onnxruntime_GCOV_COVERAGE "Compile with options necessary to run code coverage" OFF)
option(onnxruntime_DONT_VECTORIZE "Do not vectorize operations in Eigen" OFF)
option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF)

#It's preferred to turn it OFF when onnxruntime is dynamically linked to PROTOBUF, but TensorRT always requires the full version of protobuf.
cmake_dependent_option(onnxruntime_USE_FULL_PROTOBUF "Link to libprotobuf instead of libprotobuf-lite when this option is ON" OFF "NOT onnxruntime_USE_TENSORRT" ON)
option(tensorflow_C_PACKAGE_PATH "Path to tensorflow C package installation dir")
option(onnxruntime_ENABLE_LANGUAGE_INTEROP_OPS "Enable operator implemented in language other than cpp" OFF)
option(onnxruntime_DEBUG_NODE_INPUTS_OUTPUTS "Dump debug information about node inputs and outputs when executing the model." OFF)
6 changes: 1 addition & 5 deletions cmake/onnxruntime.cmake
@@ -282,11 +282,7 @@ endif()

# Assemble the Apple static framework (iOS and macOS)
if(onnxruntime_BUILD_APPLE_FRAMEWORK)
if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})
else() # macOS
set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
endif()
set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT})

# Setup the various directories required. Remove any existing ones so we start with a clean directory.
set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries)
6 changes: 2 additions & 4 deletions cmake/onnxruntime_providers_cuda.cmake
@@ -172,10 +172,8 @@
target_link_libraries(${target} PRIVATE cuda)
endif()

if (onnxruntime_USE_FLASH_ATTENTION OR onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION)
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples)
endif()
include(cutlass)
target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples)

target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
2 changes: 1 addition & 1 deletion cmake/onnxruntime_unittests.cmake
@@ -783,7 +783,7 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock)
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut)
endif()

17 changes: 17 additions & 0 deletions cmake/patches/abseil/absl_gh_issue_1435_workaround.patch
@@ -0,0 +1,17 @@
--- absl/container/internal/layout.h 2023-11-28 09:35:48
+++ absl/container/internal/layout.updated.h 2023-11-28 10:13:14
@@ -181,9 +181,11 @@
#include <sanitizer/asan_interface.h>
#endif

-#if defined(__GXX_RTTI)
-#define ABSL_INTERNAL_HAS_CXA_DEMANGLE
-#endif
+// Comment out ABSL_INTERNAL_HAS_CXA_DEMANGLE definition to work around this issue:
+// https://github.com/abseil/abseil-cpp/issues/1435
+// #if defined(__GXX_RTTI)
+// #define ABSL_INTERNAL_HAS_CXA_DEMANGLE
+// #endif

#ifdef ABSL_INTERNAL_HAS_CXA_DEMANGLE
#include <cxxabi.h>
2 changes: 2 additions & 0 deletions cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
${winml_lib_api_dir}/HardwareCoreEnumerator.h
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
24 changes: 24 additions & 0 deletions docs/ORTModule_Training_Guidelines.md
@@ -379,6 +379,30 @@ Check [FP16_Optimizer implementation](../orttraining/orttraining/python/training
export ORTMODULE_USE_TRITON=1
```

#### ORTMODULE_TRITON_CONFIG_FILE

- **Feature Area**: *ORTMODULE/TritonOp*
- **Description**: Triton codegen currently supports a subset of ops, such as certain elementwise and reduction ops. When Triton optimization is enabled, all supported ops are optimized by default where possible. Users can provide a custom JSON config file to control which ops to optimize and how to optimize them. Below is a sample config. Each op entry requires an opset version list, plus a domain when it is not the default. The "conditions" field can currently be used to constrain an axis/axes attribute or input: specify the actual value, use "single" to require a single dimension, or "constant" to require a constant tensor. Save the JSON to a file and assign its path to the environment variable below to enable the custom config.

```json
{
"ops": {
"Add": {"versions": [13, 14]},
"Sub": {"versions": [13, 14]},
"Identity": {"versions": [13], "is_no_op": True},
"ReduceSum": {"versions": [13], "conditions": {"axes": "[-1]"}},
"Softmax": {"versions": [13]},
"SoftmaxGrad_13": {"domain": "com.microsoft", "versions": [1]}
},
"initializer": "scalar",
"min_nodes": 2
}
```

```bash
export ORTMODULE_TRITON_CONFIG_FILE=triton_config.json
```

#### ORTMODULE_ENABLE_TUNING

- **Feature Area**: *ORTMODULE/TritonOp*
2 changes: 1 addition & 1 deletion docs/OperatorKernels.md
@@ -373,7 +373,7 @@ Do not modify directly.*
|||[13, 17]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[11, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[2, 10]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|SplitToSequence|*in* input:**T**<br> *in* split:**I**<br> *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(string)|
|SplitToSequence|*in* input:**T**<br> *in* split:**I**<br> *out* output_sequence:**S**|11+|**I** = tensor(int32), tensor(int64)<br/> **S** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(string)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8))<br/> **T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(string)|
|Sqrt|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Squeeze|*in* data:**T**<br> *in* axes:**tensor(int64)**<br> *out* squeezed:**T**<br><br>or<br><br>*in* data:**T**<br> *out* squeezed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
2 changes: 1 addition & 1 deletion include/onnxruntime/core/graph/graph.h
@@ -668,7 +668,7 @@ class Node {
The Graph representation containing the graph inputs and outputs, the Node instances,
and the edges connecting the nodes.
*/
class Graph {
class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve existing data member order for readability
public:
/** Gets the Graph name. */
const std::string& Name() const noexcept;
4 changes: 2 additions & 2 deletions js/README.md
@@ -344,13 +344,13 @@ From ORT v1.13 onwards the 'full' ONNX Runtime package is used. It supports both
Full build:

```sh
python tools/ci_build/github/apple/build_ios_framework.py tools/ci_build/github/apple/default_full_ios_framework_build_settings.json --config Release
python tools/ci_build/github/apple/build_apple_framework.py tools/ci_build/github/apple/default_full_apple_framework_build_settings.json --config Release
```

Reduced size build:

```sh
python tools/ci_build/github/apple/build_ios_framework.py tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json --config MinSizeRel --include_ops_by_config <required_ops_and_types_for_your_models.config> --enable_reduced_operator_type_support
python tools/ci_build/github/apple/build_apple_framework.py tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json --config MinSizeRel --include_ops_by_config <required_ops_and_types_for_your_models.config> --enable_reduced_operator_type_support
```

The build creates `Headers`, `LICENSE`, and `onnxruntime.xcframework` in `build/iOS_framework/framework_out` directory. From `framework_out` directory, create an archive file named `onnxruntime-c.zip` for a full build or `onnxruntime-mobile-c.zip` for a reduced size build and copy to `<ORT_ROOT>/js/react_native/local_pods` directory.
3 changes: 2 additions & 1 deletion js/common/lib/backend.ts
@@ -49,8 +49,9 @@ export interface TrainingSessionHandler extends SessionHandler {
feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType,
options: InferenceSession.RunOptions): Promise<SessionHandler.ReturnType>;

getParametersSize(trainableOnly: boolean): Promise<number>;
loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>;
getContiguousParameters(trainableOnly: boolean): Promise<Uint8Array>;
getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
}

/**
20 changes: 16 additions & 4 deletions js/common/lib/training-session-impl.ts
@@ -176,12 +176,24 @@ export class TrainingSession implements TrainingSessionInterface {
return this.convertHandlerReturnTypeToMapOfTensors(results);
}

async loadParametersBuffer(_array: Uint8Array, _trainableOnly: boolean): Promise<void> {
throw new Error('Method not implemented.');
async getParametersSize(trainableOnly = true): Promise<number> {
return this.handler.getParametersSize(trainableOnly);
}

async getContiguousParameters(_trainableOnly: boolean): Promise<Uint8Array> {
throw new Error('Method not implemented.');
async loadParametersBuffer(array: Uint8Array, trainableOnly = true): Promise<void> {
const paramsSize = await this.getParametersSize(trainableOnly);
// checking that the size of the Uint8Array is equivalent to the byte length of a Float32Array of the number
// of parameters
if (array.length !== 4 * paramsSize) {
throw new Error(
'Size of the buffer passed into loadParametersBuffer must match the number of parameters in ' +
'the model. Please use getParametersSize method to check.');
}
return this.handler.loadParametersBuffer(array, trainableOnly);
}

async getContiguousParameters(trainableOnly = true): Promise<OnnxValue> {
return this.handler.getContiguousParameters(trainableOnly);
}

async release(): Promise<void> {
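The size check in the new `loadParametersBuffer` implementation assumes Float32 parameters, so a valid buffer holds exactly 4 bytes per parameter. A minimal sketch of that invariant (the parameter values here are illustrative only):

```ts
// A Float32Array of N elements occupies 4 * N bytes when viewed as a
// Uint8Array, which is what the `array.length !== 4 * paramsSize` check verifies.
const params = new Float32Array([0.1, 0.2, 0.3]);  // pretend: 3 model parameters
const asBytes = new Uint8Array(params.buffer);
console.log(asBytes.length);  // 12, i.e. 4 * params.length
```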
27 changes: 20 additions & 7 deletions js/common/lib/training-session.ts
@@ -2,6 +2,7 @@
// Licensed under the MIT License.

import {InferenceSession} from './inference-session.js';
import {OnnxValue} from './onnx-value.js';
import {TrainingSession as TrainingSessionImpl} from './training-session-impl.js';

/* eslint-disable @typescript-eslint/no-redeclare */
@@ -49,21 +50,33 @@ export interface TrainingSession {
// #endregion

// #region copy parameters

/**
* Retrieves the size of all parameters for the training state, calculated as the total number of primitive
* elements (of the parameters' datatype) across all parameters in the training state.
*
* @param trainableOnly - When set to true, the size is calculated for trainable params only. Default value is true.
*/
getParametersSize(trainableOnly: boolean): Promise<number>;

/**
* Copies from a buffer containing parameters to the TrainingSession parameters.
* Copies parameter values from the given array to the training state. Currently, only supporting models with
* parameters of type Float32.
*
* @param buffer - buffer containing parameters
* @param trainableOnly - True if trainable parameters only to be modified, false otherwise.
* @param buffer - Float32 buffer containing parameters converted to a Uint8Array.
* @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true.
*/
loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise<void>;

/**
* Copies from the TrainingSession parameters to a buffer.
* Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning.
* Currently, only supporting models with parameters of type Float32.
*
* @param trainableOnly - True if trainable parameters only to be copied, false otherwise.
* @returns A promise that resolves to a buffer of the requested parameters.
* @param trainableOnly - When set to true, only trainable parameters are copied. Trainable parameters are parameters
* for which requires_grad is set to true. Default value is true.
* @returns A promise that resolves to a Float32 OnnxValue of the requested parameters.
*/
getContiguousParameters(trainableOnly: boolean): Promise<Uint8Array>;
getContiguousParameters(trainableOnly: boolean): Promise<OnnxValue>;
// #endregion

// #region release()
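Taken together, the reworked copy-parameters API can be exercised as below; a minimal sketch assuming the `onnxruntime-web/training` bundle and hypothetical artifact paths (none of these file names come from this diff):

```ts
import {TrainingSession} from 'onnxruntime-web/training';

// Hypothetical training artifacts; substitute paths from your own setup.
const session = await TrainingSession.create({
  checkpointState: 'checkpoint',
  trainModel: 'training_model.onnx',
});

// Total number of trainable parameter elements in the training state.
const size = await session.getParametersSize(true);

// Parameters are returned as a Float32 OnnxValue ...
const params = await session.getContiguousParameters(true);

// ... and are loaded back in as a Uint8Array of exactly 4 * size bytes.
const buffer = new Uint8Array(new Float32Array(size).buffer);
await session.loadParametersBuffer(buffer, true);
```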
1 change: 1 addition & 0 deletions js/web/lib/wasm/jsep/backend-webgpu.ts
@@ -413,6 +413,7 @@ export class WebGpuBackend {
if (!artifact) {
artifact = this.programManager.build(program, normalizedDispatchGroup);
this.programManager.setArtifact(key, artifact);
LOG_DEBUG('info', () => `[artifact] key: ${key}, programName: ${program.name}`);
}

LOG_DEBUG(
5 changes: 3 additions & 2 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -180,7 +180,7 @@ export const createConv2DMatMulProgramInfo =

LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`);

const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0];
const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1;

const tileAOuter = workGroupSize[1] * elementsPerThread[1];
const tileBOuter = workGroupSize[0] * elementsPerThread[0];
@@ -197,7 +197,8 @@
const components = isVec4 ? 4 : 1;
const programUniforms: ProgramUniform[] =
[{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components);
const x =
inputVariable('x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 1 : innerElementSize);
const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components);
const inputVariables = [x, w];

26 changes: 9 additions & 17 deletions js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
@@ -22,7 +22,7 @@
import {TensorView} from '../../../tensor-view';
import {ShapeUtil} from '../../../util';
import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types';
import {createTensorShapeVariables, enableShapesUniforms, getBroadcastDims, IndicesHelper, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common';
import {createTensorShapeVariables, enableShapesUniforms, getBroadcastDims, IndicesHelper, inputVariable, internalVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common';
import {getActivationSnippet, InternalActivationAttributes} from '../fuse-utils';

import {typeSnippet} from './activation_util';
@@ -341,13 +341,8 @@ fn main(@builtin(local_invocation_id) localId : vec3<u32>,
const matMulReadWriteFnSource =
(component: number, hasBias: boolean, applyActivation: string, variables: IndicesHelper[],
batchShapes: Array<readonly number[]>, isChannelsLast = false): string => {
const batchAShape = batchShapes[0];
const batchBShape = batchShapes[1];
const batchShape = batchShapes[2];
const batchVariable = variables[0];
const aVariable = variables[1];
const bVariable = variables[2];
const outputVariable = variables[3];
const [batchAShape, batchBShape, batchShape] = batchShapes;
const [batchVariable, aVariable, bVariable, outputVariable] = variables;
const broadCastADims = getBroadcastDims(batchAShape, batchShape);
const broadCastBDims = getBroadcastDims(batchBShape, batchShape);
const dataType = tensorTypeToWsglStorageType(variables[0].type.tensor);
@@ -434,9 +429,7 @@ export const createMatmulProgramInfo =
const outerDims = reshapedOutputShape ? reshapedOutputShape.slice(0, -2) : outputShape.slice(0, -2);
const enableBatchUniforms = enableShapesUniforms(outerDims.length);
const batchShapeOrRank = enableBatchUniforms ? outerDims.length : outerDims;
const batchDims = inputVariable('batchDims', inputs[0].dataType, batchShapeOrRank, 1, true);
const variables = [batchDims];
const batchShapes = [outerDimsA, outerDimsB, outerDims];
const batchDims = internalVariable('batchDims', inputs[0].dataType, batchShapeOrRank, 1);
const batchSize = ShapeUtil.size(outerDims);

const dimAOuter = aShape[aShape.length - 2];
@@ -469,10 +462,7 @@
const A = inputVariable('a', inputs[0].dataType, aShapeOrRank, components);
const B = inputVariable('b', inputs[1].dataType, bShapeOrRank, components);
const output = outputVariable('result', inputs[0].dataType, outputShapeTemp.length, components);
variables.push(A);
variables.push(B);
variables.push(output);
const inputVariables = [batchDims, A, B];
const inputVariables = [A, B];
const programUniforms: ProgramUniform[] =
[{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
if (enableBatchUniforms) {
@@ -490,8 +480,9 @@

const hasBias = inputs.length > 2;
const {activationFunction, applyActivation} = getActivationSnippet(activationAttributes, output.type.value);
const declareFunctions =
matMulReadWriteFnSource(components, hasBias, applyActivation, variables, batchShapes, isChannelsLast);
const declareFunctions = matMulReadWriteFnSource(
components, hasBias, applyActivation, [batchDims, A, B, output], [outerDimsA, outerDimsB, outerDims],
isChannelsLast);
if (hasBias) {
const biasComponents = isChannelsLast ? components : 1;
inputVariables.push(inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, biasComponents));
Expand All @@ -506,6 +497,7 @@ export const createMatmulProgramInfo =
shaderHelper.registerUniform('dimAOuter', 'i32')
.registerUniform('dimBOuter', 'i32')
.registerUniform('dimInner', 'i32')
.registerInternalVariables(batchDims)
.declareVariables(...inputVariables, output)}
${activationFunction}
${declareFunctions}