Merge branch 'microsoft:main' into main

luoyu-intel · Oct 30, 2023 · 127b30b · 127b30b
2 parents 0c1d318 + 8ebdd3b
commit 127b30b
Show file tree

Hide file tree

Showing 118 changed files with 6,908 additions and 1,366 deletions.
diff --git a/.github/stale.yml b/.github/stale.yml
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
@@ -0,0 +1,34 @@
+name: Close stale issues
+on:
+  # Allows you to dictate when you want this workflow to run using cron syntax (times in UTC)
+  schedule:
+    - cron: "0 15 * * *"
+  # Allows you to run this workflow manually from the Actions tab
+  # workflow_dispatch:
+
+jobs:
+  close-stale-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/[email protected]
+        with:
+          # Comma separated list of labels that can be assigned to issues to exclude them from being marked as stale
+          exempt-issue-labels: contributions welcome, feature request, regression
+          # Number of days without activity before the actions/stale action labels an issue
+          days-before-issue-stale: 30
+          # Number of days without activity before the actions/stale action closes an issue
+          days-before-issue-close: 7
+          # Label you want to apply to issues that have been inactive for the amount of time specified by days-before-issue-stale
+          stale-issue-label: "stale"
+          # Comment that you want to add to issues that are labeled by the actions/stale action
+          stale-issue-message: "This issue has been automatically marked as stale due to inactivity and will be closed in 7 days if no further activity occurs. If further support is needed, please provide an update and/or more details."
+          # Comment that you want to add to issues that are closed by the actions/stale action
+          close-issue-message: "This issue has been automatically closed due to inactivity. Please reactivate if further support is needed."
+          # If you never want this action to label PRs, set this value to -1
+          days-before-pr-stale: -1
+          # If you never want this action to close PRs, set this value to -1
+          days-before-pr-close: -1
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
@@ -206,7 +206,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "0462dc31ae78f48744b6141ae376df1f96d3f459",
+          "commitHash": "a43ce67187bab219520fd80f21af8bbd4354bc8c",
           "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git"
         },
         "comments": "onnx_tensorrt"

diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -26,7 +26,7 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
 onnx;https://github.com/onnx/onnx/archive/6a20ba82b439ea1fd650da4d389e96b60a1dd828.zip;179a22ad4cd67109c60031ae4b6cf2f434d8bd7e
 #use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459)
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/0462dc31ae78f48744b6141ae376df1f96d3f459.zip;5ff086361956cceb81ed17453a1fd8db2aa4328d
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
 protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
 protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -38,6 +38,8 @@
         "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/sharding.cc"
         "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_matmul.cc"
         "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_slice.cc"
+        "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reshape.cc"
+        "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_expand.cc"
       )
     endif()
     # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
@@ -246,4 +248,4 @@
   install(TARGETS onnxruntime_providers_cuda
           ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
           LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-          RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR})
+          RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR})
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
@@ -387,6 +387,9 @@ if (onnxruntime_ENABLE_TRAINING)
   file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_fused_ops_srcs CONFIGURE_DEPENDS
     "${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/*"
   )
+  file(GLOB onnxruntime_python_ortmodule_graph_optimizers_srcs CONFIGURE_DEPENDS
+    "${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/graph_optimizers/*"
+  )
   file(GLOB onnxruntime_python_ort_triton_srcs CONFIGURE_DEPENDS
     "${ORTTRAINING_SOURCE_DIR}/python/training/ort_triton/*.py"
   )
@@ -741,6 +744,7 @@ if (onnxruntime_ENABLE_TRAINING)
     COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils
     COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator
     COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops
+    COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers
     COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton
     COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/kernel
     COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils
@@ -794,6 +798,9 @@ if (onnxruntime_ENABLE_TRAINING)
     COMMAND ${CMAKE_COMMAND} -E copy
         ${onnxruntime_python_ortmodule_torch_cpp_ext_fused_ops_srcs}
         $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/
+    COMMAND ${CMAKE_COMMAND} -E copy
+        ${onnxruntime_python_ortmodule_graph_optimizers_srcs}
+        $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers/
     COMMAND ${CMAKE_COMMAND} -E copy
         ${onnxruntime_python_ort_triton_srcs}
         $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/

diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
@@ -48,6 +48,9 @@ set(contrib_ops_excluded_files
   "diffusion/group_norm_impl.cu"
   "diffusion/group_norm_impl.h"
   "diffusion/nhwc_conv.cc"
+  "math/gemm_float8.cc"
+  "math/gemm_float8.cu"
+  "math/gemm_float8.h"
   "quantization/attention_quantization.cc"
   "quantization/attention_quantization.h"
   "quantization/attention_quantization_impl.cu"
@@ -103,6 +106,9 @@ if (NOT onnxruntime_USE_NCCL)
   list(APPEND contrib_ops_excluded_files "collective/sharding.cc")
   list(APPEND contrib_ops_excluded_files "collective/sharding_spec.cc")
   list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc")
+  list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc")
+  list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc")
+  list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc")
 endif()
 
 set(provider_excluded_files

diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
@@ -40,6 +40,7 @@ Do not modify directly.*
   * <a href="#com.microsoft.GatherND">com.microsoft.GatherND</a>
   * <a href="#com.microsoft.Gelu">com.microsoft.Gelu</a>
   * <a href="#com.microsoft.GemmFastGelu">com.microsoft.GemmFastGelu</a>
+  * <a href="#com.microsoft.GemmFloat8">com.microsoft.GemmFloat8</a>
   * <a href="#com.microsoft.GreedySearch">com.microsoft.GreedySearch</a>
   * <a href="#com.microsoft.GridSample">com.microsoft.GridSample</a>
   * <a href="#com.microsoft.GroupNorm">com.microsoft.GroupNorm</a>
@@ -2138,6 +2139,71 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <a name="com.microsoft.GemmFloat8"></a><a name="com.microsoft.gemmfloat8">**com.microsoft.GemmFloat8**</a>
+
+  Generic Gemm for float and float 8.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>activation</tt> : string</dt>
+<dd>Activation function, RELU or GELU or NONE (default).</dd>
+<dt><tt>alpha</tt> : float</dt>
+<dd>Scalar multiplier for the product of input tensors A * B.</dd>
+<dt><tt>beta</tt> : float</dt>
+<dd>Scalar multiplier for the product of input bias C.</dd>
+<dt><tt>dtype</tt> : int</dt>
+<dd>Output Type. Same definition as attribute 'to' for operator Cast.</dd>
+<dt><tt>transA</tt> : int</dt>
+<dd>Whether A should be transposed. Float 8 only supprted transA=0.</dd>
+<dt><tt>transB</tt> : int</dt>
+<dd>Whether B should be transposed. Float 8 only supprted transB=1.</dd>
+</dl>
+
+#### Inputs (2 - 6)
+
+<dl>
+<dt><tt>A</tt> : TA</dt>
+<dd>Input tensor A. The shape of A should be (M, K) if transA is 0, or (K, M) if transA is non-zero.</dd>
+<dt><tt>B</tt> : TB</dt>
+<dd>Input tensor B. The shape of B should be (K, N) if transB is 0, or (N, K) if transB is non-zero.</dd>
+<dt><tt>C</tt> (optional) : TC</dt>
+<dd>Input tensor C.</dd>
+<dt><tt>scaleA</tt> (optional) : TS</dt>
+<dd>Scale of tensor A if A is float 8 tensor</dd>
+<dt><tt>scaleB</tt> (optional) : TS</dt>
+<dd>Scale of tensor B if B is float 8 tensor</dd>
+<dt><tt>scaleY</tt> (optional) : TS</dt>
+<dd>Scale of the output tensor if A or B is float 8.</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : TR</dt>
+<dd>Output tensor of shape (M, N).</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>TA</tt> : tensor(float8e4m3fn), tensor(float8e5m2), tensor(float16), tensor(bfloat16), tensor(float)</dt>
+<dd>Constrain type to input A.</dd>
+<dt><tt>TB</tt> : tensor(float8e4m3fn), tensor(float8e5m2), tensor(float16), tensor(bfloat16), tensor(float)</dt>
+<dd>Constrain type to input B.</dd>
+<dt><tt>TC</tt> : tensor(float16), tensor(bfloat16), tensor(float)</dt>
+<dd>Constrain type to input C.</dd>
+<dt><tt>TR</tt> : tensor(float8e4m3fn), tensor(float8e5m2), tensor(float16), tensor(bfloat16), tensor(float)</dt>
+<dd>Constrain type to result type.</dd>
+<dt><tt>TS</tt> : tensor(float)</dt>
+<dd>Constrain type for all input scales (scaleA, scaleB, scaleY).</dd>
+</dl>
+
+
 ### <a name="com.microsoft.GreedySearch"></a><a name="com.microsoft.greedysearch">**com.microsoft.GreedySearch**</a>
 
   Greedy Search for text generation.