add command line parameter in perf test for using remote tensors

intel · Aug 29, 2024 · ef44c87 · ef44c87
2 parents a6004c5 + 966c48a
commit ef44c87
Show file tree

Hide file tree

Showing 103 changed files with 2,847 additions and 1,232 deletions.
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -892,6 +892,8 @@ if (MSVC)
   set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc"
                       "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc"
                APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
+  set_property(SOURCE "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc"
+               APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
 else()
   target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
 endif()

diff --git a/cmake/patches/cutlass/cutlass_3.5.0.patch b/cmake/patches/cutlass/cutlass_3.5.0.patch
@@ -1,13 +1,64 @@
+diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h
+index 4c80f549..34327633 100644
+--- a/examples/41_fused_multi_head_attention/kernel_forward.h
++++ b/examples/41_fused_multi_head_attention/kernel_forward.h
+@@ -221,6 +221,8 @@ struct AttentionKernel {
+     int32_t num_batches = 0;
+     int32_t num_heads = 0;
+
++    bool use_smooth_softmax = false;
++
+     // dropout
+     bool use_dropout = false;
+     unsigned long long dropout_batch_head_rng_offset = 0;
+@@ -897,7 +899,8 @@ struct AttentionKernel {
+           p.num_keys - iter_key_start,
+           iter_key_start == 0,
+           iteratorC_tile_offset,
+-          kSupportsBias ? 1.0f : p.scale);
++          kSupportsBias ? 1.0f : p.scale,
++          p.use_smooth_softmax);
+
+       // Output results to shared-memory
+       int warp_idx_mn_0 = my_warp_id %
+@@ -1166,7 +1169,8 @@ struct AttentionKernel {
+       int max_col,
+       bool is_first,
+       typename WarpIteratorC::TensorCoord const& tile_offset,
+-      float scaling) {
++      float scaling,
++      bool use_smooth_softmax) {
+     /* Iterates on the accumulator and corresponding position on result matrix
+
+     (1) Update `mi[r]` to the max value of the row `r`
+@@ -1257,7 +1261,7 @@ struct AttentionKernel {
+       accum_t mi_row, total_row;
+       LambdaIterator::iterateRows(
+           lane_offset,
+-          [&](int accum_m) { mi_row = mi[accum_m]; },
++          [&](int accum_m) { mi_row = mi[accum_m];},
+           [&](int accum_m, int accum_n, int idx) {
+             frag[idx] =
+                 (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
+@@ -1294,7 +1298,7 @@ struct AttentionKernel {
+       for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
+         total_row += addition_storage[id + kQueriesPerBlock * i];
+       }
+-      s_prime[id] = total_row;
++      s_prime[id] = (use_smooth_softmax && (max_col <= kKeysPerBlock)) ? total_row + exp2f(-mi[id]) : total_row;
+     }
+   }
+
 diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
 index 964d2ff3..b366bc14 100644
 --- a/include/cutlass/functional.h
 +++ b/include/cutlass/functional.h
 @@ -39,6 +39,7 @@
  #include "cutlass/numeric_types.h"
- 
+
  #include <cuda_runtime.h>
 +#include <cuda_fp16.h>
- 
+
  #if defined(CUTLASS_ARCH_WMMA_ENABLED)
  #include <mma.h>
 @@ -230,8 +231,12 @@ struct inverse_square_root<half_t> {
@@ -19,7 +70,7 @@ index 964d2ff3..b366bc14 100644
      return reinterpret_cast<half_t const &>(result);
 +#else
 +    return half_t::convert((rsqrtf(half_t::convert(lhs))));
-+#endif    
++#endif
  #else
      return half_t(1.f / std::sqrt(half_t::convert(lhs)));
- #endif
+ #endif
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
@@ -2541,6 +2541,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Rotate using interleaved pattern. Default value is 0 (False).</dd>
 <dt><tt>scale</tt> : float</dt>
 <dd>Custom scale will be used if specified. Default value is 1/sqrt(head_size)</dd>
+<dt><tt>smooth_softmax</tt> : int</dt>
+<dd>Use a smooth factor in softmax.</dd>
 </dl>
 
 #### Inputs (7 - 9)
@@ -3081,6 +3083,8 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Number of top experts to select from expert pool</dd>
 <dt><tt>normalize_routing_weights</tt> : int</dt>
 <dd>Whether to normalize routing weights</dd>
+<dt><tt>use_sparse_mixer</tt> : int</dt>
+<dd>Whether to use sparse mixer</dd>
 </dl>
 
 #### Inputs (5 - 8)
@@ -4396,7 +4400,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 
 ### <a name="com.microsoft.QMoE"></a><a name="com.microsoft.qmoe">**com.microsoft.QMoE**</a>
 
-  Int4 MoE
+  Quantized MoE
 
 #### Version
 
@@ -4407,10 +4411,14 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dl>
 <dt><tt>activation_type</tt> : string</dt>
 <dd>Activation function to use. Choose from relu, gelu, silu and identity. Default is relu</dd>
+<dt><tt>expert_weight_bits</tt> : int</dt>
+<dd>Number of bits used in quantized weights. Default is 4 bits</dd>
 <dt><tt>k</tt> : int</dt>
 <dd>Number of top experts to select from expert pool</dd>
 <dt><tt>normalize_routing_weights</tt> : int</dt>
 <dd>Whether to normalize routing weights</dd>
+<dt><tt>use_sparse_mixer</tt> : int</dt>
+<dd>Whether to use sparse mixer</dd>
 </dl>
 
 #### Inputs (7 - 11)
@@ -4421,19 +4429,19 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>router_probs</tt> : T</dt>
 <dd>2D input tensor with shape (num_rows, num_experts)</dd>
 <dt><tt>fc1_experts_weights</tt> : T1</dt>
-<dd>3D input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
+<dd>3D input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
 <dt><tt>fc1_scales</tt> : T</dt>
 <dd>2D input tensor with shape (num_experts, inter_size)</dd>
 <dt><tt>fc1_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
 <dt><tt>fc2_experts_weights</tt> : T1</dt>
-<dd>3D input tensor with shape (num_experts, inter_size, hidden_size / 2)</dd>
+<dd>3D input tensor with shape (num_experts, inter_size, hidden_size) or (num_experts, inter_size, hidden_size / 2)</dd>
 <dt><tt>fc2_scales</tt> : T</dt>
 <dd>2D input tensor with shape (num_experts, hidden_size)</dd>
 <dt><tt>fc2_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, hidden_size)</dd>
 <dt><tt>fc3_experts_weights</tt> (optional) : T1</dt>
-<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
+<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
 <dt><tt>fc3_scales</tt> (optional) : T</dt>
 <dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
 <dt><tt>fc3_experts_bias</tt> (optional) : T</dt>

diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
@@ -11,6 +11,7 @@
 #include "core/common/logging/logging.h"
 #include "core/common/status.h"
 #include "core/framework/data_transfer.h"
+#include "core/framework/external_data_loader.h"
 #include "core/framework/tensor.h"
 
 namespace onnxruntime {
@@ -88,6 +89,19 @@ class IExecutionProvider {
     return nullptr;
   }
 
+  /**
+   * Returns an external data loader object that implements methods to load data from external sources.
+   *
+   * By default, framework will handle external data loading by loading the data into CPU memory and then copying
+   * it to the target device if required. So in most cases, it's not necessary to override this method. Specifically,
+   * in WebAssembly build, because the memory is limited and Web platform supports loading data from external sources
+   * directly into GPU memory, this method is overridden to provide a custom external data loader to avoid the extra
+   * CPU memory usage.
+   */
+  virtual std::unique_ptr<onnxruntime::IExternalDataLoader> GetExternalDataLoader() const {
+    return nullptr;
+  }
+
   /**
    * Interface for performing kernel lookup within kernel registries.
    * Abstracts away lower-level details about kernel registries and kernel matching.

diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
@@ -49,3 +49,8 @@ static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_con
 // If the value is set to -1, cuda graph capture/replay is disabled in that run.
 // User are not expected to set the value to 0 as it is reserved for internal use.
 static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "gpu_graph_id";
+
+// Specify the type of workload for this run.
+// "Default": OS determines the scheduling priority and processor performance to service this workload. [Default]
+// "Efficient": OS treats this workload as efficiency-oriented, with low scheduling priority and efficient processor performance.
+static const char* const kOrtRunOptionsWorkloadType = "run.workload_type";
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -279,3 +279,8 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // Refer to MatMulNBits op schema for more details.
 // If not provided, default is 4.
 static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";
+
+// Specify the type of workload for this session.
+// "Default": OS determines the scheduling priority and processor performance to service this workload. [Default]
+// "Efficient": OS treats this workload as efficiency-oriented, with low scheduling priority and efficient processor performance.
+static const char* const kOrtSessionOptionsWorkloadType = "session.workload_type";
diff --git a/js/.prettierignore b/js/.prettierignore
@@ -1,7 +1,10 @@
 # ignore generated docs
 web/docs/
 
-# this JSON file is too large, so it takes too long to format it.
+# these files are too large, so it takes too long to format them.
+web/test/data/ops/pad-big.jsonc
+web/test/testdata-config.json
+web/test/ort.test.js
 node/test/testdata/squeezenet.input0.json
 
 # ignore dist folder

diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
@@ -90,10 +90,10 @@ Do not modify directly.*
 | ReduceSum | ai.onnx(1-10,11-12,13+) |  |
 | ReduceSumSquare | ai.onnx(1-10,11-12,13-17,18+) |  |
 | Relu | ai.onnx(6-12,13,14+) |  |
-| Reshape | ai.onnx(5-12,13,14+) | no GPU kernel |
+| Reshape | ai.onnx(5-12,13,14-18,19-20,21+) | no GPU kernel |
 | Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling |
 | RotaryEmbedding | com.microsoft(1+) |  |
-| Shape | ai.onnx(1-12,13-14,15+) | no GPU kernel; an ORT warning is generated - need to fix |
+| Shape | ai.onnx(1-12,13-14,15-18,19-20,21+) | no GPU kernel; an ORT warning is generated - need to fix |
 | Sigmoid | ai.onnx(6-12,13+) |  |
 | SimplifiedLayerNormalization | ai.onnx(1+) |  |
 | Sin | ai.onnx(7+) |  |

diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts
@@ -23,14 +23,6 @@ class TensorViewImpl implements TensorView {
     public readonly dims: readonly number[],
   ) {}
 
-  getUint16Array(): Uint16Array {
-    if (this.dataType !== DataType.float16 && this.dataType !== DataType.uint16) {
-      throw new Error('Invalid data type');
-    }
-    const elementCount = ShapeUtil.size(this.dims);
-    return elementCount === 0 ? new Uint16Array() : new Uint16Array(this.module.HEAP8.buffer, this.data, elementCount);
-  }
-
   getFloat32Array(): Float32Array {
     if (this.dataType !== DataType.float) {
       throw new Error('Invalid data type');
@@ -59,6 +51,14 @@ class TensorViewImpl implements TensorView {
     return elementCount === 0 ? new Int32Array() : new Int32Array(this.module.HEAP8.buffer, this.data, elementCount);
   }
 
+  getUint16Array(): Uint16Array {
+    if (this.dataType !== DataType.float16 && this.dataType !== DataType.uint16) {
+      throw new Error('Invalid data type');
+    }
+    const elementCount = ShapeUtil.size(this.dims);
+    return elementCount === 0 ? new Uint16Array() : new Uint16Array(this.module.HEAP8.buffer, this.data, elementCount);
+  }
+
   reshape(newDims: readonly number[]): TensorView {
     if (ShapeUtil.size(newDims) !== ShapeUtil.size(this.dims)) {
       throw new Error('Invalid new shape');

diff --git a/js/web/lib/wasm/jsep/tensor-view.ts b/js/web/lib/wasm/jsep/tensor-view.ts
@@ -48,6 +48,11 @@ export interface TensorView {
    */
   getInt32Array(): Int32Array;
 
+  /**
+   * get a Uint16Array data view of the tensor data. tensor data must be on CPU.
+   */
+  getUint16Array(): Uint16Array;
+
   /**
    * create a new tensor view with the same data but different dimensions.
    */