Skip to content

Commit

Permalink
add command line parameter in perf test for using remote tensors
Browse files Browse the repository at this point in the history
  • Loading branch information
saurabhkale17 committed Aug 29, 2024
2 parents a6004c5 + 966c48a commit ef44c87
Show file tree
Hide file tree
Showing 103 changed files with 2,847 additions and 1,232 deletions.
2 changes: 2 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,8 @@ if (MSVC)
set_property(SOURCE "${TEST_SRC_DIR}/optimizer/graph_transform_test.cc"
"${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc"
APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
set_property(SOURCE "${TEST_SRC_DIR}/optimizer/qdq_transformer_test.cc"
APPEND PROPERTY COMPILE_OPTIONS "/bigobj")
else()
target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
endif()
Expand Down
59 changes: 55 additions & 4 deletions cmake/patches/cutlass/cutlass_3.5.0.patch
Original file line number Diff line number Diff line change
@@ -1,13 +1,64 @@
diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h
index 4c80f549..34327633 100644
--- a/examples/41_fused_multi_head_attention/kernel_forward.h
+++ b/examples/41_fused_multi_head_attention/kernel_forward.h
@@ -221,6 +221,8 @@ struct AttentionKernel {
int32_t num_batches = 0;
int32_t num_heads = 0;

+ bool use_smooth_softmax = false;
+
// dropout
bool use_dropout = false;
unsigned long long dropout_batch_head_rng_offset = 0;
@@ -897,7 +899,8 @@ struct AttentionKernel {
p.num_keys - iter_key_start,
iter_key_start == 0,
iteratorC_tile_offset,
- kSupportsBias ? 1.0f : p.scale);
+ kSupportsBias ? 1.0f : p.scale,
+ p.use_smooth_softmax);

// Output results to shared-memory
int warp_idx_mn_0 = my_warp_id %
@@ -1166,7 +1169,8 @@ struct AttentionKernel {
int max_col,
bool is_first,
typename WarpIteratorC::TensorCoord const& tile_offset,
- float scaling) {
+ float scaling,
+ bool use_smooth_softmax) {
/* Iterates on the accumulator and corresponding position on result matrix

(1) Update `mi[r]` to the max value of the row `r`
@@ -1257,7 +1261,7 @@ struct AttentionKernel {
accum_t mi_row, total_row;
LambdaIterator::iterateRows(
lane_offset,
- [&](int accum_m) { mi_row = mi[accum_m]; },
+ [&](int accum_m) { mi_row = mi[accum_m];},
[&](int accum_m, int accum_n, int idx) {
frag[idx] =
(accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0);
@@ -1294,7 +1298,7 @@ struct AttentionKernel {
for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) {
total_row += addition_storage[id + kQueriesPerBlock * i];
}
- s_prime[id] = total_row;
+ s_prime[id] = (use_smooth_softmax && (max_col <= kKeysPerBlock)) ? total_row + exp2f(-mi[id]) : total_row;
}
}

diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 964d2ff3..b366bc14 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -39,6 +39,7 @@
#include "cutlass/numeric_types.h"

#include <cuda_runtime.h>
+#include <cuda_fp16.h>

#if defined(CUTLASS_ARCH_WMMA_ENABLED)
#include <mma.h>
@@ -230,8 +231,12 @@ struct inverse_square_root<half_t> {
Expand All @@ -19,7 +70,7 @@ index 964d2ff3..b366bc14 100644
return reinterpret_cast<half_t const &>(result);
+#else
+ return half_t::convert((rsqrtf(half_t::convert(lhs))));
+#endif
+#endif
#else
return half_t(1.f / std::sqrt(half_t::convert(lhs)));
#endif
#endif
16 changes: 12 additions & 4 deletions docs/ContribOperators.md
Original file line number Diff line number Diff line change
Expand Up @@ -2541,6 +2541,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Rotate using interleaved pattern. Default value is 0 (False).</dd>
<dt><tt>scale</tt> : float</dt>
<dd>Custom scale will be used if specified. Default value is 1/sqrt(head_size)</dd>
<dt><tt>smooth_softmax</tt> : int</dt>
<dd>Use a smooth factor in softmax.</dd>
</dl>

#### Inputs (7 - 9)
Expand Down Expand Up @@ -3081,6 +3083,8 @@ This version of the operator has been available since version 1 of the 'com.micr
<dd>Number of top experts to select from expert pool</dd>
<dt><tt>normalize_routing_weights</tt> : int</dt>
<dd>Whether to normalize routing weights</dd>
<dt><tt>use_sparse_mixer</tt> : int</dt>
<dd>Whether to use sparse mixer</dd>
</dl>

#### Inputs (5 - 8)
Expand Down Expand Up @@ -4396,7 +4400,7 @@ This version of the operator has been available since version 1 of the 'com.micr

### <a name="com.microsoft.QMoE"></a><a name="com.microsoft.qmoe">**com.microsoft.QMoE**</a>

Int4 MoE
Quantized MoE

#### Version

Expand All @@ -4407,10 +4411,14 @@ This version of the operator has been available since version 1 of the 'com.micr
<dl>
<dt><tt>activation_type</tt> : string</dt>
<dd>Activation function to use. Choose from relu, gelu, silu and identity. Default is relu</dd>
<dt><tt>expert_weight_bits</tt> : int</dt>
<dd>Number of bits used in quantized weights. Default is 4 bits</dd>
<dt><tt>k</tt> : int</dt>
<dd>Number of top experts to select from expert pool</dd>
<dt><tt>normalize_routing_weights</tt> : int</dt>
<dd>Whether to normalize routing weights</dd>
<dt><tt>use_sparse_mixer</tt> : int</dt>
<dd>Whether to use sparse mixer</dd>
</dl>

#### Inputs (7 - 11)
Expand All @@ -4421,19 +4429,19 @@ This version of the operator has been available since version 1 of the 'com.micr
<dt><tt>router_probs</tt> : T</dt>
<dd>2D input tensor with shape (num_rows, num_experts)</dd>
<dt><tt>fc1_experts_weights</tt> : T1</dt>
<dd>3D input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
<dd>3D input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
<dt><tt>fc1_scales</tt> : T</dt>
<dd>2D input tensor with shape (num_experts, inter_size)</dd>
<dt><tt>fc1_experts_bias</tt> (optional) : T</dt>
<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
<dt><tt>fc2_experts_weights</tt> : T1</dt>
<dd>3D input tensor with shape (num_experts, inter_size, hidden_size / 2)</dd>
<dd>3D input tensor with shape (num_experts, inter_size, hidden_size) or (num_experts, inter_size, hidden_size / 2)</dd>
<dt><tt>fc2_scales</tt> : T</dt>
<dd>2D input tensor with shape (num_experts, hidden_size)</dd>
<dt><tt>fc2_experts_bias</tt> (optional) : T</dt>
<dd>2D optional input tensor with shape (num_experts, hidden_size)</dd>
<dt><tt>fc3_experts_weights</tt> (optional) : T1</dt>
<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size / 2)</dd>
<dd>3D optional input tensor with shape (num_experts, hidden_size, inter_size) or (num_experts, hidden_size, inter_size / 2)</dd>
<dt><tt>fc3_scales</tt> (optional) : T</dt>
<dd>2D optional input tensor with shape (num_experts, inter_size)</dd>
<dt><tt>fc3_experts_bias</tt> (optional) : T</dt>
Expand Down
14 changes: 14 additions & 0 deletions include/onnxruntime/core/framework/execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "core/common/logging/logging.h"
#include "core/common/status.h"
#include "core/framework/data_transfer.h"
#include "core/framework/external_data_loader.h"
#include "core/framework/tensor.h"

namespace onnxruntime {
Expand Down Expand Up @@ -88,6 +89,19 @@ class IExecutionProvider {
return nullptr;
}

/**
* Returns an external data loader object that implements methods to load data from external sources.
*
* By default, framework will handle external data loading by loading the data into CPU memory and then copying
* it to the target device if required. So in most cases, it's not necessary to override this method. Specifically,
* in WebAssembly build, because the memory is limited and Web platform supports loading data from external sources
* directly into GPU memory, this method is overridden to provide a custom external data loader to avoid the extra
* CPU memory usage.
*/
virtual std::unique_ptr<onnxruntime::IExternalDataLoader> GetExternalDataLoader() const {
return nullptr;
}

/**
* Interface for performing kernel lookup within kernel registries.
* Abstracts away lower-level details about kernel registries and kernel matching.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,8 @@ static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_con
// If the value is set to -1, cuda graph capture/replay is disabled in that run.
// User are not expected to set the value to 0 as it is reserved for internal use.
static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "gpu_graph_id";

// Specify the type of workload for this run.
// “Default”: OS determines the scheduling priority and processor performance to service this workload. [Default]
// “Efficient”: OS treats this workload is efficiency oriented with low scheduling priority and efficient processor performance.
static const char* const kOrtRunOptionsWorkloadType = "run.workload_type";
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,8 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
// Refer to MatMulNBits op schema for more details.
// If not provided, default is 4.
static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";

// Specify the type of workload for this session.
// “Default”: OS determines the scheduling priority and processor performance to service this workload. [Default]
// “Efficient”: OS treats this workload is efficiency oriented with low scheduling priority and efficient processor performance.
static const char* const kOrtSessionOptionsWorkloadType = "session.workload_type";
5 changes: 4 additions & 1 deletion js/.prettierignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# ignore generated docs
web/docs/

# this JSON file is too large, so it takes too long to format it.
# these JSON file is too large, so it takes too long to format it.
web/test/data/ops/pad-big.jsonc
web/test/testdata-config.json
web/test/ort.test.js
node/test/testdata/squeezenet.input0.json

# ignore dist folder
Expand Down
4 changes: 2 additions & 2 deletions js/web/docs/webgpu-operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ Do not modify directly.*
| ReduceSum | ai.onnx(1-10,11-12,13+) | |
| ReduceSumSquare | ai.onnx(1-10,11-12,13-17,18+) | |
| Relu | ai.onnx(6-12,13,14+) | |
| Reshape | ai.onnx(5-12,13,14+) | no GPU kernel |
| Reshape | ai.onnx(5-12,13,14-18,19-20,21+) | no GPU kernel |
| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling |
| RotaryEmbedding | com.microsoft(1+) | |
| Shape | ai.onnx(1-12,13-14,15+) | no GPU kernel; an ORT warning is generated - need to fix |
| Shape | ai.onnx(1-12,13-14,15-18,19-20,21+) | no GPU kernel; an ORT warning is generated - need to fix |
| Sigmoid | ai.onnx(6-12,13+) | |
| SimplifiedLayerNormalization | ai.onnx(1+) | |
| Sin | ai.onnx(7+) | |
Expand Down
16 changes: 8 additions & 8 deletions js/web/lib/wasm/jsep/init.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,6 @@ class TensorViewImpl implements TensorView {
public readonly dims: readonly number[],
) {}

getUint16Array(): Uint16Array {
if (this.dataType !== DataType.float16 && this.dataType !== DataType.uint16) {
throw new Error('Invalid data type');
}
const elementCount = ShapeUtil.size(this.dims);
return elementCount === 0 ? new Uint16Array() : new Uint16Array(this.module.HEAP8.buffer, this.data, elementCount);
}

getFloat32Array(): Float32Array {
if (this.dataType !== DataType.float) {
throw new Error('Invalid data type');
Expand Down Expand Up @@ -59,6 +51,14 @@ class TensorViewImpl implements TensorView {
return elementCount === 0 ? new Int32Array() : new Int32Array(this.module.HEAP8.buffer, this.data, elementCount);
}

getUint16Array(): Uint16Array {
if (this.dataType !== DataType.float16 && this.dataType !== DataType.uint16) {
throw new Error('Invalid data type');
}
const elementCount = ShapeUtil.size(this.dims);
return elementCount === 0 ? new Uint16Array() : new Uint16Array(this.module.HEAP8.buffer, this.data, elementCount);
}

reshape(newDims: readonly number[]): TensorView {
if (ShapeUtil.size(newDims) !== ShapeUtil.size(this.dims)) {
throw new Error('Invalid new shape');
Expand Down
5 changes: 5 additions & 0 deletions js/web/lib/wasm/jsep/tensor-view.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ export interface TensorView {
*/
getInt32Array(): Int32Array;

/**
* get a Uint16Array data view of the tensor data. tensor data must be on CPU.
*/
getUint16Array(): Uint16Array;

/**
* create a new tensor view with the same data but different dimensions.
*/
Expand Down
Loading

0 comments on commit ef44c87

Please sign in to comment.