From 78280bf565082141693008d8ca58805697fe45f4 Mon Sep 17 00:00:00 2001
From: JiCheng
Date: Fri, 3 Feb 2023 11:01:20 +0800
Subject: [PATCH 01/68] fix build error when minimal_build is combined with disable_exceptions flags (#14524)

### Description
If we set the flag 'disable_exceptions' to build ORT,
`onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc.o`
doesn't generate the following symbols, which are used by qlinear_pool.cc:
```
0000000000000000 W _ZN11onnxruntime7contrib27ComputeQLinearGlobalAvgPoolIaEENS_6common6StatusEPKT_fS4_PS4_fS4_lllbPNS_11concurrency10ThreadPoolE
0000000000000000 W _ZN11onnxruntime7contrib27ComputeQLinearGlobalAvgPoolIhEENS_6common6StatusEPKT_fS4_PS4_fS4_lllbPNS_11concurrency10ThreadPoolE
```
so we get undefined-symbol errors for the int8_t and uint8_t instantiations of ComputeQLinearGlobalAvgPool.

### Motivation and Context
---
 .../qlinear_global_average_pool.cc | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
index 7eab6986930e3..e9924bf616eb5 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
@@ -55,6 +55,38 @@ Status ComputeQLinearGlobalAvgPool(
   return Status::OK();
 }
 
+// GCC's unexplained behavior:
+// GCC does not generate the corresponding symbols for the function instantiations below when "--disable-exceptions"
+// and "--minimal-build" are combined in a Linux build.
+// But these two symbols are required by qlinear_pool.cc.
+// Other compilers do not hit this and work fine, and we did not see it on other platforms, such as Android.
+// So we do explicit instantiation here to keep every compiler/platform happy.
+template Status ComputeQLinearGlobalAvgPool<int8_t>(
+    const int8_t* x,
+    float x_scale,
+    int8_t x_zero_point,
+    int8_t* y,
+    float y_scale,
+    int8_t y_zero_point,
+    int64_t N,
+    int64_t C,
+    int64_t image_size,
+    bool channels_last,
+    concurrency::ThreadPool* tp);
+
+template Status ComputeQLinearGlobalAvgPool<uint8_t>(
+    const uint8_t* x,
+    float x_scale,
+    uint8_t x_zero_point,
+    uint8_t* y,
+    float y_scale,
+    uint8_t y_zero_point,
+    int64_t N,
+    int64_t C,
+    int64_t image_size,
+    bool channels_last,
+    concurrency::ThreadPool* tp);
+
 Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const {
   const auto tensor_x_scale = context->Input<Tensor>(1);
   const auto tensor_x_zero_point = context->Input<Tensor>(2);

From 7b75ebdb31ddff0cb7c210d1909be7e116b0c077 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 3 Feb 2023 03:16:37 +0000
Subject: [PATCH 02/68] Bump http-cache-semantics from 4.1.0 to 4.1.1 in /js/web (#14535)

---
 js/web/package-lock.json | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index 72fbdd07bffc6..186fee0dcfd54 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -3018,9 +3018,9 @@
       }
     },
     "node_modules/http-cache-semantics": {
-      "version": "4.1.0",
-      "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.0.tgz",
-      "integrity": "sha512-carPklcUh7ROWRK7Cv27RPtdhYhUsela/ue5/jKzjegVvXDqM2ILE9Q2BGn9JZJh1g87cp56su/FgQSzcWS8cQ==",
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz",
+      "integrity": "sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==",
       "dev": true
     },
     "node_modules/http-errors": {
@@ -9501,9 +9501,9 @@
       }
     },
     "http-cache-semantics": {
-      "version": "4.1.0",
-      "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.0.tgz",
-      "integrity": "sha512-carPklcUh7ROWRK7Cv27RPtdhYhUsela/ue5/jKzjegVvXDqM2ILE9Q2BGn9JZJh1g87cp56su/FgQSzcWS8cQ==",
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz",
+      "integrity": "sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ==",
       "dev": true
     },
     "http-errors": {

From 1059cf6d986787716081404f6b0a15c309becf46 Mon Sep 17 00:00:00 2001
From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com>
Date: Fri, 3 Feb 2023 13:34:59 +0800
Subject: [PATCH 03/68] [ROCm] Fix ROCm build issue caused by REMOVE_ITEM incorrect path (#14534)

### Description
Fix REMOVE_ITEM, which was not working.
`onnxruntime/contrib_ops/rocm/aten_ops/aten_op.cc` is hipified from
`onnxruntime/contrib_ops/cuda/aten_ops/aten_op.cc`. The correct path of the file is
`${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime/contrib_ops/rocm/aten_ops/aten_op.cc`,
and it exists in the hipified source file list `onnxruntime_rocm_generated_contrib_ops_cc_srcs`.

A better way to fix it: if we don't want to build a file, add it to the hipify excluded
files so it will not be hipified.
---
 cmake/onnxruntime_providers.cmake   | 6 ------
 cmake/onnxruntime_rocm_hipify.cmake | 4 ++++
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index fe9e83db6b27c..6697493fbb3c9 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -1352,12 +1352,6 @@ if (onnxruntime_USE_ROCM)
 
   # disable contrib ops conditionally
   if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
-    if (NOT onnxruntime_ENABLE_ATEN)
-      list(REMOVE_ITEM onnxruntime_rocm_contrib_ops_cc_srcs
-        "${ONNXRUNTIME_ROOT}/contrib_ops/rocm/aten_ops/aten_op.cc"
-      )
-    endif()
-
     hipify("onnxruntime/contrib_ops" contrib_ops_excluded_files onnxruntime_rocm_generated_contrib_ops_cc_srcs onnxruntime_rocm_generated_contrib_ops_cu_srcs)
 
     # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio
diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
index d3b8f5ebfcc26..7e0bb9f6fb419 100644
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@@ -95,6 +95,10 @@ set(contrib_ops_excluded_files
   "fused_conv.cc"
 )
 
+if (NOT onnxruntime_ENABLE_ATEN)
+  list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc")
+endif()
+
 set(provider_excluded_files
   "atomic/common.cuh"
   "controlflow/if.cc"

From a6c5ba01851ace73491a3f8edcb5743c747dc78d Mon Sep 17 00:00:00 2001
From: Tianlei Wu
Date: Thu, 2 Feb 2023 23:43:51 -0800
Subject: [PATCH 04/68] Stable Diffusion CUDA Optimizations (#14428)

### Description
Add Stable Diffusion CUDA kernel optimizations. The following are included:
(1) GroupNorm operator. This kernel is from TensorRT 8.5.
(2) BiasSplitGelu operator. This kernel is modified from the SplitGelu kernel of TensorRT 8.5; we added bias to SplitGelu.
(3) NhwcConv operator. This adds support for the NHWC format (the ONNX Conv operator uses the NCHW format).
(4) Update MultiHeadAttention (packed KV and no bias) for cross attention. This avoids transposing KV for the TRT fused cross-attention kernel.
(5) Optimization and benchmark script.

Not included:
(1) Script to convert Conv to NhwcConv in the ONNX graph.
(2) Update of symbolic shape inference for NhwcConv.
(3) SeqLen2Spatial operator.
(4) Documents.

Limitations: the GroupNorm, BiasSplitGelu and NhwcConv kernels are implemented based on Stable Diffusion usage, so they might not be applicable to arbitrary input sizes or dimensions. For example, BiasSplitGelu requires the hidden size to be 2560, 5120 or 10240, and NhwcConv assumes 4D input/weight.

There is a minor increase in binary size. For SM=75 only, the Python package wheel size grows by (33757K - 33640K) = 117 KB. It is possible to move NHWC from a template parameter to the constructor to reduce binary size (with a slight cost in performance).

Note: for RTX 4090/4080/4070 Ti, build with CUDA 11.8 and the latest cuDNN to get the best performance.
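The NhwcConv item above can be made concrete with a small NumPy sketch (illustrative only, not part of this patch): it shows the relationship between the channels-last (NHWC) layout consumed by NhwcConv and the NCHW layout used by the standard ONNX Conv, using a naive stride-1, no-padding reference convolution written just for this example. The weight is kept in the usual (M, C, kH, kW) layout here; whether the operator itself expects a transposed weight layout is not covered in this excerpt.

```python
# Illustrative NumPy sketch (not from the patch): NHWC vs NCHW convolution layouts.
import numpy as np

def conv2d_nchw(x, w):
    # Naive reference convolution, stride 1, no padding.
    # x: (N, C, H, W), w: (M, C, kH, kW) -> (N, M, H-kH+1, W-kW+1)
    n, c, h, ww = x.shape
    m, _, kh, kw = w.shape
    out = np.zeros((n, m, h - kh + 1, ww - kw + 1), dtype=np.float32)
    for i in range(out.shape[2]):
        for j in range(out.shape[3]):
            patch = x[:, :, i:i + kh, j:j + kw]  # (N, C, kH, kW)
            out[:, :, i, j] = np.tensordot(patch, w, axes=([1, 2, 3], [1, 2, 3]))
    return out

def conv2d_nhwc(x, w):
    # Same math with channels-last input, the layout consumed by NhwcConv.
    # x: (N, H, W, C), w: (M, C, kH, kW) -> (N, H-kH+1, W-kW+1, M)
    n, h, ww, c = x.shape
    m, _, kh, kw = w.shape
    out = np.zeros((n, h - kh + 1, ww - kw + 1, m), dtype=np.float32)
    for i in range(out.shape[1]):
        for j in range(out.shape[2]):
            patch = x[:, i:i + kh, j:j + kw, :]  # (N, kH, kW, C)
            out[:, i, j, :] = np.tensordot(patch, w, axes=([3, 1, 2], [1, 2, 3]))
    return out

x = np.random.rand(1, 8, 16, 16).astype(np.float32)  # NCHW input
w = np.random.rand(4, 8, 3, 3).astype(np.float32)    # (M, C, kH, kW)

y_nchw = conv2d_nchw(x, w)
y_nhwc = conv2d_nhwc(x.transpose(0, 2, 3, 1), w)     # feed NHWC, get NHWC back

# Transposing the NHWC result back to NCHW reproduces the standard Conv output.
assert np.allclose(y_nchw, y_nhwc.transpose(0, 3, 1, 2), atol=1e-5)
```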
--- cmake/onnxruntime_rocm_hipify.cmake | 9 + docs/ContribOperators.md | 98 +++- docs/OperatorKernels.md | 3 + .../cpu/bert/multihead_attention_helper.h | 117 +++-- .../cpu/transformers/generation_shared.h | 40 +- .../cuda/bert/add_bias_transpose.cu | 128 +++-- .../cuda/bert/add_bias_transpose.h | 4 + .../contrib_ops/cuda/bert/attention_impl.cu | 142 +++--- .../cuda/bert/multihead_attention.cc | 6 +- .../contrib_ops/cuda/cuda_contrib_kernels.cc | 10 + .../cuda/diffusion/bias_split_gelu.cc | 76 +++ .../cuda/diffusion/bias_split_gelu.h | 23 + .../cuda/diffusion/bias_split_gelu_impl.cu | 89 ++++ .../cuda/diffusion/bias_split_gelu_impl.h | 19 + .../contrib_ops/cuda/diffusion/group_norm.cc | 129 +++++ .../contrib_ops/cuda/diffusion/group_norm.h | 27 + .../cuda/diffusion/group_norm_impl.cu | 475 ++++++++++++++++++ .../cuda/diffusion/group_norm_impl.h | 42 ++ .../contrib_ops/cuda/diffusion/nhwc_conv.cc | 31 ++ onnxruntime/contrib_ops/cuda/fused_conv.cc | 6 +- .../cuda/transformers/dump_cuda_tensor.cc | 2 +- .../core/graph/contrib_ops/bert_defs.cc | 41 +- .../core/graph/contrib_ops/diffusion_defs.cc | 115 +++++ onnxruntime/core/graph/contrib_ops/ms_opset.h | 4 + .../core/providers/cpu/nn/conv_attributes.h | 20 +- .../core/providers/cuda/cudnn_common.cc | 18 +- .../core/providers/cuda/cudnn_common.h | 5 + onnxruntime/core/providers/cuda/nn/conv.cc | 122 +++-- onnxruntime/core/providers/cuda/nn/conv.h | 4 +- .../python/tools/symbolic_shape_infer.py | 100 ++-- .../transformers/fusion_attention_unet.py | 121 ++++- .../transformers/fusion_biassplitgelu.py | 110 ++++ .../tools/transformers/fusion_group_norm.py | 198 ++++++++ .../tools/transformers/fusion_options.py | 51 +- .../python/tools/transformers/fusion_utils.py | 19 +- .../__init__.py | 0 .../models/stable_diffusion/benchmark.py | 244 +++++++++ .../optimize_pipeline.py} | 99 ++-- .../python/tools/transformers/onnx_model.py | 4 + .../tools/transformers/onnx_model_unet.py | 15 +- .../contrib_ops/bias_split_gelu_op_test.cc | 145 ++++++ .../test/contrib_ops/group_norm_op_test.cc | 436 ++++++++++++++++ .../test/contrib_ops/nhwc_conv_op_test.cc | 223 ++++++++ .../transformers/test_attention_fusion.py | 21 +- 44 files changed, 3263 insertions(+), 328 deletions(-) create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.cc create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.h create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.cu create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.h create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/group_norm.h create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h create mode 100644 onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc create mode 100644 onnxruntime/core/graph/contrib_ops/diffusion_defs.cc create mode 100644 onnxruntime/python/tools/transformers/fusion_biassplitgelu.py create mode 100644 onnxruntime/python/tools/transformers/fusion_group_norm.py rename onnxruntime/python/tools/transformers/models/{diffusion => stable_diffusion}/__init__.py (100%) create mode 100755 onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py rename onnxruntime/python/tools/transformers/models/{diffusion/convert_to_fp16.py => stable_diffusion/optimize_pipeline.py} (52%) create mode 100644 onnxruntime/test/contrib_ops/bias_split_gelu_op_test.cc 
create mode 100644 onnxruntime/test/contrib_ops/group_norm_op_test.cc create mode 100644 onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 7e0bb9f6fb419..92a3260714a36 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -27,6 +27,15 @@ set(contrib_ops_excluded_files "bert/tensorrt_fused_multihead_attention/*" "bert/transformer_common.h" "bert/transformer_common.cc" + "diffusion/group_norm.h" + "diffusion/group_norm.cc" + "diffusion/group_norm_impl.cu" + "diffusion/group_norm_impl.h" + "diffusion/bias_split_gelu_impl.h" + "diffusion/bias_split_gelu_impl.cu" + "diffusion/bias_split_gelu.h" + "diffusion/bias_split_gelu.cc" + "diffusion/nhwc_conv.cc" "math/complex_mul.cc" "math/complex_mul.h" "math/complex_mul_impl.cu" diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 1e6d46963cd21..8cd6d4c9e26f1 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -9,6 +9,7 @@ Do not modify directly.* * com.microsoft.BiasDropout * com.microsoft.BiasGelu * com.microsoft.BiasSoftmax + * com.microsoft.BiasSplitGelu * com.microsoft.BifurcationDetector * com.microsoft.BitmaskBiasDropout * com.microsoft.BitmaskDropout @@ -34,6 +35,7 @@ Do not modify directly.* * com.microsoft.GemmFastGelu * com.microsoft.GreedySearch * com.microsoft.GridSample + * com.microsoft.GroupNorm * com.microsoft.Inverse * com.microsoft.Irfft * com.microsoft.LongformerAttention @@ -590,6 +592,39 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.BiasSplitGelu** + + A fusion used in diffusion model that after adding bias, hidden state is sliced into two tensors of same size, then left + tensor multiplies the Gelu activation result of right tensor. + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Inputs + +
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd>Input tensor. Dimensions are (N, S, D), where N is the batch size, S are image size, and D is hidden dimension</dd>
+<dt><tt>bias</tt> : T</dt>
+<dd>Bias tensor. Dimensions are (D), where D is the same hidden dimension as input tensor</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd>The output tensor with dimensions (N, S, D/2)</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float16), tensor(float)</dt>
+<dd>Constrain input X and output Y types to float tensors.</dd>
+</dl>
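As a concrete reference for the BiasSplitGelu semantics above, here is a minimal NumPy sketch (illustrative only, not the CUDA implementation in this patch). It follows the spec: bias is added, the last dimension is split into two halves, and the left half is multiplied by the Gelu of the right half; the erf-based Gelu matches the formula used by the kernel. The example sizes are hypothetical.

```python
# Minimal NumPy reference: Y = left_half(X + bias) * Gelu(right_half(X + bias))
import math
import numpy as np

def gelu(x):
    # Exact (erf-based) Gelu: x * 0.5 * (erf(x / sqrt(2)) + 1)
    return x * 0.5 * (np.vectorize(math.erf)(x / math.sqrt(2.0)) + 1.0)

def bias_split_gelu(x, bias):
    # x: (N, S, D), bias: (D) -> y: (N, S, D / 2)
    x = x + bias
    left, right = np.split(x, 2, axis=-1)
    return left * gelu(right)

n, s, d = 2, 64, 2560  # D = 2560 is one of the hidden sizes the CUDA kernel supports
x = np.random.randn(n, s, d).astype(np.float32)
b = np.random.randn(d).astype(np.float32)
y = bias_split_gelu(x, b)
assert y.shape == (n, s, d // 2)
```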
+ + ### **com.microsoft.BifurcationDetector** Component for aggressive decoding. Find the bifurcation index of predicted tokens, between source tokens, @@ -1811,6 +1846,61 @@ This version of the operator has been available since version 1 of the 'com.micr +### **com.microsoft.GroupNorm** + + Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization (https://arxiv.org/abs/1803.08494). + + This operator transforms input according to + y = gamma * (x - mean) / sqrt(variance + epsilon) + beta + + The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. num_channels must be divisible by num_groups. The mean and standard-deviation are calculated separately over the each group. + The weight and bias are per-channel affine transform parameter vectors of size num_channels. + + The activation attribute can be used to enable activation after group normalization. + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+<dl>
+<dt><tt>activation</tt> : int (required)</dt>
+<dd>Activation after group normalization: 0 for None, 1 for Swish</dd>
+<dt><tt>epsilon</tt> : float</dt>
+<dd>The epsilon value to use to avoid division by zero</dd>
+<dt><tt>groups</tt> : int (required)</dt>
+<dd>The number of groups of channels. It should be a divisor of the number of channels C</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>X</tt> : T</dt>
+<dd>Input data tensor. Dimensions are (N x H x W x C), where N is the batch size, C is the number of channels, and H and W are the height and width of the data</dd>
+<dt><tt>gamma</tt> : M</dt>
+<dd>1D gamma tensor for normalization with shape (C), where C is number of channels</dd>
+<dt><tt>beta</tt> : M</dt>
+<dd>1D beta tensor for normalization with shape (C), where C is number of channels</dd>
+</dl>
+
+#### Outputs
+
+<dl>
+<dt><tt>Y</tt> : T</dt>
+<dd>The output tensor of the same shape as X</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float16), tensor(float)</dt>
+<dd>Constrain input X and output Y types to float tensors.</dd>
+<dt><tt>M</tt> : tensor(float)</dt>
+<dd>Constrain gamma and beta to float tensors.</dd>
+</dl>
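For readers who want to check the formula above, the following is a small NumPy reference for the channels-last layout used by this operator (illustrative only, not the CUDA implementation). It assumes Swish(x) = x * sigmoid(x) for activation = 1, and the channel/group counts in the example are hypothetical.

```python
# NumPy reference for GroupNorm over NHWC input, with optional Swish activation.
import numpy as np

def group_norm_nhwc(x, gamma, beta, groups, epsilon=1e-5, activation=0):
    # x: (N, H, W, C); gamma, beta: (C); C must be divisible by groups.
    n, h, w, c = x.shape
    assert c % groups == 0
    g = x.reshape(n, h, w, groups, c // groups)
    mean = g.mean(axis=(1, 2, 4), keepdims=True)  # statistics per (batch, group)
    var = g.var(axis=(1, 2, 4), keepdims=True)
    g = (g - mean) / np.sqrt(var + epsilon)
    y = g.reshape(n, h, w, c) * gamma + beta       # per-channel affine transform
    if activation == 1:                            # Swish: x * sigmoid(x)
        y = y * (1.0 / (1.0 + np.exp(-y)))
    return y

x = np.random.randn(2, 8, 8, 320).astype(np.float32)
gamma = np.random.randn(320).astype(np.float32)
beta = np.random.randn(320).astype(np.float32)
y = group_norm_nhwc(x, gamma, beta, groups=32, activation=1)
assert y.shape == x.shape
```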
+ + ### **com.microsoft.Inverse** #### Version @@ -2132,16 +2222,16 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Number of attention heads</dd>
 </dl>
 
-#### Inputs (4 - 5)
+#### Inputs (2 - 5)
 
 <dl>
 <dt><tt>query</tt> : T</dt>
 <dd>Query with shape (batch_size, sequence_length, hidden_size)</dd>
 <dt><tt>key</tt> : T</dt>
-<dd>Key with shape (batch_size, kv_sequence_length, hidden_size)</dd>
-<dt><tt>value</tt> : T</dt>
+<dd>Key with shape (batch_size, kv_sequence_length, hidden_size), or packed KV with shape (batch_size, kv_sequence_length, num_heads, 2, head_size)</dd>
+<dt><tt>value</tt> (optional) : T</dt>
 <dd>Value with shape (batch_size, kv_sequence_length, v_hidden_size)</dd>
-<dt><tt>bias</tt> : T</dt>
+<dt><tt>bias</tt> (optional) : T</dt>
 <dd>Bias tensor with shape (hidden_size + hidden_size + v_hidden_size) from input projection</dd>
 <dt><tt>key_padding_mask</tt> (optional) : M</dt>
 <dd>Key padding mask with shape (batch_size) or (batch_size, kv_sequence_length)</dd>
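To illustrate the packed KV format referenced above, here is a small NumPy sketch (illustrative only, not from the patch). Separate key and value projections of shape (batch_size, kv_sequence_length, num_heads, head_size) are interleaved along a new axis of size 2, producing the (batch_size, kv_sequence_length, num_heads, 2, head_size) tensor that the fused cross-attention path can consume without an extra transpose. The K-before-V order along the packed axis is an assumption based on the "KV" naming, and the example dimensions are hypothetical.

```python
# Packing separate K and V into the (B, L, N, 2, H) layout described above.
import numpy as np

b, l, n, h = 2, 77, 8, 64   # batch, kv_sequence_length, num_heads, head_size
k = np.random.randn(b, l, n, h).astype(np.float16)
v = np.random.randn(b, l, n, h).astype(np.float16)

packed_kv = np.stack([k, v], axis=3)  # (B, L, N, 2, H); K at index 0, V at index 1 (assumed)
assert packed_kv.shape == (b, l, n, 2, h)

# Unpacking recovers the original BxLxNxH tensors, e.g. for an unfused kernel.
k2 = packed_kv[:, :, :, 0, :]
v2 = packed_kv[:, :, :, 1, :]
assert np.array_equal(k, k2) and np.array_equal(v, v2)
```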
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 2dc4fbfb790b2..7e4eb38be780b 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -790,6 +790,7 @@ Do not modify directly.* |BiasDropout|*in* data:**T**
*in* bias:**T**
*in* residual:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T2**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |BiasGelu|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| |BiasSoftmax|*in* data:**T**
*in* bias:**T**
*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| +|BiasSplitGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |BitmaskBiasDropout|*in* data:**T**
*in* bias:**T**
*in* residual:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T3**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)
**T3** = tensor(uint32)| |BitmaskDropout|*in* data:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T3**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)
**T3** = tensor(uint32)| |ComplexMul|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float), tensor(float16)| @@ -805,11 +806,13 @@ Do not modify directly.* |Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |GreedySearch|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*out* sequences:**I**|1+|**T** = tensor(float), tensor(float16)| |GridSample|*in* X:**T1**
*in* Grid:**T1**
*out* Y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(float)| +|GroupNorm|*in* X:**T**
*in* gamma:**M**
*in* beta:**M**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |Inverse|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |Irfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |LongformerAttention|*in* input:**T**
*in* weight:**T**
*in* bias:**T**
*in* mask:**T**
*in* global_weight:**T**
*in* global_bias:**T**
*in* global:**G**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |NGramRepeatBlock|*in* input_ids:**Tid**
*in* scores:**T**
*out* scores_out:**T**|1+|**T** = tensor(float)
**Tid** = tensor(int64)| +|NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |QAttention|*in* input:**T1**
*in* weight:**T2**
*in* bias:**T3**
*in* input_scale:**T3**
*in* weight_scale:**T3**
*in* mask_index:**T4**
*in* input_zero_point:**T1**
*in* weight_zero_point:**T2**
*in* past:**T3**
*out* output:**T3**
*out* present:**T3**|1+|**T1** = tensor(int8)
**T2** = tensor(int8)
**T3** = tensor(float), tensor(float16)
**T4** = tensor(int32)| |QOrderedAttention|*in* input:**Q**
*in* scale_input:**S**
*in* scale_Q_gemm:**S**
*in* scale_K_gemm:**S**
*in* scale_V_gemm:**S**
*in* Q_weight:**Q**
*in* K_weight:**Q**
*in* V_weight:**Q**
*in* scale_Q_weight:**S**
*in* scale_K_weight:**S**
*in* scale_V_weight:**S**
*in* Q_bias:**S**
*in* K_bias:**S**
*in* V_bias:**S**
*in* scale_QKT_gemm:**S**
*in* scale_QKT_softmax:**S**
*in* scale_values_gemm:**S**
*in* mask_index:**G**
*in* past:**Q**
*in* extra_add:**S**
*out* output:**Q**|1+|**G** = tensor(int32)
**Q** = tensor(int8)
**S** = tensor(float)| |QOrderedGelu|*in* X:**Q**
*in* scale_X:**S**
*in* scale_Y:**S**
*out* Y:**Q**|1+|**Q** = tensor(int8)
**S** = tensor(float)| diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h index ce109a83720b9..8c3af05972c95 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h @@ -21,11 +21,15 @@ Status CheckInputs(const T* query, int num_heads, float mask_filter_value, int max_threads_per_block) { - // query (Q) : (B, S, D) - // key (K) : (B, L, D) - // value (V) : (B, L, D_v) - // bias (Q/K/V) : (D + D + D_v) - // key_padding_mask (K/V) : (B, L) or (L) + // query (Q) : (B, S, D) + // key (K) : (B, L, D) + // value (V) : (B, L, D_v) + // bias (Q/K/V) : (D + D + D_v) + // key_padding_mask (K/V) : (B) or (B, L) or None + // When packed kv is used: + // key (K) : (B, L, N, 2, H) + // value (V) : None + // bias (Q/K/V) : None const auto& query_dims = query->Shape().GetDims(); if (query_dims.size() != 3) { @@ -34,15 +38,50 @@ Status CheckInputs(const T* query, } const auto& key_dims = key->Shape().GetDims(); - if (key_dims.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key' is expected to have 3 dimensions, got ", + if (key_dims.size() != 3 && key_dims.size() != 5) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key' is expected to have 3 or 5 dimensions, got ", key_dims.size()); } + if (query_dims[0] != key_dims[0]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'key' shall have same dim 0 (batch size)"); + } - const auto& bias_dims = bias->Shape().GetDims(); - if (bias_dims.size() != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ", - bias_dims.size()); + int batch_size = static_cast(query_dims[0]); + int sequence_length = static_cast(query_dims[1]); + int hidden_size = static_cast(query_dims[2]); + int head_size = static_cast(hidden_size) / num_heads; + int kv_sequence_length = static_cast(key_dims[1]); + + if (key_dims.size() == 3) { + if (key_dims[2] != query_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'key' shall have same dim 2 (hidden_size)"); + } + } else // if (key_dims.size() == 5) + { + if (static_cast(key_dims[2]) != num_heads || static_cast(key_dims[3]) != 2 || static_cast(key_dims[4]) != head_size) { + return ORT_MAKE_STATUS( + ONNXRUNTIME, INVALID_ARGUMENT, + "Expect 'key' shape (batch_size, kv_sequence_length, num_heads, 2, head_size) for packed kv"); + } + if (value != nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Expect 'value' be none when 'key' has packed kv format."); + } + } + + if (bias != nullptr) { + const auto& bias_dims = bias->Shape().GetDims(); + if (bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ", + bias_dims.size()); + } + + // Currently, bias is not allowed for packed KV. This constraint can be removed later. + // Here we assume that fusion tool will not include bias for packed KV. + if (value == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "'bias' is not allowed for packed kv. 
"); + } } AttentionMaskType mask_type = AttentionMaskType::MASK_NONE; @@ -61,47 +100,39 @@ Status CheckInputs(const T* query, } } - if (query_dims[0] != key_dims[0]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'query' and 'key' shall have same dim 0 (batch size)"); - } - - int64_t batch_size = query_dims[0]; - int64_t sequence_length = query_dims[1]; - int64_t kv_sequence_length = key_dims[1]; - int64_t q_hidden_size = query_dims[2]; - int64_t v_hidden_size = 0; - - const auto& value_dims = value->Shape().GetDims(); - if (value_dims.size() != 3) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value' is expected to have 3 dimensions, got ", - value_dims.size()); - } + int v_hidden_size = hidden_size; + if (value != nullptr) { + const auto& value_dims = value->Shape().GetDims(); + if (value_dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value' is expected to have 3 dimensions, got ", + value_dims.size()); + } - if (query_dims[0] != value_dims[0]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'query' and 'value' shall have same dim 0 (batch_size)"); - } + if (query_dims[0] != value_dims[0]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'query' and 'value' shall have same dim 0 (batch_size)"); + } - if (key_dims[1] != value_dims[1]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'key' and 'value' shall have same same dim 1 (sequence_length)"); + if (key_dims[1] != value_dims[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'key' and 'value' shall have same same dim 1 (kv_sequence_length)"); + } + v_hidden_size = static_cast(value_dims[2]); } - v_hidden_size = value_dims[2]; if (parameters != nullptr) { AttentionParameters* output_parameters = reinterpret_cast(parameters); - output_parameters->batch_size = static_cast(batch_size); - output_parameters->sequence_length = static_cast(sequence_length); + output_parameters->batch_size = batch_size; + output_parameters->sequence_length = sequence_length; output_parameters->past_sequence_length = 0; - output_parameters->kv_sequence_length = static_cast(kv_sequence_length); - output_parameters->total_sequence_length = static_cast(kv_sequence_length); + output_parameters->kv_sequence_length = kv_sequence_length; + output_parameters->total_sequence_length = kv_sequence_length; output_parameters->max_sequence_length = 0; output_parameters->input_hidden_size = 0; - output_parameters->hidden_size = static_cast(q_hidden_size); - output_parameters->v_hidden_size = static_cast(v_hidden_size); - output_parameters->head_size = static_cast(q_hidden_size) / num_heads; - output_parameters->v_head_size = static_cast(v_hidden_size) / num_heads; + output_parameters->hidden_size = hidden_size; + output_parameters->v_hidden_size = v_hidden_size; + output_parameters->head_size = hidden_size / num_heads; + output_parameters->v_head_size = v_hidden_size / num_heads; output_parameters->num_heads = num_heads; output_parameters->is_unidirectional = false; output_parameters->past_present_share_buffer = false; diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h index cf1d99688546a..630c533c47323 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_shared.h @@ -9,10 +9,6 @@ #include "core/framework/allocator.h" #include "core/framework/ort_value.h" -#ifndef 
NDEBUG -//#define DEBUG_GENERATION 1 // uncomment it for debugging beam search -#endif - namespace onnxruntime { namespace concurrency { @@ -57,14 +53,14 @@ struct IBeamSearchCpuState { template struct IGreedySearchState { - gsl::span sequences_space; // shape (2, batch_size, max_length) - gsl::span sequence_lengths; // shape (batch_size) - gsl::span next_positions; // shape (batch_size, num_beams). Next position value for position_ids. - gsl::span eos_meet; // shape (batch_size) - gsl::span next_token_scores; // shape (batch_size, vocab_size) - gsl::span next_tokens; // shape (batch_size) - gsl::span temp_topk_scores_buffer; // shape (batch_size, parts_of_vocab), temp buffer for topk stage 1 (GPU only) - gsl::span temp_topk_tokens_buffer; // shape (batch_size, parts_of_vocab), temp buffer for topk stage 1(GPU only) + gsl::span sequences_space; // shape (2, batch_size, max_length) + gsl::span sequence_lengths; // shape (batch_size) + gsl::span next_positions; // shape (batch_size, num_beams). Next position value for position_ids. + gsl::span eos_meet; // shape (batch_size) + gsl::span next_token_scores; // shape (batch_size, vocab_size) + gsl::span next_tokens; // shape (batch_size) + gsl::span temp_topk_scores_buffer; // shape (batch_size, parts_of_vocab), temp buffer for topk stage 1 (GPU only) + gsl::span temp_topk_tokens_buffer; // shape (batch_size, parts_of_vocab), temp buffer for topk stage 1(GPU only) gsl::span topk_scores_buffer; // shape (batch_size), output buffer for topk stage 2 (GPU only) gsl::span topk_tokens_buffer; // shape (batch_size), output buffer for topk stage 2 (GPU only) }; @@ -167,6 +163,26 @@ struct IGenerationParameters { bool custom_sampling = false; }; +// #define DEBUG_GENERATION 1 // uncomment it for debugging generation (like beam search etc) +#ifdef DEBUG_GENERATION +#define DUMP_TENSOR_LEVEL 2 +#else +#define DUMP_TENSOR_LEVEL 0 // change it to 1 or 2 if want to enable dumping for code not in generation. +#endif + +#if DUMP_TENSOR_LEVEL > 0 +#define DUMP_TENSOR_INIT() transformers::CudaTensorConsoleDumper dumper +#define DUMP_TENSOR(...) dumper.Print(__VA_ARGS__) +#else +#define DUMP_TENSOR_INIT() +#define DUMP_TENSOR(...) +#endif +#if DUMP_TENSOR_LEVEL > 1 +#define DUMP_TENSOR_D(...) dumper.Print(__VA_ARGS__) +#else +#define DUMP_TENSOR_D(...) +#endif + class IConsoleDumper { public: IConsoleDumper() : is_enabled_(true) {} diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu index b7eebb9d48785..e86736726c224 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu @@ -366,6 +366,39 @@ __global__ void AddBiasTransposeCutlass(const T* input, const T* biases, T* outp } } +template +__global__ void AddBiasUnpack(int M, const T* input, const T* biases, T* output) { + // Format 4 to unpack TRT packed input format for memory efficient attention. 
+ // Input: BxSxNxMxH + // Output: MxBxSxNxH + // B is batch_size, S is sequence_length, M is number of matrices, N is num_heads, H is head_size + int n = threadIdx.y; + int s = blockIdx.x; + int b = blockIdx.y; + int m = blockIdx.z; // matrix id + + const int head_size = blockDim.x; + const int num_heads = blockDim.y; + + const int sequence_length = gridDim.x; + const int batch_size = gridDim.y; + const int H = head_size; + const int NH = num_heads * head_size; + const int NHS = NH * sequence_length; + + int in_offset = m * head_size + n * M * H + (s * NH + b * NHS) * M; + const int out_offset = n * head_size + s * NH + b * NHS + m * NHS * batch_size; + + const int h = threadIdx.x; + if (h < head_size) { + if (biases != nullptr) { + output[out_offset + h] = input[in_offset + h] + biases[m * NH + n * H + h]; + } else { + output[out_offset + h] = input[in_offset + h]; + } + } +} + template __global__ void AddBiasTransposeCutlass(int M, const T* input, const T* biases, T* output) { // Format 3 for cutlass memory efficient attention @@ -481,7 +514,6 @@ __global__ void AddBiasTransposeLarge(const int head_size, const T* input, const } } - template void InvokeAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block, @@ -506,7 +538,9 @@ void InvokeAddBiasTranspose( ORT_ENFORCE(total_matrix_count == 3); AddBiasTransposeCutlass<<>>(input, biases, output, v_head_size); } - } else { // format == 0 + } else if (format == 4) { // format == 4 + AddBiasUnpack<<>>(total_matrix_count, input, biases, output); + } else { // format == 0 AddBiasTranspose<<>>(input, biases, output); } } else { @@ -528,6 +562,8 @@ void InvokeAddBiasTranspose( } else { ORT_THROW("AddBiasTranspose (format 3) not implemented for hidden_size > max_threads_per_block when qk_head_size != v_head_size"); } + } else if (format == 4) { // format == 4 + ORT_THROW("AddBiasTranspose (format 4) not implemented for hidden_size > max_threads_per_block"); } else { // format 0 AddBiasTransposeLarge<<>>(qk_head_size, input, biases, output); } @@ -551,7 +587,7 @@ void LaunchAddBiasTranspose( InvokeAddBiasTranspose(stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, H, input2, biases2, output2, qkv_add_bias2, H_v, total_matrix_count); - } else if (0 == (qk_head_size & 1) && 0 == (v_head_size % 1)) { + } else if (0 == (qk_head_size & 1) && 0 == (v_head_size & 1)) { const int H = qk_head_size / 2; const int H_v = v_head_size / 2; const half2* input2 = reinterpret_cast(input); @@ -610,7 +646,6 @@ void InvokeAddBiasTransposeTrt( const int batch_size, const int sequence_length, const int num_heads, const int head_size, const T* biases, const T* query, const T* key, const T* value, T* output, bool is_cross_attention, int kv_sequence_length) { - if (!is_cross_attention) { ORT_ENFORCE(sequence_length == kv_sequence_length); constexpr int num_matrices = 3; @@ -696,52 +731,51 @@ void LaunchAddBiasTransposeTrt( } } - template void InvokeAddBias( cudaStream_t stream, const int max_threads_per_block, const int batch_size, const int sequence_length, const int kv_sequence_length, const int num_heads, const int head_size, const int v_head_size, const T* biases, const T* query, const T* key, const T* value, T* q, T* k, T* v) { - constexpr int num_matrices = 1; - // Q - { - const dim3 grid(sequence_length, batch_size, num_matrices); - if (head_size * num_heads <= max_threads_per_block) { - const dim3 block(head_size, num_heads, 1); - AddBiasTransposeTrt<<>>(query, 
biases, q); - } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); - AddBiasTransposeTrtLarge<<>>(head_size, query, biases, q); - } + constexpr int num_matrices = 1; + // Q + { + const dim3 grid(sequence_length, batch_size, num_matrices); + if (head_size * num_heads <= max_threads_per_block) { + const dim3 block(head_size, num_heads, 1); + AddBiasTransposeTrt<<>>(query, biases, q); + } else { + const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + AddBiasTransposeTrtLarge<<>>(head_size, query, biases, q); } - // K - { - const dim3 grid(kv_sequence_length, batch_size, num_matrices); - const T* biases_k = biases + num_heads * head_size; + } + // K + { + const dim3 grid(kv_sequence_length, batch_size, num_matrices); + const T* biases_k = biases + num_heads * head_size; - if (head_size * num_heads <= max_threads_per_block) { - const dim3 block(head_size, num_heads, 1); - AddBiasTransposeTrt<<>>(key, biases_k, k); - } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); - AddBiasTransposeTrtLarge<<>>(head_size, key, biases_k, k); - } + if (head_size * num_heads <= max_threads_per_block) { + const dim3 block(head_size, num_heads, 1); + AddBiasTransposeTrt<<>>(key, biases_k, k); + } else { + const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + AddBiasTransposeTrtLarge<<>>(head_size, key, biases_k, k); } + } - // V - { - const dim3 grid(kv_sequence_length, batch_size, num_matrices); + // V + { + const dim3 grid(kv_sequence_length, batch_size, num_matrices); - const T* biases_v = biases + 2 * num_heads * head_size; - if (v_head_size * num_heads <= max_threads_per_block) { - const dim3 block(v_head_size, num_heads, 1); - AddBiasTransposeTrt<<>>(value, biases_v, v); - } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); - AddBiasTransposeTrtLarge<<>>(v_head_size, value, biases_v, v); - } + const T* biases_v = biases + 2 * num_heads * head_size; + if (v_head_size * num_heads <= max_threads_per_block) { + const dim3 block(v_head_size, num_heads, 1); + AddBiasTransposeTrt<<>>(value, biases_v, v); + } else { + const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + AddBiasTransposeTrtLarge<<>>(v_head_size, value, biases_v, v); } + } } template <> @@ -750,7 +784,7 @@ void LaunchAddBias( const int batch_size, const int sequence_length, const int kv_sequence_length, const int num_heads, const int head_size, const int v_head_size, const float* biases, const float* query, const float* key, const float* value, float* q, float* k, float* v) { -if (0 == (head_size % 4) && 0 == (v_head_size % 4)) { + if (0 == (head_size % 4) && 0 == (v_head_size % 4)) { const int H = head_size / 4; const int H_v = v_head_size / 4; const float4* query2 = reinterpret_cast(query); @@ -761,8 +795,8 @@ if (0 == (head_size % 4) && 0 == (v_head_size % 4)) { float4* k2 = reinterpret_cast(k); float4* v2 = reinterpret_cast(v); InvokeAddBias(stream, max_threads_per_block, - batch_size, sequence_length, kv_sequence_length, num_heads, H, H_v, - biases2, query2, key2, value2, q2, k2, v2); + batch_size, sequence_length, kv_sequence_length, num_heads, H, H_v, + biases2, query2, key2, value2, q2, k2, v2); } else if (0 == (head_size & 1) && 0 == (v_head_size & 1)) { const int H = head_size / 2; const int H_v = v_head_size / 2; @@ -774,14 +808,13 @@ if (0 == (head_size % 4) && 0 == (v_head_size % 4)) { float2* k2 = reinterpret_cast(k); float2* v2 = reinterpret_cast(v); 
InvokeAddBias(stream, max_threads_per_block, - batch_size, sequence_length, kv_sequence_length, num_heads, H, H_v, - biases2, query2, key2, value2, q2, k2, v2); + batch_size, sequence_length, kv_sequence_length, num_heads, H, H_v, + biases2, query2, key2, value2, q2, k2, v2); } else { InvokeAddBias(stream, max_threads_per_block, - batch_size, sequence_length, kv_sequence_length, num_heads, head_size, v_head_size, - biases, query, key, value, q, k, v); + batch_size, sequence_length, kv_sequence_length, num_heads, head_size, v_head_size, + biases, query, key, value, q, k, v); } - } template <> @@ -790,8 +823,7 @@ void LaunchAddBias( const int batch_size, const int sequence_length, const int kv_sequence_length, const int num_heads, const int head_size, const int v_head_size, const half* biases, const half* query, const half* key, const half* value, half* q, half* k, half* v) { - - if (0 == (head_size % 4) && 0 == (v_head_size % 4)) { + if (0 == (head_size % 4) && 0 == (v_head_size % 4)) { const int H = head_size / 4; const int H_v = v_head_size / 4; const Half4* query2 = reinterpret_cast(query); diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h index 8cc36637054e7..a2c3265284a4d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.h @@ -24,6 +24,10 @@ namespace cuda { // format 3: (requires sequence_length = kv_sequence_length and qk_head_size = v_head_size when num_matrices == 3) // input: (batch_size, sequence_length, num_matrices, num_heads, head_size) // output: (num_matrices, batch_size, sequence_length, num_heads, head_size) +// format 4: (requires qk_head_size = v_head_size) +// input: (batch_size, sequence_length, num_heads, num_matrices, head_size) +// output: (num_matrices, batch_size, sequence_length, num_heads, head_size) + template void LaunchAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block, diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 187f1bb37edc5..8c7ef9f919519 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -48,21 +48,6 @@ using namespace cub; #define CHECK_CUDA(expr) CUDA_RETURN_IF_ERROR(expr) #define CUDA_MEMORY_ALIGNMENT 256 -#define DUMP_ATTENTION_LEVEL 0 -#if DUMP_ATTENTION_LEVEL > 1 -#define DUMP_ATTENTION_INIT() transformers::CudaTensorConsoleDumper dumper -#define DUMP_ATTENTION(...) dumper.Print(__VA_ARGS__) -#define DUMP_ATTENTION_D(...) dumper.Print(__VA_ARGS__) -#elif DUMP_ATTENTION_LEVEL > 0 -#define DUMP_ATTENTION_INIT() transformers::CudaTensorConsoleDumper dumper -#define DUMP_ATTENTION(...) dumper.Print(__VA_ARGS__) -#define DUMP_ATTENTION_D(...) -#else -#define DUMP_ATTENTION_INIT() -#define DUMP_ATTENTION(...) -#define DUMP_ATTENTION_D(...) -#endif - namespace onnxruntime { namespace contrib { namespace cuda { @@ -283,7 +268,7 @@ Status PrepareQkv(contrib::AttentionParameters& parameters, // Default format for memory efficient attention. // When there is past state, the format shal be BxNxSxH, so we disable memory efficient attention when there is past. - DUMP_ATTENTION_INIT(); + DUMP_TENSOR_INIT(); if (nullptr != data.gemm_buffer) { if (data.bias == nullptr) { // For quantized attention, bias has been added so only need transpose here. 
@@ -317,15 +302,42 @@ Status PrepareQkv(contrib::AttentionParameters& parameters, data.gemm_buffer, data.bias, qkv, true, v_head_size, qkv_add_bias, 3); } - } else { // gemm_buffer == nullptr + } else if (data.value == nullptr) { // gemm_buffer == nullptr and packed kv + // TODO: unpack kv to BNSH for unfused kernel so that we can remove the following constraint. + // CheckInputs verified this constraint. + assert(data.bias == nullptr); + assert(qk_head_size == v_head_size); + + DUMP_TENSOR_D("packed_kv", data.key, batch_size * kv_sequence_length, num_heads, 2, qk_head_size); + + if (use_memory_efficient_attention) { + // unpack kv to BSNH. Note that there is no bias so we need not output query to q. + constexpr int format = 4; + T* qkv_add_bias = nullptr; + const T* kv_bias = (data.bias == nullptr ? data.bias : data.bias + parameters.hidden_size); + LaunchAddBiasTranspose(stream, 2, format, max_threads_per_block, + batch_size, kv_sequence_length, num_heads, qk_head_size, + data.key, kv_bias, k, + true, v_head_size, qkv_add_bias, 2); + DUMP_TENSOR_D("k(BSNH)", k, batch_size * kv_sequence_length, num_heads, qk_head_size); + DUMP_TENSOR_D("v(BSNH)", v, batch_size * kv_sequence_length, num_heads, v_head_size); + qkv_format = AttentionQkvFormat::Q_K_V_BSNH; + } else { + if (data.fused_cross_attention_kernel == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "packed KV format is not implemented for current GPU. Please disable packed kv in fusion options."); + } + + qkv_format = AttentionQkvFormat::Q_KV_BSNH_BSN2H; + } + } else { // gemm_buffer == nullptr and not packed kv assert(data.query != nullptr && data.key != nullptr && data.value != nullptr && data.bias != nullptr); - DUMP_ATTENTION_D("query", data.query, batch_size * sequence_length, num_heads, qk_head_size); - DUMP_ATTENTION_D("query_bias", data.bias, num_heads, qk_head_size); - DUMP_ATTENTION_D("key", data.key, batch_size * kv_sequence_length, num_heads, qk_head_size); - DUMP_ATTENTION_D("key_bias", data.bias + num_heads * qk_head_size, num_heads, qk_head_size); - DUMP_ATTENTION_D("value", data.value, batch_size * kv_sequence_length, num_heads, v_head_size); - DUMP_ATTENTION_D("value_bias", data.bias + 2 * num_heads * qk_head_size, num_heads, v_head_size); + DUMP_TENSOR_D("query", data.query, batch_size * sequence_length, num_heads, qk_head_size); + DUMP_TENSOR_D("query_bias", data.bias, num_heads, qk_head_size); + DUMP_TENSOR_D("key", data.key, batch_size * kv_sequence_length, num_heads, qk_head_size); + DUMP_TENSOR_D("key_bias", data.bias + num_heads * qk_head_size, num_heads, qk_head_size); + DUMP_TENSOR_D("value", data.value, batch_size * kv_sequence_length, num_heads, v_head_size); + DUMP_TENSOR_D("value_bias", data.bias + 2 * num_heads * qk_head_size, num_heads, v_head_size); if (data.fused_cross_attention_kernel != nullptr) { assert(qk_head_size == v_head_size); @@ -347,9 +359,9 @@ Status PrepareQkv(contrib::AttentionParameters& parameters, num_heads, qk_head_size, v_head_size, data.bias, data.query, data.key, data.value, q, k, v); - DUMP_ATTENTION_D("q(BSNH)", q, batch_size * sequence_length, num_heads, qk_head_size); - DUMP_ATTENTION_D("k(BSNH)", k, batch_size * kv_sequence_length, num_heads, qk_head_size); - DUMP_ATTENTION_D("v(BSNH)", v, batch_size * kv_sequence_length, num_heads, v_head_size); + DUMP_TENSOR_D("q(BSNH)", q, batch_size * sequence_length, num_heads, qk_head_size); + DUMP_TENSOR_D("k(BSNH)", k, batch_size * kv_sequence_length, num_heads, qk_head_size); + DUMP_TENSOR_D("v(BSNH)", v, batch_size 
* kv_sequence_length, num_heads, v_head_size); qkv_format = AttentionQkvFormat::Q_K_V_BSNH; } #endif @@ -362,7 +374,7 @@ Status PrepareQkv(contrib::AttentionParameters& parameters, batch_size, sequence_length, num_heads, qk_head_size, data.bias, data.query, data.key, data.value, qkv, false, kv_sequence_length); - DUMP_ATTENTION_D("qkv(BSN3H)", qkv, batch_size, sequence_length, num_heads, 2 * qk_head_size + v_head_size); + DUMP_TENSOR_D("qkv(BSN3H)", qkv, batch_size, sequence_length, num_heads, 2 * qk_head_size + v_head_size); qkv_format = AttentionQkvFormat::QKV_BSN3H; } else { // unfused kernel @@ -387,9 +399,9 @@ Status PrepareQkv(contrib::AttentionParameters& parameters, data.value, data.bias + 2 * num_heads * qk_head_size, v, true, -1); - DUMP_ATTENTION_D("q(BNSH)", q, batch_size * num_heads, sequence_length, qk_head_size); - DUMP_ATTENTION_D("k(BNSH)", k, batch_size * num_heads, kv_sequence_length, qk_head_size); - DUMP_ATTENTION_D("v(BNSH)", v, batch_size * num_heads, kv_sequence_length, v_head_size); + DUMP_TENSOR_D("q(BNSH)", q, batch_size * num_heads, sequence_length, qk_head_size); + DUMP_TENSOR_D("k(BNSH)", k, batch_size * num_heads, kv_sequence_length, qk_head_size); + DUMP_TENSOR_D("v(BNSH)", v, batch_size * num_heads, kv_sequence_length, v_head_size); qkv_format = AttentionQkvFormat::Q_K_V_BNSH; } } @@ -419,8 +431,7 @@ Status QkvToContext( void* fused_runner = data.fused_runner; // At most one fused kernel is enabled. - assert(int(data.use_memory_efficient_attention) + int(fused_runner != nullptr) + - int(data.fused_cross_attention_kernel != nullptr) <= 1); + assert(int(data.use_memory_efficient_attention) + int(fused_runner != nullptr) + int(data.fused_cross_attention_kernel != nullptr) <= 1); const int batches = batch_size * num_heads; const int size_per_batch_q = sequence_length * qk_head_size; @@ -481,7 +492,7 @@ Status QkvToContext( ORT_RETURN_IF_ERROR(LaunchAddBiasTransAppendKvToPresent( stream, parameters.max_sequence_length, parameters.past_sequence_length, sequence_length, batch_size, qk_head_size, num_heads, max_threads_per_block, - use_fused_causal ? nullptr : data.bias, // For fused causal, bias has been added to gemm_buffer + use_fused_causal ? nullptr : data.bias, // For fused causal, bias has been added to gemm_buffer data.gemm_buffer, data.present)); present_size_per_batch_k = parameters.max_sequence_length * qk_head_size; @@ -491,7 +502,7 @@ Status QkvToContext( } // Q, K and V are ready now - DUMP_ATTENTION_INIT(); + DUMP_TENSOR_INIT(); if (data.fused_cross_attention_kernel != nullptr) { assert(qkv_format == AttentionQkvFormat::Q_KV_BSNH_BSN2H); @@ -499,7 +510,7 @@ Status QkvToContext( LaunchTrtSequenceOffset(q_sequence_offset, nullptr, batch_size, sequence_length, stream); CUDA_RETURN_IF_ERROR(cudaGetLastError()); - DUMP_ATTENTION_D("q_sequence_offset", q_sequence_offset, 1, batch_size + 1); + DUMP_TENSOR_D("q_sequence_offset", q_sequence_offset, 1, batch_size + 1); // We only enable fused cross attention when there is no key padding mask. // Otherwise, key have effective batch size 2 * batch_size, which is different from batch_size of query. 
@@ -509,26 +520,34 @@ Status QkvToContext( LaunchTrtSequenceOffset(kv_sequence_offset, data.mask_index, batch_size, kv_sequence_length, stream); CUDA_RETURN_IF_ERROR(cudaGetLastError()); - DUMP_ATTENTION_D("kv_sequence_offset", kv_sequence_offset, 1, batch_size + 1); + DUMP_TENSOR_D("kv_sequence_offset", kv_sequence_offset, 1, batch_size + 1); FusedMultiHeadCrossAttentionKernel const* cross_attention_kernel = reinterpret_cast(data.fused_cross_attention_kernel); + // When there is no bias, we can directly use q and packed kv from inputs. TODO: not need qkv in workspace. + void const* query = q; + void const* packed_kv = k; + if (data.value == nullptr && data.bias == nullptr) { + query = data.query; + packed_kv = data.key; + } + run_fused_cross_attention( - q, // Q - k, // packed KV - q_sequence_offset, // cumulated sequence length of Q - kv_sequence_offset, // cumulated sequence length of KV - data.output, // output - cross_attention_kernel, // kernels - batch_size, // batch size - num_heads, // number of heads - qk_head_size, // head size of Q/K/V - sequence_length, // sequence length of Q - kv_sequence_length, // sequence length of KV + query, // Q + packed_kv, // packed KV + q_sequence_offset, // cumulated sequence length of Q + kv_sequence_offset, // cumulated sequence length of KV + data.output, // output + cross_attention_kernel, // kernels + batch_size, // batch size + num_heads, // number of heads + qk_head_size, // head size of Q/K/V + sequence_length, // sequence length of Q + kv_sequence_length, // sequence length of KV stream); - DUMP_ATTENTION("trt cross output", data.output, batch_size * sequence_length, num_heads, v_head_size); + DUMP_TENSOR("trt cross output", data.output, batch_size * sequence_length, num_heads, v_head_size); return Status::OK(); } @@ -554,11 +573,11 @@ Status QkvToContext( if (use_fused_kernel) { assert(qkv_format == AttentionQkvFormat::QKV_BSN3H); fused_fp16_runner->run(qkv, sequence_offset, data.output, stream); - DUMP_ATTENTION("fused output", data.output, batch_size * sequence_length, num_heads, v_head_size); + DUMP_TENSOR("fused output", data.output, batch_size * sequence_length, num_heads, v_head_size); } else { assert(qkv_format == AttentionQkvFormat::Q_K_V_BNSH_QKV_BS3NH); fused_fp16_runner->run(data.gemm_buffer, sequence_offset, data.output, stream); - DUMP_ATTENTION("fused causal output", data.output, batch_size * sequence_length, num_heads, v_head_size); + DUMP_TENSOR("fused causal output", data.output, batch_size * sequence_length, num_heads, v_head_size); } return Status::OK(); } @@ -570,6 +589,13 @@ Status QkvToContext( assert(data.mask_index == nullptr); assert(qkv_format == AttentionQkvFormat::Q_K_V_BSNH); + const void* query = q; + const void* key = k; + const void* value = v; + if (data.gemm_buffer == nullptr && data.value == nullptr) { // packed KV + query = data.query; + } + MemoryEfficientAttentionParams p; p.sm = device_prop.major * 10 + device_prop.minor; p.is_half = sizeof(T) == 2; @@ -582,15 +608,15 @@ Status QkvToContext( p.causal = parameters.is_unidirectional; p.cu_seqlens_q = nullptr; p.cu_seqlens_k = nullptr; - p.query = q; - p.key = k; - p.value = v; + p.query = query; + p.key = key; + p.value = value; p.output = data.output; p.workspace = MemoryEfficientAttentionParams::need_workspace(v_head_size, sizeof(T) == sizeof(float)) ? 
scratch1 : nullptr; p.stream = stream; run_memory_efficient_attention(p); - DUMP_ATTENTION("cutlass output", data.output, batch_size * sequence_length, num_heads, v_head_size); + DUMP_TENSOR("cutlass output", data.output, batch_size * sequence_length, num_heads, v_head_size); return Status::OK(); } #endif @@ -610,7 +636,7 @@ Status QkvToContext( // For raw attention mask, the scalar 1/sqrt(H) is moved to combine with softmax computation. const float scale = parameters.scale == 0.0f ? 1.f / sqrt(static_cast(qk_head_size)) - : parameters.scale; + : parameters.scale; float alpha = use_raw_attention_mask ? one : scale; cublasSetStream(cublas, stream); @@ -622,7 +648,7 @@ Status QkvToContext( q, qk_head_size, sequence_length * qk_head_size, &zero, scratch1, total_sequence_length, sequence_length * total_sequence_length, batches, device_prop)); - DUMP_ATTENTION_D("QK", scratch1, batch_size * num_heads, sequence_length, total_sequence_length); + DUMP_TENSOR_D("QK", scratch1, batch_size * num_heads, sequence_length, total_sequence_length); const size_t bytes = GetAttentionScratchSize(element_size, batch_size, num_heads, sequence_length, total_sequence_length); @@ -656,7 +682,7 @@ Status QkvToContext( scratch1, scratch2, parameters.is_unidirectional)); } - DUMP_ATTENTION_D("Softmax", scratch2, batch_size * num_heads, sequence_length, total_sequence_length); + DUMP_TENSOR_D("Softmax", scratch2, batch_size * num_heads, sequence_length, total_sequence_length); // compute R*V (as V*R), and store in temp_output (space used by Q): BxNxSxH_v T* temp_output = qkv; @@ -670,7 +696,7 @@ Status QkvToContext( // Temp_output is BxNxSxH_v, transpose to output BxSxNxH_v Status result = LaunchTransCtx(stream, sequence_length, batch_size, v_head_size, num_heads, max_threads_per_block, false, temp_output, data.output); - DUMP_ATTENTION("unfused output", data.output, batch_size * sequence_length, num_heads, v_head_size); + DUMP_TENSOR("unfused output", data.output, batch_size * sequence_length, num_heads, v_head_size); return result; } diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index c7e5d34e1691b..93e5e59ed00ae 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -94,6 +94,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { bool use_fused_cross_attention = !disable_fused_cross_attention_ && nullptr == key_padding_mask && + (value != nullptr || bias == nullptr) && // TODO: new kernel for adding bias to packed KV parameters.hidden_size == parameters.v_hidden_size && has_fused_cross_attention_kernel(sm, parameters.head_size, parameters.kv_sequence_length); @@ -111,6 +112,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { bool use_fused_runner = !disable_fused_runner_ && fused_cross_attention_kernel == nullptr && + value != nullptr && // fused runner requires packed qkv instead of packed kv (nullptr == key_padding_mask || is_mask_1d_seq_len) && parameters.hidden_size == parameters.v_hidden_size && parameters.sequence_length == parameters.kv_sequence_length && @@ -162,10 +164,10 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; AttentionData data; data.gemm_buffer = nullptr; - data.bias = reinterpret_cast(bias->Data()); + data.bias = (nullptr == bias) ? 
nullptr : reinterpret_cast(bias->Data()); data.query = reinterpret_cast(query->Data()); data.key = reinterpret_cast(key->Data()); - data.value = reinterpret_cast(value->Data()); + data.value = (nullptr == value) ? nullptr : reinterpret_cast(value->Data()); data.mask_index = (nullptr == key_padding_mask) ? nullptr : key_padding_mask->Data(); data.mask_index_dims = (nullptr == key_padding_mask) ? gsl::span() : key_padding_mask->Shape().GetDims(); data.past = nullptr; diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index 38bcbc298b939..a239e528af148 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -19,6 +19,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BiasGelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasGelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, BiasGelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasSplitGelu); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, BiasSplitGelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, QuickGelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, QuickGelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, QuickGelu); @@ -71,6 +73,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, EmbedLayerNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, GreedySearch); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, GroupNorm); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, NhwcConv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, NhwcConv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ImageScaler); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ImageScaler); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ImageScaler); @@ -144,6 +149,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -192,6 +199,9 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.cc b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.cc new file mode 100644 index 0000000000000..2b13cdbd803ef --- /dev/null +++ 
b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.cc @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cuda/diffusion/bias_split_gelu.h" +#include "contrib_ops/cuda/diffusion/bias_split_gelu_impl.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + BiasSplitGelu, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + BiasSplitGelu); + +REGISTER_KERNEL_TYPED(MLFloat16); +REGISTER_KERNEL_TYPED(float); + +using namespace ONNX_NAMESPACE; + +template +BiasSplitGelu::BiasSplitGelu(const OpKernelInfo& op_info) : CudaKernel(op_info) { +} + +template +Status BiasSplitGelu::ComputeInternal(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + + const auto& input_dims = input->Shape().GetDims(); + if (input_dims.size() != 3) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 3 dimensions, got ", input_dims.size()); + } + + if (input_dims[2] != 2560 && input_dims[2] != 5120 && input_dims[2] != 10240) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "hidden size should be 2560, 5120 or 10240, got ", input_dims[2]); + } + + const Tensor* bias = context->Input(1); + const auto& bias_dims = bias->Shape().GetDims(); + if (bias_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "bias is expected to have 1 dimensions, got ", bias_dims.size()); + } + if (bias_dims[0] != input_dims[2]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "last dimension of input and bias are not the same"); + } + + TensorShapeVector output_shape = input->Shape().AsShapeVector(); + output_shape[2] = input_dims[2] / 2; + Tensor* output = context->Output(0, output_shape); + + typedef typename ToCudaType::MappedType CudaT; + const int32_t grid_size = static_cast(input_dims[0] * input_dims[1]); + const int32_t half_hidden_size = static_cast(input_dims[2] / 2); + LaunchBiasSplitGeluKernel(Stream(context), grid_size, half_hidden_size, + reinterpret_cast(input->Data()), + reinterpret_cast(bias->Data()), + reinterpret_cast(output->MutableData())); + + CUDA_RETURN_IF_ERROR(cudaPeekAtLastError()); + return Status::OK(); +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.h b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.h new file mode 100644 index 0000000000000..feec45600bbce --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
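
For reviewers who want a quick reference for the fused op registered above: after the bias is added, the hidden dimension D is split in half and the left half is multiplied by the Gelu of the right half, so the output shape is (N, S, D/2). A minimal NumPy sketch of that contract, for illustration only (it is not part of the patch, and bias_split_gelu_ref is just a made-up name):

```
import math
import numpy as np

def bias_split_gelu_ref(x, bias):
    # x: (N, S, D) with even D; bias: (D,). Output: (N, S, D // 2).
    y = x + bias
    left, right = np.split(y, 2, axis=-1)
    erf = np.vectorize(math.erf)
    # Gelu is applied to the right half only: Gelu(v) = v * 0.5 * (erf(v / sqrt(2)) + 1)
    gelu_right = right * 0.5 * (erf(right / math.sqrt(2.0)) + 1.0)
    return left * gelu_right
```
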
+ +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +using namespace onnxruntime::cuda; + +template +class BiasSplitGelu final : public CudaKernel { + public: + BiasSplitGelu(const OpKernelInfo& op_kernel_info); + Status ComputeInternal(OpKernelContext* context) const override; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.cu b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.cu new file mode 100644 index 0000000000000..3cb95dad26b36 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.cu @@ -0,0 +1,89 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// The CUDA kernel is modified from SplitGelu plugin of TensorRT 8.5. +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "core/providers/cuda/cu_inc/common.cuh" +#include "contrib_ops/cuda/diffusion/bias_split_gelu_impl.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +__global__ void biasSplitGeluKernel(T const* input, T const* bias, T* output) { + int32_t index_input = blockIdx.x * HHS * 2 + threadIdx.x; + int32_t index_output = blockIdx.x * HHS + threadIdx.x; + int32_t index_bias = threadIdx.x; + +#pragma unroll + for (int32_t i = 0; i < HHS / TPB; ++i) { +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + auto value_left = (float)(input[index_input] + bias[index_bias]); + auto value_right = (float)(input[index_input + HHS] + bias[index_bias + HHS]); +#else + auto value_left = (float)(input[index_input]) + (float)(bias[index_bias]); + auto value_right = (float)(input[index_input + HHS]) + (float)(bias[index_bias + HHS]); +#endif + // Gelu is applied to right side only: Gelu(x) = x * 0.5 * (erf(x / sqrt(2)) + 1.0) + float gelu_right = value_right * 0.5f * (erff(value_right / 1.41421356237f) + 1.0f); + float result = value_left * gelu_right; + output[index_output] = static_cast(result); + index_input += TPB; + index_output += TPB; + index_bias += TPB; + } + return; +} + +template +void LaunchBiasSplitGeluKernel(cudaStream_t stream, int32_t grid_size, int32_t half_hidden_size, + T const* input, T const* bias, T* output) { + constexpr int32_t TPB = 256; // thread per block + switch (half_hidden_size) { + case 1280: + (biasSplitGeluKernel)<<>>(input, bias, output); + break; + case 2560: + (biasSplitGeluKernel)<<>>(input, bias, output); + break; + case 5120: + (biasSplitGeluKernel)<<>>(input, bias, output); + break; + default: + ORT_NOT_IMPLEMENTED("Not implemented"); + } +} + +template __global__ void biasSplitGeluKernel(float const*, float const*, float*); +template __global__ void biasSplitGeluKernel(float const*, float 
const*, float*); +template __global__ void biasSplitGeluKernel(float const*, float const*, float*); +template __global__ void biasSplitGeluKernel(half const*, half const*, half*); +template __global__ void biasSplitGeluKernel(half const*, half const*, half*); +template __global__ void biasSplitGeluKernel(half const*, half const*, half*); + +template void LaunchBiasSplitGeluKernel(cudaStream_t stream, int32_t grid_size, int32_t half_hidden_size, + float const* input, float const* bias, float* output); + +template void LaunchBiasSplitGeluKernel(cudaStream_t stream, int32_t grid_size, int32_t half_hidden_size, + half const* input, half const* bias, half* output); +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.h b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.h new file mode 100644 index 0000000000000..a04201bd12e3c --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/bias_split_gelu_impl.h @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/common/status.h" +#include + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +template +void LaunchBiasSplitGeluKernel(cudaStream_t stream, int32_t grid_size, int32_t half_hidden_size, + T const* input, T const* bias, T* output); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc new file mode 100644 index 0000000000000..36a2bd11257d6 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.cc @@ -0,0 +1,129 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
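
A small sanity sketch of the indexing used by biasSplitGeluKernel above: each block handles one (batch, position) row, and each of the TPB threads strides across the half hidden size in steps of TPB, which is why only sizes that are multiples of 256 (1280, 2560, 5120) are dispatched. Illustrative Python only, with TPB and HHS mirroring the template parameters:

```
TPB = 256    # threads per block, as in the launcher above
HHS = 1280   # one of the supported half hidden sizes

covered = set()
for tid in range(TPB):              # threads within one block
    idx = tid
    for _ in range(HHS // TPB):     # the unrolled loop in the kernel
        covered.add(idx)
        idx += TPB

assert covered == set(range(HHS))   # every element of the row is visited exactly once
```
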
+ +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cuda/diffusion/group_norm.h" +#include "contrib_ops/cuda/diffusion/group_norm_impl.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define GROUP_NORM_TYPES float, MLFloat16 + +ONNX_OPERATOR_KERNEL_EX( + GroupNorm, kMSDomain, 1, kCudaExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", BuildKernelDefConstraints()), GroupNorm); + +using namespace ONNX_NAMESPACE; + +namespace { +template +struct DispatchGroupNorm { + Status operator()(cudaStream_t stream, + Tensor* output, + const Tensor* input, + const Tensor* gamma, + const Tensor* beta, + void* workspace, + float epsilon, + int batch_size, + int num_channels, + int height, + int width, + int num_groups, + bool use_swish_activation) { + typedef typename ToCudaType::MappedType CudaT; + return LaunchGroupNormKernel( + stream, + reinterpret_cast(output->MutableData()), + reinterpret_cast(input->Data()), + gamma->Data(), + beta->Data(), + workspace, + epsilon, + batch_size, + num_channels, + height, + width, + num_groups, + use_swish_activation); + } +}; + +} // namespace + +GroupNorm::GroupNorm(const OpKernelInfo& op_info) : CudaKernel(op_info) { + epsilon_ = op_info.GetAttrOrDefault("epsilon", 1e-5f); + ORT_ENFORCE(epsilon_ >= 0); + + int64_t num_groups; + ORT_ENFORCE(op_info.GetAttr("groups", &num_groups).IsOK()); + ORT_ENFORCE(num_groups >= 0); + num_groups_ = static_cast(num_groups); + + int64_t activation; + ORT_ENFORCE(op_info.GetAttr("activation", &activation).IsOK()); + ORT_ENFORCE(activation == 0 || activation == 1); // 0 is None, 1 is Swish + use_swish_activation_ = (activation == 1); +} + +Status GroupNorm::ComputeInternal(OpKernelContext* context) const { + const Tensor* input = context->Input(0); + const Tensor* gamma = context->Input(1); + const Tensor* beta = context->Input(2); + Tensor* output = context->Output(0, input->Shape()); + + const auto& input_dims = input->Shape().GetDims(); + if (input_dims.size() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 4 dimensions, got ", input_dims.size()); + } + + const auto& gamma_dims = gamma->Shape().GetDims(); + if (gamma_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "gamma is expected to have 1 dimension, got ", gamma_dims.size()); + } + if (gamma_dims[0] != input_dims[3]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Number of channels in gamma and input does not match"); + } + + const auto& beta_dims = beta->Shape().GetDims(); + if (beta_dims.size() != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "beta is expected to have 1 dimension, got ", beta_dims.size()); + } + if (beta_dims[0] != input_dims[3]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Number of channels in beta and input does not match"); + } + + // Input and output format is NHWC + int batch_size = static_cast(input_dims[0]); + int num_channels = static_cast(input_dims[3]); + int height = static_cast(input_dims[1]); + int width = static_cast(input_dims[2]); + + if (num_channels % num_groups_ != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "number of channels should be divisiable by num_groups"); + } + + auto workspace = GetScratchBuffer(GetGroupNormWorkspaceSizeInBytes(), context->GetComputeStream()); + + utils::MLTypeCallDispatcher dispatcher(input->GetElementType()); + return dispatcher.InvokeRet(Stream(context), output, input, gamma, beta, workspace.get(), + 
epsilon_, + batch_size, + num_channels, + height, + width, + num_groups_, + use_swish_activation_); +} + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm.h b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.h new file mode 100644 index 0000000000000..8578a1642198f --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm.h @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/common/common.h" +#include "core/providers/cuda/cuda_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +using namespace onnxruntime::cuda; + +class GroupNorm final : public CudaKernel { + public: + GroupNorm(const OpKernelInfo& op_kernel_info); + Status ComputeInternal(OpKernelContext* context) const override; + + private: + bool use_swish_activation_; + float epsilon_; + int num_groups_; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu new file mode 100644 index 0000000000000..01ba078b4be77 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.cu @@ -0,0 +1,475 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The CUDA kernel is modified from GroupNorm plugin of TensorRT 8.5 +#include +#include +#include +#include "core/providers/cuda/cuda_common.h" +#include "contrib_ops/cuda/diffusion/group_norm_impl.h" +#include "contrib_ops/cuda/transformers/dump_cuda_tensor.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +static inline int32_t divUp(int32_t m, int32_t n) { + return (m + n - 1) / n; +} + +static inline __device__ __host__ float sigmoid(float x) { + return 1.F / (1.F + expf(-x)); +} + +struct GroupSums { + // Is it the 1st element of the group? + int32_t flag; + // The sum. + float sum; + // The sum of squares. + float sumSq; +}; + +struct GroupSumsOp { + inline __device__ GroupSums operator()(GroupSums const& a, GroupSums const& b) { + GroupSums dst; + dst.sum = b.flag ? b.sum : (a.sum + b.sum); + dst.sumSq = b.flag ? b.sumSq : (a.sumSq + b.sumSq); + dst.flag = a.flag + b.flag; + return dst; + } +}; + +template +struct GroupNormNHWCParams { + // The output buffer. Layout NHWC. + T* dst; + // The input buffer. Layout NHWC. + T const* src; + // The gamma scaling factor. + float const* gamma; + // The beta term to add in GN. + float const* beta; + // The temporary buffer to do the global parallel reduction. Size: + // BLOCKS_PER_BATCH x C x 2. + float* redBuffer; + + // The number of instances in the batch. + int32_t n; + // The height and width of each activation map. + int32_t h; + int32_t w; + // The number of channels. 
+ int32_t c; + // The number of groups. + int32_t groups; + // Do we apply the Swish activation function? + bool withSwish; + + // Precomputed values and parameters to control the execution of the kernels. + + // The number of activations per instance (h * w) and the number of + // activations per block. + int32_t hw; + int32_t hwPerBlock; + // The number of channels per group and blocks per activation in the C + // dimension. + int32_t cPerBlock; + int32_t cPerGroup; + + // The precomputed stride between instances. + int32_t hwc; + // The inverse of hwc in floats (to compute mean/var). + float invHWC; + // The precomputed number of groups per block. + int32_t groupsPerBlock; +}; + +template +inline __device__ void UpdateSum(const T* src, int64_t offset, float& sum, float& sumSq); + +template <> +inline __device__ void UpdateSum(const half* src, int64_t offset, float& sum, float& sumSq) { + // Fetch two channels per thread. + __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); + + float2 f2 = __half22float2(h2); + + // Update the sum. + sum += f2.x + f2.y; + + // Update the sum of squares. + sumSq += f2.x * f2.x + f2.y * f2.y; +} + +template <> +inline __device__ void UpdateSum(const float* src, int64_t offset, float& sum, float& sumSq) { + // Fetch two channels per thread. + float2 f2 = *reinterpret_cast(&src[offset]); + + // Update the sum. + sum += f2.x + f2.y; + + // Update the sum of squares. + sumSq += f2.x * f2.x + f2.y * f2.y; +} + +template +__global__ void groupNormNHWCSumKernel(GroupNormNHWCParams params) { + // The object in charge of doing the sums for the different blocks. + typedef cub::BlockScan BlockScan; + + // Allocate shared memory for BlockScan. + __shared__ typename BlockScan::TempStorage tempStorage; + // Allocate shared memory for the groups. We could reduce the amount of shared + // memory reserved. + __shared__ float2 smem[tTHREADS_PER_BLOCK]; + + // The instance in the batch. + int32_t ni = blockIdx.z; + // The channel loaded by that thread (2 channels per thread for F16x2). + int32_t ci = blockIdx.x * params.cPerBlock + threadIdx.x * 2; + + // The first activation loaded by that block. + int32_t hwBegin = blockIdx.y * params.hwPerBlock; + // The last activation loaded by that block. + int32_t hwEnd = min(hwBegin + params.hwPerBlock, params.hw); + + // The sums. + float sum = 0.F; + float sumSq = 0.F; + + // Iterate over the activations to compute the sums. + if (ci < params.c) { + for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { + // The offset. + int64_t offset = static_cast(ni) * params.hwc + static_cast(hwi) * params.c + ci; + UpdateSum(params.src, offset, sum, sumSq); + } + } + + // The group that thread works on and the channel in the group (modulus). + int32_t gi = threadIdx.x * 2 / params.cPerGroup; + int32_t cj = threadIdx.x * 2 - params.cPerGroup * gi; + + // The data for the summations. + GroupSums inp{cj == 0 ? 1 : 0, sum, sumSq}; + + // Do the segmented scan. + GroupSums out; + BlockScan(tempStorage).InclusiveScan(inp, out, GroupSumsOp()); + + // Store the results for the groups in shared memory (to produce coalesced + // stores later). + if (cj == params.cPerGroup - 2) { //2 channels per thread + smem[gi] = make_float2(out.sum, out.sumSq); + } + + // Make sure the data is in shared memory. + __syncthreads(); + + // The global group index. + int32_t gj = blockIdx.x * params.groupsPerBlock + threadIdx.x; + + // Threads that have nothing left to do, exit. 
+ if (threadIdx.x >= params.groupsPerBlock || gj >= params.groups) { + return; + } + + // The first threads (those storing to global memory, load the values). + float2 sums = smem[threadIdx.x]; + + // Store to global memory. + atomicAdd(¶ms.redBuffer[(2 * ni + 0) * params.groups + gj], sums.x); + atomicAdd(¶ms.redBuffer[(2 * ni + 1) * params.groups + gj], sums.y); +} + +template +void groupNormNHWCSum(GroupNormNHWCParams const& params, cudaStream_t stream) { + // Make sure the values are as we expect. + ORT_ENFORCE(params.c % params.cPerBlock == 0 && params.hw % params.hwPerBlock == 0); + // Make sure a group does not span multiple blocks. + ORT_ENFORCE(params.cPerBlock % params.cPerGroup == 0); + + dim3 grid; + + // The number of blocks to compute all the channels. + grid.x = params.c / params.cPerBlock; + // The number of blocks to compute all the activations in a given instance. + grid.y = divUp(params.hw, params.hwPerBlock); + // The number of instances. + grid.z = params.n; + + switch (params.cPerBlock) { + case 320: + groupNormNHWCSumKernel<<>>(params); + break; + case 480: + groupNormNHWCSumKernel<<>>(params); + break; + case 256: + groupNormNHWCSumKernel<<>>(params); + break; + case 128: + groupNormNHWCSumKernel<<>>(params); + break; + default: + ORT_NOT_IMPLEMENTED("Not implemented"); + } +} + +template +__device__ void computeGroupNorm(const T* src, T* dst, int64_t offset, float mean, float invStdDev, float2& gammaF2, float2& betaF2, bool swish); + +template <> +__device__ void computeGroupNorm(const half* src, half* dst, int64_t offset, float mean, float invStdDev, + float2& gammaF2, float2& betaF2, bool swish) { + // Fetch two channels per thread. + __half2 h2 = *reinterpret_cast<__half2 const*>(&src[offset]); + + // Extract the two half values. + float2 f2 = __half22float2(h2); + + // Normalize the channels. + f2.x = (f2.x - mean) * invStdDev; + f2.y = (f2.y - mean) * invStdDev; + + // Scale by gamma and add beta. + f2.x = gammaF2.x * f2.x + betaF2.x; + f2.y = gammaF2.y * f2.y + betaF2.y; + + // Apply Swish if needed. + if (swish) { + f2.x = f2.x * sigmoid(f2.x); + f2.y = f2.y * sigmoid(f2.y); + } + + *reinterpret_cast<__half2*>(&dst[offset]) = __float22half2_rn(f2); +} + +template <> +__device__ void computeGroupNorm(const float* src, float* dst, int64_t offset, float mean, float invStdDev, + float2& gammaF2, float2& betaF2, bool swish) { + // Fetch two channels per thread. + float2 f2 = *reinterpret_cast(&src[offset]); + + // Normalize the channels. + f2.x = (f2.x - mean) * invStdDev; + f2.y = (f2.y - mean) * invStdDev; + + // Scale by gamma and add beta. + f2.x = gammaF2.x * f2.x + betaF2.x; + f2.y = gammaF2.y * f2.y + betaF2.y; + + // Apply Swish if needed. + if (swish) { + f2.x = f2.x * sigmoid(f2.x); + f2.y = f2.y * sigmoid(f2.y); + } + + *reinterpret_cast(&dst[offset]) = f2; +} + +template +__global__ void groupNormNHWCScaleKernel(GroupNormNHWCParams params) { + // The channel loaded by that thread (2 channels per thread for F16x2). + int32_t ci = blockIdx.x * params.cPerBlock + threadIdx.x * 2; + if (ci >= params.c) { + return; + } + + // The instance in the batch. + int32_t ni = blockIdx.z; + + // The group that thread works on and the channel in the group (modulus). + int32_t gi = ci / params.cPerGroup; + + // Load the sum and sum of squares for the group. + float sum = 0.F, sumSq = 0.F; + if (gi < params.groups) { + sum = params.redBuffer[(2 * ni + 0) * params.groups + gi]; + sumSq = params.redBuffer[(2 * ni + 1) * params.groups + gi]; + } + + // Load gamma/beta. 
+ float2 gammaF2 = *reinterpret_cast(¶ms.gamma[ci]); + float2 betaF2 = *reinterpret_cast(¶ms.beta[ci]); + + // Compute the mean. + float mean = sum * params.invHWC; + // Compute the variance. + float var = sumSq * params.invHWC - (mean * mean); + // Compute the inverse of the stddev. + float invStdDev = var <= 0.F ? 1.F : rsqrtf(var); + + // The first activation loaded by that block. + int32_t hwBegin = blockIdx.y * params.hwPerBlock; + // The last activation loaded by that block. + int32_t hwEnd = min(hwBegin + params.hwPerBlock, params.hw); + + // Iterate over the activations to compute the sums. + for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) { + // The src/dst offset. + int64_t offset = (int64_t)ni * params.hwc + hwi * params.c + ci; + + // Fetch two channels per thread. + computeGroupNorm(params.src, params.dst, offset, mean, invStdDev, gammaF2, betaF2, params.withSwish); + } +} + +template +void groupNormNHWCScale(GroupNormNHWCParams const& params, cudaStream_t stream) { + // Make sure the dimensions are aligned with what we expect. + ORT_ENFORCE(params.c % params.cPerBlock == 0); + // Make sure a group does not span multiple blocks. + ORT_ENFORCE(params.cPerBlock % params.cPerGroup == 0); + + dim3 grid; + + // The number of blocks to compute all the channels. + grid.x = params.c / params.cPerBlock; + // The number of blocks to compute all the activations in a given instance. + grid.y = divUp(params.hw, params.hwPerBlock); + // The number of instances. + grid.z = params.n; + + switch (params.cPerBlock) { + case 320: + groupNormNHWCScaleKernel<<>>(params); + break; + case 480: + groupNormNHWCScaleKernel<<>>(params); + break; + case 256: + groupNormNHWCScaleKernel<<>>(params); + break; + case 128: + groupNormNHWCScaleKernel<<>>(params); + break; + default: + ORT_NOT_IMPLEMENTED("Not implemented"); + } +} + +int32_t findMaxDivisor(int32_t n, int32_t maxAllowedDivisor) { + int32_t maxDivisor = -1; + for (int32_t i = 1; i <= std::sqrt(n); i++) { + if (n % i == 0) { + int32_t divisor1 = n / i; + int32_t divisor2 = i; + + if (divisor1 > maxDivisor && divisor1 < maxAllowedDivisor) { + maxDivisor = divisor1; + } + if (divisor2 > maxDivisor && divisor2 < maxAllowedDivisor) { + maxDivisor = divisor2; + } + } + } + return maxDivisor; +} + +template +Status LaunchGroupNormKernel( + cudaStream_t stream, + T* output, + const T* input, + const float* gamma, + const float* beta, + void* workspace, + float epsilon, + int batch_size, + int num_channels, + int height, + int width, + int num_groups, + bool use_swish_activation) { + if (batch_size > static_cast(kMaxGroupNormBatchSize)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, + "only support batch_size <= 32. Got", batch_size); + } + + if (num_groups != static_cast(kGroupNormNumberOfGroups)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, + "only num_groups=32 is supported. 
Got", num_groups); + } + + GroupNormNHWCParams params; + int32_t cPerBlock = 320; + int32_t maxBlocksPerHW = 1024; + switch (num_channels) { + case 960: + case 1920: + cPerBlock = 480; + break; + case 512: + case 256: + cPerBlock = 256; + break; + case 128: + cPerBlock = 128; + break; + default: + cPerBlock = 320; + } + + params.withSwish = use_swish_activation; + params.dst = output; + params.src = input; + params.gamma = gamma; + params.beta = beta; + params.redBuffer = reinterpret_cast(workspace); + params.n = batch_size; + params.h = height; + params.w = width; + params.c = num_channels; + params.groups = num_groups; + params.hw = params.h * params.w; + const int32_t blocksPerHW = findMaxDivisor(params.hw, maxBlocksPerHW); + params.hwPerBlock = divUp(params.hw, blocksPerHW); + params.cPerBlock = cPerBlock; + params.cPerGroup = params.c / params.groups; + params.hwc = params.hw * params.c; + params.invHWC = 1.F / (float)(params.hw * params.cPerGroup); + params.groupsPerBlock = cPerBlock / params.cPerGroup; + + DUMP_TENSOR_INIT(); + DUMP_TENSOR("input", input, batch_size, num_channels, height * width); + DUMP_TENSOR("gamma", gamma, 1, num_channels); + DUMP_TENSOR("beta", beta, 1, num_channels); + cudaMemsetAsync(params.redBuffer, 0, GetGroupNormWorkspaceSizeInBytes(), stream); + groupNormNHWCSum(params, stream); + DUMP_TENSOR("workspace", params.redBuffer, batch_size, num_groups, 2); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + groupNormNHWCScale(params, stream); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + DUMP_TENSOR("output", output, batch_size, num_channels, height * width); + return Status::OK(); +} + +template Status LaunchGroupNormKernel(cudaStream_t stream, half* output, + const half* input, const float* gamma, const float* beta, void* workspace, + float epsilon, int batch_size, int num_channels, + int height, int width, int num_groups, bool swish); + +template Status LaunchGroupNormKernel(cudaStream_t stream, float* output, + const float* input, const float* gamma, const float* beta, void* workspace, + float epsilon, int batch_size, int num_channels, + int height, int width, int num_groups, bool swish); +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h new file mode 100644 index 0000000000000..c7e9245050ee6 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/group_norm_impl.h @@ -0,0 +1,42 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#include "core/common/common.h" +#include "core/common/status.h" +#include +#include +#include + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +constexpr size_t kMaxGroupNormBatchSize = 32; +constexpr size_t kGroupNormNumberOfGroups = 32; + +constexpr size_t GetGroupNormWorkspaceSizeInBytes() { + // Two buffers for sum and squared sum + return (sizeof(float) * 2) * kMaxGroupNormBatchSize * kGroupNormNumberOfGroups; +} + +template +Status LaunchGroupNormKernel( + cudaStream_t stream, + T* output, // normalized output tensor + const T* input, // input tensor + const float* gamma, // gamma (also known as weight or scale) + const float* beta, // beta (also known as bias) + void* workspace, // Work space + float epsilon, // epsilon used normalization + int batch_size, // N + int num_channels, // C + int height, // H + int width, // W + int num_groups, // number of groups + bool use_swish_activation // Whether there is Swish activation after group normalization +); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc b/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc new file mode 100644 index 0000000000000..79f0a18ba515f --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/diffusion/nhwc_conv.cc @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/span_utils.h" +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/shared_inc/fpgeneric.h" +#include "core/providers/cuda/tensor/slice.h" +#include "core/providers/cuda/nn/conv.h" + +using namespace onnxruntime::common; + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + NhwcConv, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + Conv); + +REGISTER_KERNEL_TYPED(float) +REGISTER_KERNEL_TYPED(MLFloat16) + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index 39c3bb282d912..48881ddca4063 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -9,10 +9,10 @@ namespace contrib { namespace cuda { template -class FusedConv : public onnxruntime::cuda::Conv { +class FusedConv : public onnxruntime::cuda::Conv { public: - using Base = onnxruntime::cuda::Conv; - FusedConv(const OpKernelInfo& info) : onnxruntime::cuda::Conv(info) { + using Base = onnxruntime::cuda::Conv; + FusedConv(const OpKernelInfo& info) : onnxruntime::cuda::Conv(info) { std::string activation; if (info.GetAttr("activation", &activation) == Status::OK() && MapMode(activation) == Status::OK() && diff --git a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc index 6c0f7f69c58a1..741f9ac259da1 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc @@ -11,7 +11,7 @@ namespace contrib { namespace cuda { namespace transformers { -#ifdef DEBUG_GENERATION +#if DUMP_TENSOR_LEVEL > 0 template class PinnedHostBuffer { public: diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index 
b4ad4d64e7ddb..68e3985651123 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -127,32 +127,41 @@ void RestorePaddingTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) void MultiHeadAttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) { // Input 0 (query) has shape (batch_size, sequence_length, hidden_size) - // Input 1 (key) has shape (batch_size, kv_sequence_length, hidden_size) - // Input 2 (value) has shape (batch_size, kv_sequence_length, v_hidden_size) + // Input 1 (key) has shape (batch_size, kv_sequence_length, hidden_size) or (batch_size, kv_sequence_length, num_heads, 2, head_size) + // Input 2 (value) has shape (batch_size, kv_sequence_length, v_hidden_size) or nullptr // Output 0 has shape (batch_size, sequence_length, v_hidden_size) // Type inference ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0); // Shape inference - if (hasInputShape(ctx, 0) && hasInputShape(ctx, 2)) { + if (hasInputShape(ctx, 0)) { auto& query_shape = getInputShape(ctx, 0); auto& query_dims = query_shape.dim(); if (query_dims.size() != 3) { fail_shape_inference("Inputs 0 (query) shall be 3 dimensions"); } - auto& value_shape = getInputShape(ctx, 2); - auto& value_dims = value_shape.dim(); - if (value_dims.size() != 3) { - fail_shape_inference("Inputs 2 (value) shall be 3 dimensions"); + if (hasInputShape(ctx, 2)) { + auto& value_shape = getInputShape(ctx, 2); + auto& value_dims = value_shape.dim(); + if (value_dims.size() != 3) { + fail_shape_inference("Inputs 2 (value) shall be 3 dimensions"); + } + + ONNX_NAMESPACE::TensorShapeProto output_shape; + *output_shape.add_dim() = query_dims[0]; + *output_shape.add_dim() = query_dims[1]; + *output_shape.add_dim() = value_dims[2]; + updateOutputShape(ctx, 0, output_shape); } - ONNX_NAMESPACE::TensorShapeProto output_shape; - *output_shape.add_dim() = query_dims[0]; - *output_shape.add_dim() = query_dims[1]; - *output_shape.add_dim() = value_dims[2]; - updateOutputShape(ctx, 0, output_shape); + if (hasInputShape(ctx, 1)) { + auto& key_shape = getInputShape(ctx, 1); + if (key_shape.dim().size() == 5) { + ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput(ctx); + } + } } } @@ -287,16 +296,18 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "T") .Input(1, "key", - "Key with shape (batch_size, kv_sequence_length, hidden_size)", + "Key with shape (batch_size, kv_sequence_length, hidden_size), or packed KV with shape (batch_size, kv_sequence_length, num_heads, 2, head_size)", "T") .Input(2, "value", "Value with shape (batch_size, kv_sequence_length, v_hidden_size)", - "T") + "T", + OpSchema::Optional) .Input(3, "bias", "Bias tensor with shape (hidden_size + hidden_size + v_hidden_size) from input projection", - "T") + "T", + OpSchema::Optional) .Input(4, "key_padding_mask", "Key padding mask with shape (batch_size) or (batch_size, kv_sequence_length)", diff --git a/onnxruntime/core/graph/contrib_ops/diffusion_defs.cc b/onnxruntime/core/graph/contrib_ops/diffusion_defs.cc new file mode 100644 index 0000000000000..14a267357371d --- /dev/null +++ b/onnxruntime/core/graph/contrib_ops/diffusion_defs.cc @@ -0,0 +1,115 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
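
To make the new packed-KV input of MultiHeadAttention above concrete: separate K and V projections of shape (batch_size, kv_sequence_length, hidden_size) can be interleaved per head into the 5-D layout (batch_size, kv_sequence_length, num_heads, 2, head_size) that input 1 now accepts. A hedged NumPy sketch; the schema text only fixes the overall shape, so placing K at index 0 and V at index 1 of the size-2 axis is an assumption here:

```
import numpy as np

def pack_kv(key, value, num_heads):
    # key, value: (B, S_kv, num_heads * head_size) -> (B, S_kv, num_heads, 2, head_size)
    b, s, hidden = key.shape
    head_size = hidden // num_heads
    k = key.reshape(b, s, num_heads, 1, head_size)
    v = value.reshape(b, s, num_heads, 1, head_size)
    return np.concatenate([k, v], axis=3)

packed = pack_kv(np.zeros((2, 77, 320)), np.zeros((2, 77, 320)), num_heads=8)
assert packed.shape == (2, 77, 8, 2, 40)
```
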
+ +#include "core/graph/constants.h" +#include "core/graph/contrib_ops/contrib_defs.h" +#include "core/graph/contrib_ops/onnx_function_util.h" +#include "core/graph/contrib_ops/shape_inference_functions.h" + +// Suppress a warning: global initializer calls a non-constexpr function 'symbol' which is from +// ONNX_OPERATOR_SET_SCHEMA_EX macro and only happens in debug build +#if defined(_WIN32) && !defined(NDEBUG) +#pragma warning(disable : 26426) +#endif + +namespace onnxruntime { +namespace contrib { +using ONNX_NAMESPACE::AttributeProto; +using ONNX_NAMESPACE::OpSchema; +using ONNX_NAMESPACE::TensorShapeProto; +#ifndef NDEBUG +using ONNX_NAMESPACE::DbgOperatorSetTracker; +#endif + +constexpr const char* GroupNorm_ver1_doc = R"DOC( +Applies Group Normalization over a mini-batch of inputs as described in the paper Group Normalization (https://arxiv.org/abs/1803.08494). + +This operator transforms input according to + y = gamma * (x - mean) / sqrt(variance + epsilon) + beta + +The input channels are separated into num_groups groups, each containing num_channels / num_groups channels. num_channels must be divisible by num_groups. The mean and standard-deviation are calculated separately over the each group. +The weight and bias are per-channel affine transform parameter vectors of size num_channels. + +The activation attribute can be used to enable activation after group normalization. +)DOC"; + +ONNX_MS_OPERATOR_SET_SCHEMA( + GroupNorm, 1, + OpSchema() + .SetDoc(GroupNorm_ver1_doc) + .Attr("epsilon", "The epsilon value to use to avoid division by zero", AttributeProto::FLOAT, static_cast(1e-5)) + .Attr("groups", + "The number of groups of channels. It should be a divisor of the number of channels C", + AttributeProto::INT) + .Attr("activation", + "Activation after group normalization: 0 for None, 1 for Swish", + AttributeProto::INT) + .Input(0, + "X", + "Input data tensor. Dimensions are (N x H x W x C), where N is the batch size, C is the number of channels, and H and W are the height and width of the data", + "T") + .Input(1, + "gamma", + "1D gamma tensor for normalization with shape (C), where C is number of channels", + "M") + .Input(2, + "beta", + "1D beta tensor for normalization with shape (C), where C is number of channels", + "M") + .Output(0, + "Y", + "The output tensor of the same shape as X", + "T") + .TypeConstraint("T", {"tensor(float16)", "tensor(float)"}, "Constrain input X and output Y types to float tensors.") + .TypeConstraint("M", {"tensor(float)"}, "Constrain gamma and beta to float tensors.") + .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); + +constexpr const char* BiasSplitGelu_ver1_doc = R"DOC( +A fusion used in diffusion model that after adding bias, hidden state is sliced into two tensors of same size, then left +tensor multiplies the Gelu activation result of right tensor. +)DOC"; + +ONNX_MS_OPERATOR_SET_SCHEMA( + BiasSplitGelu, 1, + OpSchema() + .SetDoc(BiasSplitGelu_ver1_doc) + .Input(0, + "X", + "Input tensor. Dimensions are (N, S, D), where N is the batch size, S are image size, and D is hidden dimension", + "T") + .Input(1, + "bias", + "Bias tensor. 
Dimensions are (D), where D is the same hidden dimension as input tensor", + "T") + .Output(0, + "Y", + "The output tensor with dimensions (N, S, D/2)", + "T") + .TypeConstraint("T", {"tensor(float16)", "tensor(float)"}, "Constrain input X and output Y types to float tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + if (hasInputShape(ctx, 0) && hasInputShape(ctx, 1)) { + auto& input_shape = getInputShape(ctx, 0); + if (input_shape.dim().size() != 3) { + fail_shape_inference("input shall be 3 dimensions"); + } + + auto& bias_shape = getInputShape(ctx, 1); + if (bias_shape.dim().size() != 1) { + fail_shape_inference("bias shall be 1 dimension"); + } + + TensorShapeProto output_shape; + *output_shape.add_dim() = input_shape.dim(0); + *output_shape.add_dim() = input_shape.dim(1); + if (bias_shape.dim(0).has_dim_value()) { + output_shape.add_dim()->set_dim_value(bias_shape.dim(0).dim_value() / 2); + } else { + output_shape.add_dim(); + } + + updateOutputShape(ctx, 0, output_shape); + } + })); +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index 1f0af31a4bdd0..a511d01fe1624 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -49,6 +49,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BeamSearch); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasDropout); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BitmaskBiasDropout); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasGelu); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasSplitGelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BiasSoftmax); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, BifurcationDetector); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, CDist); @@ -69,6 +70,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Gelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, QuickGelu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GreedySearch); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GridSample); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GroupNorm); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Inverse); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Irfft); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, IsAllFinite); @@ -135,6 +137,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); @@ -155,6 +158,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/providers/cpu/nn/conv_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_attributes.h index 51a1e7acafe11..b31030acc52c1 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_attributes.h +++ b/onnxruntime/core/providers/cpu/nn/conv_attributes.h @@ -73,7 +73,7 @@ struct ConvAttributes { ~ConvAttributes() = default; - Status ComputeKernelShape(const TensorShape& weight_shape, TensorShapeVector& kernel_shape) const { + Status ComputeKernelShape(const TensorShape& weight_shape, TensorShapeVector& kernel_shape, bool weight_channels_last = false) const { if (kernel_shape_specified) { kernel_shape = 
kernel_shape_; if (kernel_shape.size() + 2 != weight_shape.NumDimensions()) { @@ -82,15 +82,20 @@ struct ConvAttributes { " W: ", weight_shape.ToString().c_str()); } for (size_t i = 0; i < kernel_shape.size(); ++i) { - if (kernel_shape[i] != weight_shape[i + 2]) { + if (kernel_shape[i] != weight_shape[i + (weight_channels_last ? 1 : 2)]) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape is not compatible with W shape.", " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(), - " W: ", weight_shape.ToString().c_str()); + " W: ", weight_shape.ToString().c_str(), + " channels_last: ", weight_channels_last); } } } else { auto weight_dims = weight_shape.GetDims(); - kernel_shape.assign(weight_dims.begin() + 2, weight_dims.end()); + if (weight_channels_last) { + kernel_shape.assign(weight_dims.begin() + 1, weight_dims.end() - 1); + } else { + kernel_shape.assign(weight_dims.begin() + 2, weight_dims.end()); + } } return Status::OK(); @@ -98,7 +103,8 @@ struct ConvAttributes { Status ValidateInputShape(const TensorShape& input_shape, const TensorShape& weight_shape, - bool channels_last = false) const { + bool input_channels_last = false, + bool weight_channels_last = false) const { if (input_shape.NumDimensions() != weight_shape.NumDimensions()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "X num_dims does not match W num_dims.", " X: ", input_shape.ToString().c_str(), @@ -106,9 +112,9 @@ struct ConvAttributes { } const int64_t M = weight_shape[0]; - const int64_t C = channels_last ? input_shape.GetDims().back() : input_shape[1]; + const int64_t C = input_channels_last ? input_shape.GetDims().back() : input_shape[1]; - if (C != weight_shape[1] * group) { + if (C != (weight_channels_last ? weight_shape.GetDims().back() : weight_shape[1]) * group) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input channels C is not equal to kernel channels * group.", " C: ", C, " kernel channels: ", weight_shape[1], diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index d62a651880a85..4c9cbbe605a7a 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -42,6 +42,12 @@ Status CudnnTensor::Set(gsl::span input_dims, cudnnDataType_t dat return Status::OK(); } +Status CudnnTensor::Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w) { + ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); + CUDNN_RETURN_IF_ERROR(cudnnSetTensor4dDescriptor(tensor_, format, dataType, n, c, h, w)); + return Status::OK(); +} + Status CudnnTensor::Set(const CudnnTensor& x_desc, cudnnBatchNormMode_t mode) { ORT_RETURN_IF_ERROR(CreateTensorIfNeeded()); CUDNN_RETURN_IF_ERROR(cudnnDeriveBNTensorDescriptor(tensor_, x_desc, mode)); @@ -113,15 +119,23 @@ Status CudnnFilterDescriptor::Set(gsl::span filter_dims, cudnnDat return Status::OK(); } +Status CudnnFilterDescriptor::Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int k, int c, int h, int w) { + if (!desc_) + CUDNN_RETURN_IF_ERROR(cudnnCreateFilterDescriptor(&desc_)); + + CUDNN_RETURN_IF_ERROR(cudnnSetFilter4dDescriptor(desc_, dataType, format, k, c, h, w)); + return Status::OK(); +} + template cudnnDataType_t CudnnTensor::GetDataType() { ORT_THROW("cuDNN engine currently supports only single/double/half/int8/uint8 precision data types. 
Got:", - typeid(ElemType).name()); + typeid(ElemType).name()); // Not reachable but GCC complains return CUDNN_DATA_FLOAT; } -template<> +template <> cudnnDataType_t CudnnTensor::GetDataType() { return CUDNN_DATA_FLOAT; } diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h index f104373b9413a..ba75ab4f2c029 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.h +++ b/onnxruntime/core/providers/cuda/cudnn_common.h @@ -18,6 +18,8 @@ class CudnnTensor final { Status Set(gsl::span input_dims, cudnnDataType_t dataType); Status Set(const CudnnTensor& x_desc, cudnnBatchNormMode_t mode); + // Set 4D tensor format (for NHWC) + Status Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w); operator cudnnTensorDescriptor_t() const { return tensor_; } @@ -58,6 +60,9 @@ class CudnnFilterDescriptor final { Status Set(gsl::span filter_dims, cudnnDataType_t data_typ); + // Set 4D filter where k is output channels, c is input channels, h and w is rows and columns per filter. + Status Set(cudnnTensorFormat_t format, cudnnDataType_t dataType, int k, int c, int h, int w); + operator cudnnFilterDescriptor_t() const { return desc_; } private: diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index f1590bc51388d..b0df77db96744 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -20,7 +20,7 @@ namespace cuda { T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); \ + Conv); \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ Conv, \ kOnnxDomain, \ @@ -28,14 +28,14 @@ namespace cuda { T, \ kCudaExecutionProvider, \ (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - Conv); + Conv); REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) -template -const cudnnConvolutionFwdAlgo_t Conv::kAllAlgos[] = { +template +const cudnnConvolutionFwdAlgo_t Conv::kAllAlgos[] = { CUDNN_CONVOLUTION_FWD_ALGO_GEMM, CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, @@ -52,7 +52,7 @@ cudnnStatus_t GetWorkspaceSize(cudnnHandle_t handle, const CudnnConvState& s, const cudnnConvolutionFwdAlgo_t* algo, int n_algo) { - // TODO: get maximum available size from memory areana + // TODO: get maximum available size from memory arena size_t free, total; CUDA_CALL_THROW(cudaMemGetInfo(&free, &total)); // Assuming 10% of fragmentation @@ -86,8 +86,8 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream, return SliceCuda::Impl(stream, input_data, input_dims, output_data, compute_metadata, element_size); } -template -Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { +template +Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { //set X const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -99,6 +99,13 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const const TensorShape& w_shape = W->Shape(); auto w_dims = w_shape.AsShapeVector(); s_.w_data = reinterpret_cast(W->Data()); + + // Make sure input and weight are 4D for NHWC since we set 4D descriptor for NHWC. 
+ constexpr bool channels_last = NHWC; + if (channels_last && (x_shape.NumDimensions() != 4 || w_shape.NumDimensions() != 4)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Number of dimensions of X and W should be 4 for channels_last format (NHWC)"); + } + // set B if (context->InputCount() >= 3) { const Tensor* B = context->Input(2); @@ -125,48 +132,60 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const s_.cached_benchmark_results.clear(); } - const int64_t N = X->Shape()[0]; - const int64_t M = W->Shape()[0]; - - ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W)); + ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last, channels_last)); TensorShapeVector kernel_shape; - ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape)); - auto rank = kernel_shape.size(); + ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape, channels_last)); + + const size_t kernel_rank = kernel_shape.size(); + ConvPadVector pads(conv_attrs_.pads); if (pads.empty()) { - pads.resize(rank * 2, 0); + pads.resize(kernel_rank * 2, 0); } TensorShapeVector dilations(conv_attrs_.dilations); if (dilations.empty()) { - dilations.resize(rank, 1); + dilations.resize(kernel_rank, 1); } TensorShapeVector strides(conv_attrs_.strides); if (strides.empty()) { - strides.resize(rank, 1); + strides.resize(kernel_rank, 1); } TensorShapeVector y_dims; - y_dims.reserve(2 + rank); // rank indicates number of feature dimensions - so add 2 to account for 'N' and 'C' - y_dims.insert(y_dims.begin(), {N, M}); + y_dims.reserve(2 + kernel_rank); // add 2 to account for 'N' and 'C' - TensorShapeVector y_dims_with_adjusted_pads; - y_dims_with_adjusted_pads.reserve(2 + rank); // rank indicates number of feature dimensions - so add 2 to account for 'N' and 'C' - y_dims_with_adjusted_pads.insert(y_dims_with_adjusted_pads.begin(), {N, M}); + const int64_t N = X->Shape()[0]; + const int64_t M = W->Shape()[0]; + if (channels_last) { + y_dims.push_back(N); + } else { + y_dims.insert(y_dims.begin(), {N, M}); + } bool post_slicing_required = false; TensorShapeVector slice_starts; - slice_starts.reserve(rank); + slice_starts.reserve(kernel_rank); TensorShapeVector slice_ends; - slice_ends.reserve(rank); + slice_ends.reserve(kernel_rank); TensorShapeVector slice_axes; - slice_axes.reserve(rank); + slice_axes.reserve(kernel_rank); + + const size_t spatial_dim_start = channels_last ? 1 : 2; + const size_t spatial_dim_end = spatial_dim_start + kernel_rank; + TensorShape spatial_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end); - ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(x_shape.Slice(2), kernel_shape, + TensorShapeVector y_dims_with_adjusted_pads(y_dims); + ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(spatial_shape, kernel_shape, strides, dilations, pads, y_dims, y_dims_with_adjusted_pads, post_slicing_required, slice_starts, slice_ends, slice_axes)); + if (channels_last) { + y_dims.push_back(M); + y_dims_with_adjusted_pads.push_back(M); + } + ORT_ENFORCE(y_dims.size() == y_dims_with_adjusted_pads.size()); s_.y_dims = gsl::make_span(y_dims); s_.y_dims_with_adjusted_pads = y_dims_with_adjusted_pads; @@ -190,7 +209,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const TensorShapeVector x_dims_cudnn{x_dims.begin(), x_dims.end()}; TensorShapeVector y_dims_cudnn = !post_slicing_required ? 
y_dims : y_dims_with_adjusted_pads; - if (rank < 2) { + if (kernel_rank < 2) { // TODO: Explore padding the provided input shape [N, C, D] to [N, C, 1, D] // especially for EXHAUSTIVE algo search which may result in a better algo selection. // ORTModule uses different algo search options (HEURISTIC, and use max workspace size) compared to @@ -203,7 +222,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const x_dims_cudnn.insert(x_dims_cudnn.begin() + 2, 1); y_dims_cudnn.insert(y_dims_cudnn.begin() + 2, 1); w_dims.insert(w_dims.begin() + 2, 1); - pads.insert(pads.begin() + rank, 0); + pads.insert(pads.begin() + kernel_rank, 0); pads.insert(pads.begin(), 0); kernel_shape.insert(kernel_shape.begin(), 1); strides.insert(strides.begin(), 1); @@ -212,7 +231,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const x_dims_cudnn.push_back(1); y_dims_cudnn.push_back(1); w_dims.push_back(1); - pads.insert(pads.begin() + rank, 0); + pads.insert(pads.begin() + kernel_rank, 0); pads.insert(pads.end(), 0); kernel_shape.push_back(1); strides.push_back(1); @@ -220,16 +239,43 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const } } - if (w_dims_changed) - ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + if (w_dims_changed) { + if (!channels_last) { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(w_dims, CudnnTensor::GetDataType())); + } else { + ORT_RETURN_IF_ERROR(s_.w_desc.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(w_dims[0]), + static_cast(w_dims[3]), + static_cast(w_dims[1]), + static_cast(w_dims[2]))); + } + } // We must delay returning early until here so that the weight dims have been cached properly if (s_.Y->Shape().Size() == 0) { return Status::OK(); } - ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); - ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + if (channels_last) { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(x_dims_cudnn[0]), + static_cast(x_dims_cudnn[3]), + static_cast(x_dims_cudnn[1]), + static_cast(x_dims_cudnn[2]))); + + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(CUDNN_TENSOR_NHWC, + CudnnTensor::GetDataType(), + static_cast(y_dims_cudnn[0]), + static_cast(y_dims_cudnn[3]), + static_cast(y_dims_cudnn[1]), + static_cast(y_dims_cudnn[2]))); + } else { + ORT_RETURN_IF_ERROR(s_.x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType())); + ORT_RETURN_IF_ERROR(s_.y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType())); + } + ORT_RETURN_IF_ERROR(s_.conv_desc.Set(kernel_shape.size(), pads, strides, dilations, gsl::narrow_cast(conv_attrs_.group), CUDNN_CROSS_CORRELATION, CudnnTensor::GetDataType())); @@ -331,8 +377,8 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const return Status::OK(); } -template -Status Conv::ComputeInternal(OpKernelContext* context) const { +template +Status Conv::ComputeInternal(OpKernelContext* context) const { std::lock_guard lock(s_.mutex); ORT_RETURN_IF_ERROR(UpdateState(context)); if (s_.Y->Shape().Size() == 0) { @@ -367,7 +413,7 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { s_.slice_ends, s_.slice_axes, s_.element_size)); } return Status::OK(); -} // namespace cuda +} CudnnConvolutionDescriptor::CudnnConvolutionDescriptor() : desc_(nullptr) { } @@ -424,5 +470,11 @@ Status CudnnConvolutionDescriptor::Set( return Status::OK(); } +#ifndef DISABLE_CONTRIB_OPS +// template instantiation for NhwcConv 
+template class Conv; +template class Conv; +#endif + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index ae179de0070b0..07825b93204ca 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -177,7 +177,9 @@ enum : size_t { AlgoSearchWorkspaceSize = 32 * 1024 * 1024, }; -template +// ONNX Conv operator uses NCHW format for input, weights and output. +// NhwcConv contrib ops uses NHWC format: last dimension of input, weights and output are channels. +template class Conv : public CudaKernel { public: using CudaT = typename ToCudaType::MappedType; diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index ed94a01f562ef..689235b630d94 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -200,6 +200,8 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "PythonOp": self._infer_PythonOp, "SkipLayerNormalization": self._infer_SkipLayerNormalization, "SkipSimplifiedLayerNormalization": self._infer_SkipLayerNormalization, + "GroupNorm": self._infer_GroupNorm, + "BiasSplitGelu": self._infer_BiasSplitGelu, } self.aten_op_dispatcher_ = { "embedding": self._infer_Gather, @@ -434,6 +436,8 @@ def _onnx_infer_single_node(self, node): "SkipLayerNormalization", "PythonOp", "MultiHeadAttention", + "GroupNorm", + "BiasSplitGelu", ] if not skip_infer: @@ -1963,53 +1967,62 @@ def _infer_ZipMap(self, node): def _infer_Attention(self, node): shape = self._get_shape(node, 0) shape_bias = self._get_shape(node, 2) - assert len(shape) == 3 and len(shape_bias) == 1 - qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes") - if qkv_hidden_sizes_attr is not None: - assert len(qkv_hidden_sizes_attr) == 3 - shape[2] = int(qkv_hidden_sizes_attr[2]) - else: - shape[2] = int(shape_bias[0] / 3) - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape)) - - if len(node.output) > 1: - # input shape: (batch_size, sequence_length, hidden_size) - # past shape: (2, batch_size, num_heads, past_sequence_length, head_size) - # mask shape: (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) or (batch_size, 1, max_seq_len, max_seq_len) - # present shape: (2, batch_size, num_heads, total_sequence_length, head_size), where total_sequence_length=sequence_length+past_sequence_length - input_shape = self._get_shape(node, 0) - past_shape = self._get_shape(node, 4) - mask_shape = self._get_shape(node, 3) - if len(past_shape) == 5: - if len(mask_shape) in [2, 3]: - past_shape[3] = mask_shape[-1] - elif isinstance(input_shape[1], int) and isinstance(past_shape[3], int): - past_shape[3] = input_shape[1] + past_shape[3] - else: - past_shape[3] = f"{past_shape[3]}+{input_shape[1]}" - vi = self.known_vi_[node.output[1]] - vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, past_shape)) + if shape and len(shape) == 3 and shape_bias and len(shape_bias) == 1: + qkv_hidden_sizes_attr = get_attribute(node, "qkv_hidden_sizes") + if qkv_hidden_sizes_attr is not None: + assert len(qkv_hidden_sizes_attr) == 3 + shape[2] = int(qkv_hidden_sizes_attr[2]) + elif isinstance(shape_bias[0], int): + shape[2] = int(shape_bias[0] / 3) + output_dtype = 
self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape)) + + if len(node.output) > 1: + # input shape: (batch_size, sequence_length, hidden_size) + # past shape: (2, batch_size, num_heads, past_sequence_length, head_size) + # mask shape: (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length) or (batch_size, 1, max_seq_len, max_seq_len) + # present shape: (2, batch_size, num_heads, total_sequence_length, head_size), where total_sequence_length=sequence_length+past_sequence_length + input_shape = self._get_shape(node, 0) + past_shape = self._get_shape(node, 4) + mask_shape = self._get_shape(node, 3) + if past_shape and len(past_shape) == 5: + if mask_shape and len(mask_shape) in [2, 3]: + past_shape[3] = mask_shape[-1] + elif input_shape and len(input_shape) == 3: + if isinstance(input_shape[1], int) and isinstance(past_shape[3], int): + past_shape[3] = input_shape[1] + past_shape[3] + else: + past_shape[3] = f"{past_shape[3]}+{input_shape[1]}" + vi = self.known_vi_[node.output[1]] + vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, past_shape)) def _infer_BiasGelu(self, node): self._propagate_shape_and_type(node) def _infer_MultiHeadAttention(self, node): # Input 0 (query) has shape (batch_size, sequence_length, hidden_size) - # Input 1 (key) has shape (batch_size, kv_sequence_length, hidden_size) - # Input 2 (value) has shape (batch_size, kv_sequence_length, v_hidden_size) + # Without packed KV: + # Input 1 (key) has shape (batch_size, kv_sequence_length, hidden_size) + # Input 2 (value) has shape (batch_size, kv_sequence_length, v_hidden_size) + # With packed KV: + # Input 1 (key) has shape (batch_size, kv_sequence_length, num_heads, 2, head_size) + # Input 2 (value) is nullptr # Output 0 has shape (batch_size, sequence_length, v_hidden_size) query_shape = self._get_shape(node, 0) - value_shape = self._get_shape(node, 2) + key_shape = self._get_shape(node, 1) + if query_shape is not None and len(query_shape) == 3: - assert len(query_shape) == 3 and len(value_shape) == 3 - output_shape = query_shape - output_shape[2] = value_shape[2] + # By default, hidden size is same for Q/K/V. Only need check v_hidden_size when value is provided. 
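The packed KV layout described in the comments above, together with the per-head weight interleaving that FusionAttentionUnet performs further down in this series with np.dstack, can be checked with a small NumPy sketch (not code from this change; all sizes are made up for illustration):

```
import numpy as np

# Hypothetical sizes: input channels, num_heads, head_size.
c, n, h = 16, 2, 4
kw = np.random.randn(c, n * h).astype(np.float32)  # key projection weight
vw = np.random.randn(c, n * h).astype(np.float32)  # value projection weight

# Interleave K and V per head so that one GEMM followed by a Reshape yields
# the packed (batch, kv_sequence_length, num_heads, 2, head_size) tensor.
kv_weight = np.dstack([kw.reshape(c, n, h), vw.reshape(c, n, h)]).reshape(c, n * 2 * h)

b, s_kv = 3, 5
hidden_states = np.random.randn(b, s_kv, c).astype(np.float32)
packed_kv = (hidden_states @ kv_weight).reshape(b, s_kv, n, 2, h)

# Slot 0 holds K and slot 1 holds V, matching the separate projections.
assert np.allclose(packed_kv[..., 0, :], (hidden_states @ kw).reshape(b, s_kv, n, h), atol=1e-5)
assert np.allclose(packed_kv[..., 1, :], (hidden_states @ vw).reshape(b, s_kv, n, h), atol=1e-5)
```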
+ output_shape = query_shape + if key_shape and len(key_shape) == 3: + value_shape = self._get_shape(node, 2) + if value_shape and len(value_shape) == 3: + output_shape[2] = value_shape[2] - output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type - vi = self.known_vi_[node.output[0]] - vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape)) + output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi = self.known_vi_[node.output[0]] + vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape)) def _infer_FastGelu(self, node): self._propagate_shape_and_type(node) @@ -2056,6 +2069,19 @@ def _infer_SkipLayerNormalization(self, node): if len(node.output) > 3: self._propagate_shape_and_type(node, 0, 3) + def _infer_GroupNorm(self, node): + self._propagate_shape_and_type(node) + + def _infer_BiasSplitGelu(self, node): + input_shape = self._get_shape(node, 0) + bias_shape = self._get_shape(node, 1) + if input_shape and bias_shape and isinstance(bias_shape[0], int): + output_shape = input_shape + output_shape[2] = int(bias_shape[0] / 2) + vi = self.known_vi_[node.output[0]] + output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type + vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, output_shape)) + def _infer_PythonOp(self, node): output_tensor_types = get_attribute(node, "output_tensor_types") assert output_tensor_types diff --git a/onnxruntime/python/tools/transformers/fusion_attention_unet.py b/onnxruntime/python/tools/transformers/fusion_attention_unet.py index 2151e6a21c5e7..0441ce494d560 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_unet.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_unet.py @@ -19,11 +19,14 @@ class FusionAttentionUnet(Fusion): Fuse Attention subgraph of UNet into one Attention node. """ - def __init__(self, model: OnnxModel, hidden_size: int, num_heads: int, is_cross_attention: bool): + def __init__( + self, model: OnnxModel, hidden_size: int, num_heads: int, is_cross_attention: bool, enable_packed_kv: bool + ): super().__init__(model, "MultiHeadAttention" if is_cross_attention else "Attention", ["LayerNormalization"]) self.hidden_size = hidden_size self.num_heads = num_heads self.is_cross_attention = is_cross_attention + self.enable_packed_kv = enable_packed_kv # Flags to show warning only once self.num_heads_warning = True @@ -103,8 +106,22 @@ def create_attention_node( is_self_attention = not self.is_cross_attention if is_self_attention: - if q_matmul.input[0] != input or k_matmul.input[0] != input or q_matmul.input[0] != input: - logger.debug("q_matmul.input[0] != input or k_matmul.input[0] != input or q_matmul.input[0] != input") + if q_matmul.input[0] != input or k_matmul.input[0] != input or v_matmul.input[0] != input: + logger.debug( + "For self attention, input hidden state for q and k/v shall be same. Got %s, %s, %s", + q_matmul.input[0], + k_matmul.input[0], + v_matmul.input[0], + ) + return None + else: + if q_matmul.input[0] != input or (k_matmul.input[0] != v_matmul.input[0]) or (k_matmul.input[0] == input): + logger.debug( + "For cross attention, input hidden state for q and k/v shall be different. 
Got %s, %s, %s", + q_matmul.input[0], + k_matmul.input[0], + v_matmul.input[0], + ) return None if hidden_size > 0 and (hidden_size % num_heads) != 0: @@ -136,7 +153,7 @@ def create_attention_node( kw_in_size = kw.shape[0] vw_in_size = vw.shape[0] - assert qw_in_size == kw_in_size == vw_in_size + assert qw_in_size == kw_in_size and kw_in_size == vw_in_size if hidden_size > 0 and hidden_size != qw_in_size: raise ValueError( @@ -162,8 +179,63 @@ def create_attention_node( ) self.model.add_initializer(weight, self.this_graph_name) - else: + else: # cross attention attention_node_name = self.model.create_node_name("MultiHeadAttention") + if self.enable_packed_kv: + if kw.shape != vw.shape: + return None + + kw_in_size = kw.shape[0] + vw_in_size = vw.shape[0] + assert kw_in_size == vw_in_size + + qw_out_size = qw.shape[1] + kw_out_size = kw.shape[1] + vw_out_size = vw.shape[1] + assert qw_out_size == vw_out_size and kw_out_size == vw_out_size + + c = kw_in_size + n = num_heads + h = kw_out_size // num_heads + + # Concat and interleave weights so that the output of fused KV GEMM has [B, S_kv, N, 2, H] shape + kv_weight = np.dstack([kw.reshape(c, n, h), vw.reshape(c, n, h)]).reshape(c, n * 2 * h) + + matmul_node_name = self.model.create_node_name("MatMul", name_prefix="MatMul_KV") + weight = helper.make_tensor( + name=matmul_node_name + "_weight", + data_type=TensorProto.FLOAT, + dims=[kv_weight.shape[0], kv_weight.shape[1]], + vals=kv_weight.flatten().tolist(), + ) + + self.model.add_initializer(weight, self.this_graph_name) + + matmul_node = helper.make_node( + "MatMul", + inputs=[k_matmul.input[0], matmul_node_name + "_weight"], + outputs=[matmul_node_name + "_out"], + name=matmul_node_name, + ) + self.node_name_to_graph_name[matmul_node.name] = self.this_graph_name + + shape_tensor = helper.make_tensor( + name=matmul_node_name + "_reshape_shape", + data_type=TensorProto.INT64, + dims=[5], + vals=[0, 0, n, 2, h], + ) + self.model.add_initializer(shape_tensor, self.this_graph_name) + + reshape_node = helper.make_node( + "Reshape", + inputs=[matmul_node_name + "_out", matmul_node_name + "_reshape_shape"], + outputs=[k_matmul.output[0]], + name=matmul_node_name + "_reshape", + ) + self.node_name_to_graph_name[reshape_node.name] = self.this_graph_name + self.nodes_to_add.extend([matmul_node, reshape_node]) + self.nodes_to_remove.extend([k_matmul, v_matmul]) # No bias, use zeros qkv_bias = np.zeros([3, hidden_size], dtype=np.float32) @@ -184,12 +256,18 @@ def create_attention_node( attention_node_name + "_qkv_bias", ] else: - attention_inputs = [ - q_matmul.output[0], - k_matmul.output[0], - v_matmul.output[0], - attention_node_name + "_qkv_bias", - ] + if not self.enable_packed_kv: + attention_inputs = [ + q_matmul.output[0], + k_matmul.output[0], + v_matmul.output[0], + attention_node_name + "_qkv_bias", + ] + else: + attention_inputs = [ + q_matmul.output[0], + k_matmul.output[0], + ] attention_node = helper.make_node( "Attention" if is_self_attention else "MultiHeadAttention", @@ -200,12 +278,23 @@ def create_attention_node( attention_node.domain = "com.microsoft" attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)]) + counter_name = ( + "Attention (self attention)" + if is_self_attention + else "MultiHeadAttention ({})".format( + "cross attention with packed kv" if self.enable_packed_kv else "cross attention" + ) + ) + self.increase_counter(counter_name) return attention_node def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): - 
node_before_layernorm = self.model.match_parent( - normalize_node, "Add" if self.is_cross_attention else "Reshape", 0 - ) + node_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) + + # In SD 1.5, for self attention, LayerNorm has parent Reshape + if node_before_layernorm is None and not self.is_cross_attention: + node_before_layernorm = self.model.match_parent(normalize_node, "Reshape", 0) + if node_before_layernorm is None: return @@ -241,11 +330,11 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Mul", "MatMul"], [0, 0, 0]) if qk_nodes is not None: - (softmax_qk, mul_qk, matmul_qk) = qk_nodes + (_softmax_qk, _mul_qk, matmul_qk) = qk_nodes else: qk_nodes = self.model.match_parent_path(matmul_qkv, ["Softmax", "Add", "Mul", "MatMul"], [0, 0, 0, 0]) if qk_nodes is not None: - (softmax_qk, add_zero, mul_qk, matmul_qk) = qk_nodes + (_softmax_qk, _add_zero, _mul_qk, matmul_qk) = qk_nodes else: logger.debug("fuse_attention: failed to match qk path") return diff --git a/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py b/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py new file mode 100644 index 0000000000000..106d3de25d39d --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py @@ -0,0 +1,110 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict + +from fusion_base import Fusion +from onnx import helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionBiasSplitGelu(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "BiasSplitGelu", "Gelu") + + def fuse(self, gelu_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + [root] --->Add --------------------> Slice ---------------> Mul --> + | ^ ^ + | | | + +----------------------------+---Slice --> Gelu---+ + | | ^ + | |-----| + | | | + | Mul Mul + | ^ ^ + v | | + Shape ---> Gather --> Add --> Div --+ + """ + if gelu_node.output[0] not in input_name_to_nodes: + return + children = input_name_to_nodes[gelu_node.output[0]] + if len(children) != 1 or children[0].op_type != "Mul": + return + mul_after_gelu = children[0] + + slice_before_gelu = self.model.match_parent(gelu_node, "Slice", 0, output_name_to_node) + if slice_before_gelu is None: + return + + if self.model.find_constant_input(slice_before_gelu, -1, delta=0.001) != 3: + return + + add_output = slice_before_gelu.input[0] + + start_index_nodes = self.model.match_parent_path( + slice_before_gelu, + ["Div", "Add", "Gather", "Shape", "Add"], + [1, 0, 0, 0, 0], + output_name_to_node, # Mul(1) is optional + ) + if start_index_nodes is None: + start_index_nodes = self.model.match_parent_path( + slice_before_gelu, + ["Mul", "Div", "Add", "Gather", "Shape", "Add"], + [1, 0, 0, 0, 0, 0], + output_name_to_node, + ) + + if start_index_nodes is None or start_index_nodes[-2].input[0] != add_output: + return + + end_index_nodes = self.model.match_parent_path(slice_before_gelu, ["Mul", "Div"], [2, 0], output_name_to_node) + + if ( + end_index_nodes is None or end_index_nodes[1] not in start_index_nodes + ): # the Div is parent of both two Mul nodes + return + + slice_before_mul = self.model.match_parent(mul_after_gelu, "Slice", 
0, output_name_to_node) + if slice_before_mul is None: + return + + if ( + slice_before_mul.input[2] != slice_before_gelu.input[1] + ): # end index of slice_before_mul is start index of slice_before_gelu + return + + subgraph_nodes = start_index_nodes + [ + end_index_nodes[0], + mul_after_gelu, + gelu_node, + slice_before_mul, + slice_before_gelu, + ] + subgraph_output = mul_after_gelu.output[0] + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, [subgraph_output], input_name_to_nodes, output_name_to_node + ): + logger.info("Skip fuse BiasSplitGelu since it is not safe to fuse the subgraph.") + return + + add_node = start_index_nodes[-1] + bias_index, _value = self.model.get_constant_input(add_node) + if not isinstance(bias_index, int): + return + self.nodes_to_remove.extend(subgraph_nodes) + node_name = self.model.create_node_name("BiasSplitGelu", name_prefix="BiasSplitGelu") + fused_node = helper.make_node( + "BiasSplitGelu", + inputs=[add_node.input[1 - bias_index], add_node.input[bias_index]], + outputs=[subgraph_output], + name=node_name, + ) + fused_node.domain = "com.microsoft" + self.nodes_to_add.append(fused_node) + self.node_name_to_graph_name[node_name] = self.this_graph_name diff --git a/onnxruntime/python/tools/transformers/fusion_group_norm.py b/onnxruntime/python/tools/transformers/fusion_group_norm.py new file mode 100644 index 0000000000000..a0a4d7c16de0b --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_group_norm.py @@ -0,0 +1,198 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +from logging import getLogger +from typing import Dict + +import numpy as np +from fusion_base import Fusion +from fusion_utils import FusionUtils +from onnx import TensorProto, helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionGroupNorm(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "GroupNorm", "Add") + + def fuse(self, add_node, input_name_to_nodes: Dict, output_name_to_node: Dict): + """ + Fuse Group Normalization subgraph into one node GroupNorm. + The following is the pattern with swish activation: + +----------------Shape-------------------------------+ + | | + | (0, 32, -1) v (512x1x1) (512x1x1) (optional) + [Root] --> Reshape -------> InstanceNormalization --> Reshape ---> Mul --> Add --> Mul--> [output] + Bx512xHxW (scale=ones(32), B=zeros(32)) | ^ Bx512xHxW + | | + +--->Sigmoid (optional) + The Mul and Sigmoid before output is for Swish activation. They are optional. + """ + nodes = self.model.match_parent_path( + add_node, ["Mul", "Reshape", "InstanceNormalization", "Reshape"], [0, 0, 0, 0], output_name_to_node + ) + if nodes is None: + return + + weight_mul, reshape_4d, instance_norm, reshape_3d = nodes + root = reshape_3d.input[0] + + parents = self.model.match_parent_path(reshape_4d, ["Shape"], [1], output_name_to_node) + if parents is None: + return + if parents[0].input[0] != root: + return + shape_node = parents[0] + + # Check whether it has swish activation. 
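For reference, the computation that the fused BiasSplitGelu node above stands for, and that the shape rule in _infer_BiasSplitGelu assumes (the last dimension halves), is sketched below in NumPy with made-up shapes (not code from this change); the erf-based Gelu matches the reference formulation used by the new unit test later in this series:

```
import math
import numpy as np

def gelu(x):
    # Exact (erf-based) Gelu.
    return x * 0.5 * (1.0 + np.vectorize(math.erf)(x / math.sqrt(2.0)))

def bias_split_gelu(x, bias):
    # x: (batch, seq, hidden); bias: (hidden,). Output: (batch, seq, hidden // 2).
    y = x + bias                           # the Add feeding both Slice nodes
    left, right = np.split(y, 2, axis=-1)  # the two Slice nodes
    return left * gelu(right)              # Gelu on the second half, then Mul

x = np.random.randn(2, 5, 8).astype(np.float32)
bias = np.random.randn(8).astype(np.float32)
print(bias_split_gelu(x, bias).shape)  # (2, 5, 4)
```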
+ swish_mul = self.model.find_first_child_by_type(add_node, "Mul") + swish_sigmoid = None + if swish_mul is not None: + sigmoid_path = self.model.match_parent_path(swish_mul, ["Sigmoid"], [None], output_name_to_node) + if sigmoid_path is not None: + swish_sigmoid = sigmoid_path[0] + + weight_input = weight_mul.input[1 - self.model.input_index(reshape_4d.output[0], weight_mul)] + if not self.model.is_constant_with_specified_dimension(weight_input, 3, "group norm weight"): + return + + bias_input = add_node.input[1 - self.model.input_index(weight_mul.output[0], add_node)] + if not self.model.is_constant_with_specified_dimension(bias_input, 3, "layernorm bias"): + return + + weight = self.model.get_constant_value(weight_input) + if weight is None: + return + + if not (len(weight.shape) == 3 and weight.shape[1] == 1 and weight.shape[2] == 1): + return + + bias = self.model.get_constant_value(bias_input) + if bias is None: + return + if not (len(bias.shape) == 3 and bias.shape[1] == 1 and bias.shape[2] == 1): + return + + weight_elements = int(np.prod(weight.shape)) + bias_elements = int(np.prod(bias.shape)) + if weight_elements != bias_elements: + return + + instance_norm_scale = self.model.get_constant_value(instance_norm.input[1]) + if instance_norm_scale is None: + return + instance_norm_bias = self.model.get_constant_value(instance_norm.input[2]) + if instance_norm_bias is None: + return + + if not ( + len(instance_norm_scale.shape) == 1 + and len(instance_norm_bias.shape) == 1 + and instance_norm_scale.shape == instance_norm_bias.shape + and instance_norm_scale.shape[0] == 32 + ): + logger.info("InstanceNormalization groups=%d", instance_norm_scale.shape[0]) + return + + if not np.allclose(np.ones_like(instance_norm_scale), instance_norm_scale): + return + if not np.allclose(np.zeros_like(instance_norm_bias), instance_norm_bias): + return + + group_norm_name = self.model.create_node_name("GroupNorm", name_prefix="GroupNorm") + + if weight_elements not in [320, 640, 960, 1280, 1920, 2560] + [128, 256, 512]: + logger.info("GroupNorm channels=%d", weight_elements) + + gamma = helper.make_tensor( + name=group_norm_name + "_gamma", + data_type=TensorProto.FLOAT, + dims=[weight_elements], + vals=weight.flatten().tolist(), + ) + self.model.add_initializer(gamma, self.this_graph_name) + + beta = helper.make_tensor( + name=group_norm_name + "_beta", + data_type=TensorProto.FLOAT, + dims=[bias_elements], + vals=bias.flatten().tolist(), + ) + self.model.add_initializer(beta, self.this_graph_name) + + last_node = add_node + subgraph_nodes = [add_node, weight_mul, reshape_4d, instance_norm, reshape_3d, shape_node] + has_swish_activation = swish_mul and swish_sigmoid + if swish_mul and swish_sigmoid: + subgraph_nodes.extend([swish_mul, swish_sigmoid]) + last_node = swish_mul + + if not self.model.is_safe_to_fuse_nodes( + subgraph_nodes, + last_node.output, + input_name_to_nodes, + output_name_to_node, + ): + self.nodes_to_remove.extend([last_node]) + else: + self.nodes_to_remove.extend(subgraph_nodes) + + # instance_norm_scale might from Constant node. Use prune graph to clear it. + self.prune_graph = True + + # Right now GroupNorm only support float16 input. Need add a Cast in fp32 model. 
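The subgraph replaced by this fusion (Reshape to (N, 32, -1), InstanceNormalization with unit scale and zero bias, Reshape back, per-channel Mul/Add, optional Swish) is a standard 32-group normalization. A minimal NumPy sketch follows, assuming NCHW input as in the original subgraph and an illustrative epsilon (the fusion copies the InstanceNormalization attributes); the fused GroupNorm op itself consumes NHWC, hence the Transpose nodes added just below:

```
import numpy as np

def group_norm_nchw(x, gamma, beta, groups=32, eps=1e-5, swish=False):
    # x: (N, C, H, W); gamma, beta: (C,).
    n, c, h, w = x.shape
    g = x.reshape(n, groups, -1)
    g = (g - g.mean(axis=-1, keepdims=True)) / np.sqrt(g.var(axis=-1, keepdims=True) + eps)
    y = g.reshape(n, c, h, w) * gamma[None, :, None, None] + beta[None, :, None, None]
    return y * (1.0 / (1.0 + np.exp(-y))) if swish else y  # optional Swish: y * sigmoid(y)

x = np.random.randn(2, 128, 8, 8).astype(np.float32)
gamma = np.ones(128, dtype=np.float32)
beta = np.zeros(128, dtype=np.float32)
print(group_norm_nchw(x, gamma, beta, swish=True).shape)  # (2, 128, 8, 8)
```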
+ utils = FusionUtils(self.model) + + input = root + output = last_node.output[0] + if weight.dtype == np.float32: + # Add a Cast node to get float16 input for GroupNorm + cast_input, _cast_node = utils.cast_input(root, "float16") + input = cast_input + + # Add a Cast node to convert back to float32 after GroupNorm + output = group_norm_name + "_out" + cast_node = helper.make_node("Cast", inputs=[group_norm_name + "_out"], outputs=[last_node.output[0]]) + cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT))]) + self.model.add_node(cast_node) + + # NCHW to NHWC + transpose_input = helper.make_node( + "Transpose", + [input], + [input + "_NHWC"], + name=self.model.create_node_name("Transpose", name_prefix="Transpose_NCHW_to_NHWC"), + perm=[0, 2, 3, 1], + ) + + new_node = helper.make_node( + "GroupNorm", + inputs=[input + "_NHWC", group_norm_name + "_gamma", group_norm_name + "_beta"], + outputs=[output + "_NHWC"], + name=group_norm_name, + ) + + new_node.attribute.extend(instance_norm.attribute) + new_node.attribute.extend([helper.make_attribute("groups", 32)]) + new_node.attribute.extend([helper.make_attribute("activation", 1 if has_swish_activation else 0)]) + new_node.domain = "com.microsoft" + + # NHWC to NCHW + transpose_output = helper.make_node( + "Transpose", + [output + "_NHWC"], + [output], + name=self.model.create_node_name("Transpose", name_prefix="Transpose_NHWC_to_NCHW"), + perm=[0, 3, 1, 2], + ) + + self.nodes_to_add.append(new_node) + self.nodes_to_add.append(transpose_input) + self.nodes_to_add.append(transpose_output) + + self.node_name_to_graph_name[new_node.name] = self.this_graph_name + self.node_name_to_graph_name[transpose_input.name] = self.this_graph_name + self.node_name_to_graph_name[transpose_output.name] = self.this_graph_name diff --git a/onnxruntime/python/tools/transformers/fusion_options.py b/onnxruntime/python/tools/transformers/fusion_options.py index 9a5359b58caa6..cdfa2c626fc57 100644 --- a/onnxruntime/python/tools/transformers/fusion_options.py +++ b/onnxruntime/python/tools/transformers/fusion_options.py @@ -6,9 +6,16 @@ class AttentionMaskFormat: + # Build 1D mask indice (sequence length). It requires right side padding! Recommended for BERT model to get best performance. MaskIndexEnd = 0 + + # For experiment only. Do not use it in production. MaskIndexEndAndStart = 1 + + # Raw attention mask with 0 means padding (or no attention) and 1 otherwise. AttentionMask = 2 + + # No attention mask NoMask = 3 @@ -36,7 +43,17 @@ def __init__(self, model_type): self.enable_shape_inference = True self.enable_gemm_fast_gelu = False - self.attention_mask_format = AttentionMaskFormat.AttentionMask + + # Set default to sequence length for BERT model to use fused attention to speed up. + # Note that embed layer normalization will convert 2D mask to 1D when mask type is MaskIndexEnd. 
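The MaskIndexEnd format referenced in the comment above can be pictured with a small example using made-up mask values (illustrative, not from this change): a right-side padded 2D attention mask reduces to one end index (the sequence length) per row, which is the 1D form the fused attention kernels consume:

```
import numpy as np

# 1 marks a real token, 0 marks right-side padding (illustrative values).
mask_2d = np.array([[1, 1, 1, 0, 0],
                    [1, 1, 1, 1, 1]], dtype=np.int32)
mask_index_end = mask_2d.sum(axis=1)  # -> [3, 5], one end index per sequence
print(mask_index_end)
```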
+ self.attention_mask_format = ( + AttentionMaskFormat.MaskIndexEnd if model_type == "bert" else AttentionMaskFormat.AttentionMask + ) + + # options for stable diffusion + self.enable_group_norm = model_type == "unet" + self.enable_bias_splitgelu = model_type == "unet" + self.enable_packed_kv = model_type == "unet" def use_raw_attention_mask(self, use_raw_mask=True): if use_raw_mask: @@ -74,8 +91,14 @@ def parse(args): options.enable_gemm_fast_gelu = True if args.use_mask_index: options.use_raw_attention_mask(False) + if args.use_raw_attention_mask: + options.use_raw_attention_mask(True) if args.no_attention_mask: options.disable_attention_mask() + if args.disable_group_norm: + options.enable_group_norm = False + if args.disable_packed_kv: + options.enable_packed_kv = False return options @staticmethod @@ -164,10 +187,18 @@ def add_arguments(parser: ArgumentParser): "--use_mask_index", required=False, action="store_true", - help="use mask index instead of raw attention mask in attention operator", + help="use mask index to activate fused attention to speed up. It requires right-side padding!", ) parser.set_defaults(use_mask_index=False) + parser.add_argument( + "--use_raw_attention_mask", + required=False, + action="store_true", + help="use raw attention mask. Use this option if your input is not right-side padding. This might deactivate fused attention and get worse performance.", + ) + parser.set_defaults(use_raw_attention_mask=False) + parser.add_argument( "--no_attention_mask", required=False, @@ -185,3 +216,19 @@ def add_arguments(parser: ArgumentParser): "MultiHeadAttention has only CUDA implementation so the model can only run with cuda execution provider.", ) parser.set_defaults(use_multi_head_attention=False) + + parser.add_argument( + "--disable_group_norm", + required=False, + action="store_true", + help="not fuse GroupNorm. Only works for model_type=unet", + ) + parser.set_defaults(disable_group_norm=False) + + parser.add_argument( + "--disable_packed_kv", + required=False, + action="store_true", + help="not use packed kv in cross attention. Only works for model_type=unet", + ) + parser.set_defaults(disable_packed_kv=False) diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 865c1542c1cc9..8363f2674cd40 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -28,8 +28,8 @@ def cast_graph_input_to_int32(self, input_name: str) -> Tuple[bool, str]: logger.debug(f"Did not cast graph input {input_name} to int32: found {graph_input is not None}") return False, input_name - def cast_input_to_int32(self, input_name: str): - cast_output = input_name + "_int32" + def cast_input(self, input_name: str, target_type="int32"): + cast_output = input_name + "_" + target_type # Avoid consequent Cast nodes. 
inputs = [input_name] @@ -40,11 +40,24 @@ def cast_input_to_int32(self, input_name: str): inputs = [parent_node.input[0]] cast_node = helper.make_node("Cast", inputs=inputs, outputs=[cast_output]) - cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.INT32))]) + + if target_type == "int32": + to_type = int(TensorProto.INT32) + elif target_type == "float32": + to_type = int(TensorProto.FLOAT) + elif target_type == "float16": + to_type = int(TensorProto.FLOAT16) + else: + raise ValueError("Invalid target_type: {target_type}") + + cast_node.attribute.extend([helper.make_attribute("to", to_type)]) self.model.add_node(cast_node) return cast_output, cast_node + def cast_input_to_int32(self, input_name: str): + return self.cast_input(input_name, "int32") + def remove_cast_int32(self, input_name: str): input_name_to_nodes = self.model.input_name_to_nodes() nodes = input_name_to_nodes[input_name] diff --git a/onnxruntime/python/tools/transformers/models/diffusion/__init__.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/__init__.py similarity index 100% rename from onnxruntime/python/tools/transformers/models/diffusion/__init__.py rename to onnxruntime/python/tools/transformers/models/stable_diffusion/__init__.py diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py new file mode 100755 index 0000000000000..580c5ef4c3cca --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -0,0 +1,244 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +import argparse +import os +import time + +SD_MODELS = { + "1.5": "runwayml/stable-diffusion-v1-5", + "2.0": "stabilityai/stable-diffusion-2", + "2.1": "stabilityai/stable-diffusion-2-1", +} + + +def get_test_settings(): + height = 512 + width = 512 + num_inference_steps = 50 + prompts = [ + "a photo of an astronaut riding a horse on mars", + "cute grey cat with blue eyes, wearing a bowtie, acrylic painting", + "a cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital painting", + "an illustration of a house with large barn with many cute flower pots and beautiful blue sky scenery", + "one apple sitting on a table, still life, reflective, full color photograph, centered, close-up product", + "background texture of stones, masterpiece, artistic, stunning photo, award winner photo", + "new international organic style house, tropical surroundings, architecture, 8k, hdr", + "beautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstation", + "blue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realistic", + "delicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8k", + ] + + return height, width, num_inference_steps, prompts + + +def get_ort_pipeline(model_name: str, directory: str, provider: str, disable_safety_checker: bool): + from diffusers import OnnxStableDiffusionPipeline + + import onnxruntime + + if directory is not None: + assert os.path.exists(directory) + session_options = onnxruntime.SessionOptions() + pipe = OnnxStableDiffusionPipeline.from_pretrained( + directory, + provider=provider, + 
sess_options=session_options, + ) + else: + pipe = OnnxStableDiffusionPipeline.from_pretrained( + model_name, + revision="onnx", + provider=provider, + use_auth_token=True, + ) + + if disable_safety_checker: + pipe.safety_checker = None + pipe.feature_extractor = None + + return pipe + + +def get_torch_pipeline(model_name: str, disable_channels_last: bool, disable_safety_checker: bool): + from diffusers import StableDiffusionPipeline + from torch import channels_last, float16 + + pipe = StableDiffusionPipeline.from_pretrained( + model_name, torch_dtype=float16, revision="fp16", use_auth_token=True + ).to("cuda") + + if not disable_channels_last: + pipe.unet.to(memory_format=channels_last) # in-place operation + + if disable_safety_checker: + pipe.safety_checker = None + pipe.feature_extractor = None + + return pipe + + +def get_image_filename_prefix(engine: str, model_name: str, batch_size: int, disable_safety_checker: bool): + short_model_name = model_name.split("/")[-1].replace("stable-diffusion-", "sd") + return f"{engine}_{short_model_name}_b{batch_size}" + ("" if disable_safety_checker else "_safe") + + +def run_ort_pipeline(pipe, batch_size: int, image_filename_prefix: str): + from diffusers import OnnxStableDiffusionPipeline + + assert isinstance(pipe, OnnxStableDiffusionPipeline) + + height, width, num_inference_steps, prompts = get_test_settings() + + pipe("warm up", height, width, num_inference_steps=2) + + latency_list = [] + for i, prompt in enumerate(prompts): + input_prompts = [prompt] * batch_size + inference_start = time.time() + image = pipe(input_prompts, height, width, num_inference_steps).images[0] + inference_end = time.time() + + latency = inference_end - inference_start + latency_list.append(latency) + print(f"Inference took {latency} seconds") + image.save(f"{image_filename_prefix}_{i}.jpg") + print("Average latency in seconds:", sum(latency_list) / len(latency_list)) + + +def run_torch_pipeline(pipe, batch_size: int, image_filename_prefix: str): + import torch + + height, width, num_inference_steps, prompts = get_test_settings() + + pipe("warm up", height, width, num_inference_steps=2) + + torch.set_grad_enabled(False) + + latency_list = [] + for i, prompt in enumerate(prompts): + input_prompts = [prompt] * batch_size + torch.cuda.synchronize() + inference_start = time.time() + image = pipe(input_prompts, height, width, num_inference_steps).images[0] + torch.cuda.synchronize() + inference_end = time.time() + + latency = inference_end - inference_start + latency_list.append(latency) + print(f"Inference took {latency} seconds") + image.save(f"{image_filename_prefix}_{i}.jpg") + + print("Average latency in seconds:", sum(latency_list) / len(latency_list)) + + +def run_ort(model_name: str, directory: str, provider: str, batch_size: int, disable_safety_checker: bool): + load_start = time.time() + pipe = get_ort_pipeline(model_name, directory, provider, disable_safety_checker) + load_end = time.time() + print(f"Model loading took {load_end - load_start} seconds") + + image_filename_prefix = get_image_filename_prefix("ort", model_name, batch_size, disable_safety_checker) + run_ort_pipeline(pipe, batch_size, image_filename_prefix) + + +def run_torch(model_name: str, batch_size: int, disable_channels_last: bool, disable_safety_checker: bool): + import torch + + torch.backends.cudnn.enabled = True + torch.backends.cudnn.benchmark = True + # torch.backends.cuda.matmul.allow_tf32 = True + + torch.set_grad_enabled(False) + + load_start = time.time() + pipe = 
get_torch_pipeline(model_name, disable_channels_last, disable_safety_checker) + load_end = time.time() + print(f"Model loading took {load_end - load_start} seconds") + + image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker) + ( + "" if disable_channels_last else "_channels_last" + ) + with torch.inference_mode(): + run_torch_pipeline(pipe, batch_size, image_filename_prefix) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "-e", + "--engine", + required=False, + type=str, + default="onnxruntime", + choices=["onnxruntime", "torch"], + help="Engines to benchmark. Default is onnxruntime.", + ) + + parser.add_argument( + "-v", + "--version", + required=True, + type=str, + choices=list(SD_MODELS.keys()), + help="Stable diffusion version like 1.5, 2.0 or 2.1", + ) + + parser.add_argument( + "-p", + "--pipeline", + required=False, + type=str, + default=None, + help="Directory of saved onnx pipeline. It could be output directory of optimize_pipeline.py.", + ) + + parser.add_argument( + "-c", + "--disable_channels_last", + required=False, + action="store_true", + help="Disable channels last for torch. It will be ignored for onnxruntime engine", + ) + parser.set_defaults(disable_channels_last=False) + + parser.add_argument( + "--enable_safety_checker", + required=False, + action="store_true", + help="Enable safety checker", + ) + parser.set_defaults(enable_safety_checker=False) + + parser.add_argument("-b", "--batch_size", type=int, default=1) + + args = parser.parse_args() + return args + + +def main(): + args = parse_arguments() + print(args) + + sd_model = SD_MODELS[args.version] + if args.engine == "onnxruntime": + assert args.pipeline, "--pipeline should be specified for onnxruntime engine" + + if args.batch_size > 1: + # Need remove a line https://github.com/huggingface/diffusers/blob/a66f2baeb782e091dde4e1e6394e46f169e5ba58/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L307 + # in diffuers to run batch_size > 1. + assert ( + args.enable_safety_checker + ), "batch_size > 1 is not compatible with safety checker due to a bug in diffuers" + + provider = "CUDAExecutionProvider" # TODO: use ["CUDAExecutionProvider", "CPUExecutionProvider"] in diffuers + run_ort(sd_model, args.pipeline, provider, args.batch_size, not args.enable_safety_checker) + else: + run_torch(sd_model, args.batch_size, args.disable_channels_last, not args.enable_safety_checker) + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/transformers/models/diffusion/convert_to_fp16.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py similarity index 52% rename from onnxruntime/python/tools/transformers/models/diffusion/convert_to_fp16.py rename to onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 8e20f58dd75d4..0979f0d2ddcb5 100644 --- a/onnxruntime/python/tools/transformers/models/diffusion/convert_to_fp16.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -6,16 +6,23 @@ # This script converts stable diffusion onnx models from float to half (mixed) precision for GPU inference. # # Before running this script, you need convert checkpoint to float32 onnx models like the following -# git clone https://github.com/huggingface/diffusers -# cd diffusers -# pip install -e . 
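Typical invocations of the benchmark script added above might look like the following; the pipeline path is a placeholder and assumes a directory produced by optimize_pipeline.py:

```
# PyTorch baseline for Stable Diffusion 1.5, batch size 1
python benchmark.py -e torch -v 1.5 -b 1

# onnxruntime with an optimized onnx pipeline
python benchmark.py -e onnxruntime -v 1.5 -p ./sd_onnx/stable-diffusion-v1-5-fp16 -b 1
```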
+# export ONNX_ROOT=./sd_onnx +# pip install -r requirements.txt # huggingface-cli login -# python3 scripts/convert_stable_diffusion_checkpoint_to_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ../stable-diffusion-v1-5 -# +# wget https://raw.githubusercontent.com/huggingface/diffusers/v0.12.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py +# python convert_stable_diffusion_checkpoint_to_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path $ONNX_ROOT/stable-diffusion-v1-5-fp32 +# python convert_stable_diffusion_checkpoint_to_onnx.py --model_path stabilityai/stable-diffusion-2-1 --output_path $ONNX_ROOT/stable-diffusion-v2-1-fp32 +# Note that this script might not be compatible with older or newer version of diffusers/transformers. It is because fusion script need change accordingly when onnx graph is changed. + # Then you can use this script to convert them to float16 like the following: -# pip3 install -U onnxruntime-gpu >= 1.14 -# python3 -m onnxruntime.transformers.models.diffusion.convert_to_fp16 -i ../stable-diffusion-v1-5 -o ../stable-diffusion-v1-5-fp16 -# Note that float16 model is intended for CUDA Execution Provider. It might not run in CPU Execution Provider. +# python optimize_pipeline.py -i $ONNX_ROOT/stable-diffusion-v1-5-fp32 -o $ONNX_ROOT/stable-diffusion-v1-5-fp16 --float16 +# python optimize_pipeline.py -i $ONNX_ROOT/stable-diffusion-v2-1-fp32 -o $ONNX_ROOT/stable-diffusion-v2-1-fp16 --float16 +# Or +# pip install -U onnxruntime-gpu >= 1.14 +# python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i $ONNX_ROOT/stable-diffusion-v1-5-fp32 -o $ONNX_ROOT/stable-diffusion-v1-5-fp16 --float16 +# python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i $ONNX_ROOT/stable-diffusion-v2-1-fp32 -o $ONNX_ROOT/stable-diffusion-v2-1-fp16 --float16 + +# Note that float16 model is for CUDA Execution Provider. It might not run in CPU Execution Provider. import argparse import logging @@ -27,51 +34,63 @@ import coloredlogs sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) +from fusion_options import FusionOptions from optimizer import optimize_model # noqa: E402 logger = logging.getLogger(__name__) -def convert_to_fp16(source_dir: Path, target_dir: Path, overwrite: bool, use_external_data_format: bool): - """Convert a model to float16 +def optimize_stable_diffusion_onnx_pipeline( + source_dir: Path, target_dir: Path, overwrite: bool, use_external_data_format: bool, float16: bool +): + """Optimize onnx models used in stable diffusion onnx pipeline and optionally convert to float16. Args: - source_dir (Path): source directory - target_dir (Path): target directory - overwrite (bool): overwrite if exists - use_external_data_format (bool): save model to two files: one for onnx graph, another for weights + source_dir (Path): Root of input directory of stable diffusion onnx pipeline with float32 models. + target_dir (Path): Root of output directory of stable diffusion onnx pipeline with optimized models. + overwrite (bool): Overwrite files if exists. 
+ use_external_data_format (bool): save onnx model to two files: one for onnx graph, another for weights + float16 (bool): use half precision Raises: RuntimeError: input onnx model does not exist RuntimeError: output onnx model path existed """ - dirs_with_onnx = ["vae_encoder", "vae_decoder", "text_encoder", "safety_checker", "unet"] + dirs_with_onnx = ["unet", "vae_encoder", "vae_decoder", "text_encoder", "safety_checker"] for name in dirs_with_onnx: onnx_model_path = source_dir / name / "model.onnx" if not os.path.exists(onnx_model_path): - raise RuntimeError(f"input onnx model does not exist: {onnx_model_path}") + message = f"input onnx model does not exist: {onnx_model_path}." + if name not in ["safety_checker", "feature_extractor"]: + raise RuntimeError(message) + continue num_heads = 0 hidden_size = 0 # Graph fusion before fp16 conversion, otherwise they cannot be fused later. # Right now, onnxruntime does not save >2GB model so we use script to optimize unet instead. + logger.info(f"optimize {onnx_model_path}...") + + fusion_options = FusionOptions("unet") + # packed kv requires compute capacity >= 7.5 (like T4, A100, RTX 2060~4090. See https://developer.nvidia.com/cuda-gpus) + # Suggest to disable it if you are using older GPU like V100, RTX 1060/1070/1080, or using float32 model. + fusion_options.enable_packed_kv = float16 + m = optimize_model( str(onnx_model_path), model_type="unet", num_heads=num_heads, hidden_size=hidden_size, opt_level=0, - optimization_options=None, + optimization_options=fusion_options, use_gpu=False, ) - # VAE-decoder in fp16 reduced quality thus we exclude it here - if name != "vae_decoder": - m.convert_float_to_float16(op_block_list=["RandomNormalLike", "Resize"]) - else: - print("skip convert vae_decoder to fp16.") + if float16: + logger.info("convert %s to float16 ...", name) + m.convert_float_to_float16(op_block_list=["RandomNormalLike", "Resize", "GroupNorm"]) optimized_model_path = target_dir / name / "model.onnx" output_dir = optimized_model_path.parent @@ -84,11 +103,11 @@ def convert_to_fp16(source_dir: Path, target_dir: Path, overwrite: bool, use_ext output_dir.mkdir(parents=True, exist_ok=True) m.save_model_to_file(str(optimized_model_path), use_external_data_format=use_external_data_format) - print(f"{onnx_model_path} => {optimized_model_path}") + logger.info("%s => %s", onnx_model_path, optimized_model_path) -def copy_extra(source_dir: Path, target_dir: Path, overwrite: bool): - """Copy extra directory. +def copy_extra_directory(source_dir: Path, target_dir: Path, overwrite: bool): + """Copy extra directory that does not have onnx model Args: source_dir (Path): source directory @@ -100,10 +119,15 @@ def copy_extra(source_dir: Path, target_dir: Path, overwrite: bool): RuntimeError: output path exists but overwrite is false. 
""" extra_dirs = ["scheduler", "tokenizer", "feature_extractor"] + for name in extra_dirs: source_path = source_dir / name + if not os.path.exists(source_path): - raise RuntimeError(f"source path does not exist: {source_path}") + message = f"source path does not exist: {source_path}" + if name not in ["safety_checker", "feature_extractor"]: + raise RuntimeError(message) + continue target_path = target_dir / name if target_path.exists(): @@ -112,7 +136,7 @@ def copy_extra(source_dir: Path, target_dir: Path, overwrite: bool): shutil.rmtree(target_path) shutil.copytree(source_path, target_path) - print(f"{source_path} => {target_path}") + logger.info("%s => %s", source_path, target_path) extra_files = ["model_index.json"] for name in extra_files: @@ -126,7 +150,7 @@ def copy_extra(source_dir: Path, target_dir: Path, overwrite: bool): raise RuntimeError(f"output path existed: {target_path}") os.remove(target_path) shutil.copyfile(source_path, target_path) - print(f"{source_path} => {target_path}") + logger.info("%s => %s", source_path, target_path) def parse_arguments(): @@ -150,8 +174,16 @@ def parse_arguments(): "--output", required=True, type=str, - help="Root of output directory of stable diffusion onnx pipeline with float16 models.", + help="Root of output directory of stable diffusion onnx pipeline with optimized models.", + ) + + parser.add_argument( + "--float16", + required=False, + action="store_true", + help="Output models of half or mixed precision.", ) + parser.set_defaults(float16=False) parser.add_argument( "--overwrite", @@ -166,7 +198,8 @@ def parse_arguments(): "--use_external_data_format", required=False, action="store_true", - help="Onnx model larger than 2GB need to use external data format.", + help="Onnx model larger than 2GB need to use external data format. " + "Save onnx model to two files: one for onnx graph, another for large weights.", ) parser.set_defaults(use_external_data_format=False) @@ -177,8 +210,10 @@ def parse_arguments(): def main(): coloredlogs.install(fmt="%(funcName)20s: %(message)s") args = parse_arguments() - copy_extra(Path(args.input), Path(args.output), args.overwrite) - convert_to_fp16(Path(args.input), Path(args.output), args.overwrite, args.use_external_data_format) + copy_extra_directory(Path(args.input), Path(args.output), args.overwrite) + optimize_stable_diffusion_onnx_pipeline( + Path(args.input), Path(args.output), args.overwrite, args.use_external_data_format, args.float16 + ) main() diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 4827facd78100..96c22b5894c60 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -977,6 +977,10 @@ def save_model_to_file(self, output_path, use_external_data_format=False, all_te logger.info("Sort graphs in topological order") self.topological_sort() + # Note: After the model is saved to another directory with external data, + # You need reload the onnx model if you want to read tensor from self.model object. + # It is because the base directory is not updated for self.model object so attempt to read tensor data + # might encounter error since external data cannot be located. 
OnnxModel.save(self.model, output_path, use_external_data_format, all_tensors_to_one_file) logger.info(f"Model saved to {output_path}") diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py index 7872cf68e7366..feba717bd8f6f 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_unet.py +++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py @@ -7,6 +7,8 @@ from typing import Optional from fusion_attention_unet import FusionAttentionUnet +from fusion_biassplitgelu import FusionBiasSplitGelu +from fusion_group_norm import FusionGroupNorm from fusion_options import FusionOptions from onnx import ModelProto from onnx_model_bert import BertOnnxModel @@ -52,11 +54,20 @@ def optimize(self, options: Optional[FusionOptions] = None): self.fuse_reshape() + if (options is None) or options.enable_group_norm: + group_norm_fusion = FusionGroupNorm(self) + group_norm_fusion.apply() + + if (options is None) or options.enable_bias_splitgelu: + bias_split_gelu_fusion = FusionBiasSplitGelu(self) + bias_split_gelu_fusion.apply() + if (options is None) or options.enable_attention: - self_attention_fusion = FusionAttentionUnet(self, self.hidden_size, self.num_heads, False) + self_attention_fusion = FusionAttentionUnet(self, self.hidden_size, self.num_heads, False, False) self_attention_fusion.apply() - cross_attention_fusion = FusionAttentionUnet(self, self.hidden_size, self.num_heads, True) + enable_packed_kv = (options is None) or options.enable_packed_kv + cross_attention_fusion = FusionAttentionUnet(self, self.hidden_size, self.num_heads, True, enable_packed_kv) cross_attention_fusion.apply() if (options is None) or options.enable_skip_layer_norm: diff --git a/onnxruntime/test/contrib_ops/bias_split_gelu_op_test.cc b/onnxruntime/test/contrib_ops/bias_split_gelu_op_test.cc new file mode 100644 index 0000000000000..3fac765d898da --- /dev/null +++ b/onnxruntime/test/contrib_ops/bias_split_gelu_op_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "gtest/gtest.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/common/cuda_op_test_utils.h" +#include "test/providers/provider_test_utils.h" + +using namespace onnxruntime::test; + +namespace onnxruntime { +namespace test { +namespace bias_split_gelu_test { +std::vector ComputeGelu(const std::vector& input_data) { + std::vector output; + output.reserve(input_data.size()); + + for (size_t i = 0; i < input_data.size(); i++) { + float x = input_data[i]; + float y = x * (0.5f * (1.0f + std::erff(x / 1.41421356237f))); + output.push_back(y); + } + return output; +} + +std::vector AddBias(const std::vector& input_data, const std::vector& bias_data) { + size_t bias_length = bias_data.size(); + + std::vector output; + output.reserve(input_data.size()); + + for (size_t i = 0; i < input_data.size(); i++) { + output.push_back(input_data[i] + bias_data[i % bias_length]); + } + return output; +} + +void Split(const std::vector& input_data, + const std::vector& input_dims, + std::vector& left_half_data, std::vector& right_half_data) { + std::size_t length = input_data.size(); + left_half_data.reserve(length / 2); + right_half_data.reserve(length / 2); + + int64_t index = 0; + for (int64_t i = 0; i < input_dims[0]; i++) { + for (int64_t j = 0; j < input_dims[1]; j++) { + for (int64_t k = 0; k < input_dims[2]; k++, index++) { + if (k < input_dims[2] / 2) { + left_half_data.push_back(input_data[index]); + } else { + right_half_data.push_back(input_data[index]); + } + } + } + } +} + +std::vector GetExpectedResult(const std::vector& input_data, + const std::vector& input_dims, + const std::vector& bias_data) { + std::vector add_bias_data = AddBias(input_data, bias_data); + std::vector left_half_data; + std::vector right_half_data; + Split(add_bias_data, input_dims, left_half_data, right_half_data); + std::vector right_gelu_data = ComputeGelu(right_half_data); + + std::vector output_data; + output_data.reserve(left_half_data.size()); + for (std::size_t i = 0; i < left_half_data.size(); i++) { + output_data.push_back(left_half_data[i] * right_gelu_data[i]); + } + return output_data; +} +} // namespace bias_split_gelu_test + +#if defined(USE_CUDA) // The operator has only CUDA implementation right now + +static void RunBiasSplitGeluGpuTest(const std::vector& input_data, + const std::vector& bias_data, + const std::vector& output_data, + const std::vector& input_dims, + const std::vector& bias_dims, + const std::vector& output_dims, + bool use_float16 = false) { + int min_cuda_architecture = use_float16 ? 
530 : 0; + if (!HasCudaEnvironment(min_cuda_architecture)) { + return; + } + + OpTester tester("BiasSplitGelu", 1, onnxruntime::kMSDomain); + + if (use_float16) { + tester.AddInput("X", input_dims, ToFloat16(input_data)); + tester.AddInput("bias", bias_dims, ToFloat16(bias_data)); + tester.AddOutput("Y", output_dims, ToFloat16(output_data)); + } else { + tester.AddInput("X", input_dims, input_data); + tester.AddInput("bias", bias_dims, bias_data); + tester.AddOutput("Y", output_dims, output_data); + } + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +} + +static void RunBiasSplitGeluTest(int64_t batch_size, int64_t sequence_length, int64_t hidden_size) { + std::vector input_dims = {batch_size, sequence_length, hidden_size}; + std::vector bias_dims = {hidden_size}; + std::vector output_dims = {batch_size, sequence_length, hidden_size / 2}; + + RandomValueGenerator random{}; + std::vector input_data = random.Gaussian(input_dims, 0.0f, 0.3f); + std::vector bias_data = random.Gaussian(bias_dims, 0.0f, 0.3f); + std::vector output_data = bias_split_gelu_test::GetExpectedResult(input_data, input_dims, bias_data); + + RunBiasSplitGeluGpuTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims); +} + +TEST(BiasSplitGeluTest, BiasSplitGeluTest_HiddenSize_2560) { + constexpr int64_t batch_size = 2; + constexpr int64_t sequence_length = 5; + constexpr int64_t hidden_size = 2560; + RunBiasSplitGeluTest(batch_size, sequence_length, hidden_size); +} + +TEST(BiasSplitGeluTest, BiasSplitGeluTest_HiddenSize_5120) { + constexpr int64_t batch_size = 2; + constexpr int64_t sequence_length = 1; + constexpr int64_t hidden_size = 5120; + RunBiasSplitGeluTest(batch_size, sequence_length, hidden_size); +} + +TEST(BiasSplitGeluTest, BiasSplitGeluTest_HiddenSize_10240) { + constexpr int64_t batch_size = 1; + constexpr int64_t sequence_length = 2; + constexpr int64_t hidden_size = 10240; + RunBiasSplitGeluTest(batch_size, sequence_length, hidden_size); +} + +#endif + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/group_norm_op_test.cc b/onnxruntime/test/contrib_ops/group_norm_op_test.cc new file mode 100644 index 0000000000000..4af51e24159ef --- /dev/null +++ b/onnxruntime/test/contrib_ops/group_norm_op_test.cc @@ -0,0 +1,436 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include +#include "test/common/tensor_op_test_utils.h" +#include "test/common/cuda_op_test_utils.h" +#include "test/framework/test_utils.h" +#include "test/providers/provider_test_utils.h" + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +using namespace std; + +namespace onnxruntime { +namespace test { + +TEST(GroupNormTest, GroupNorm_128) { + constexpr int64_t B = 2; + constexpr int64_t C = 128; + constexpr int64_t H = 2; + constexpr int64_t W = 2; + + std::vector dims{B, H, W, C}; + std::vector input_data = { + 0.696469f, 0.719469f, 0.480932f, 0.438572f, 0.182492f, 0.634401f, 0.722443f, 0.293714f, 0.430863f, 0.426351f, + 0.623953f, 0.866309f, 0.519485f, 0.603060f, 0.417022f, 0.669314f, 0.842342f, 0.194223f, 0.627249f, 0.556785f, + 0.318766f, 0.925132f, 0.304768f, 0.355915f, 0.151127f, 0.513128f, 0.321981f, 0.854452f, 0.171082f, 0.578551f, + 0.905342f, 0.901911f, 0.806969f, 0.600699f, 0.428347f, 0.093327f, 0.457412f, 0.708697f, 0.286537f, 0.664872f, + 0.438214f, 0.582671f, 0.750717f, 0.859389f, 0.081780f, 0.562218f, 0.467988f, 0.931932f, 0.378986f, 0.032198f, + 0.542636f, 0.769397f, 0.661168f, 0.425868f, 0.181629f, 0.417291f, 0.918397f, 0.313669f, 0.238250f, 0.301947f, + 0.005545f, 0.097038f, 0.798923f, 0.715601f, 0.650750f, 0.502071f, 0.374292f, 0.300610f, 0.005943f, 0.597433f, + 0.887593f, 0.045895f, 0.710162f, 0.355958f, 0.832716f, 0.016392f, 0.225498f, 0.568103f, 0.298245f, 0.587494f, + 0.003532f, 0.052091f, 0.026611f, 0.607529f, 0.389874f, 0.937668f, 0.273842f, 0.882276f, 0.422543f, 0.145264f, + 0.564570f, 0.278024f, 0.542989f, 0.232686f, 0.820574f, 0.332580f, 0.688374f, 0.675035f, 0.944366f, 0.210653f, + 0.456271f, 0.909715f, 0.444221f, 0.947120f, 0.206132f, 0.509402f, 0.322974f, 0.655923f, 0.263610f, 0.953818f, + 0.423518f, 0.020576f, 0.523488f, 0.410266f, 0.013160f, 0.467330f, 0.652154f, 0.165560f, 0.349519f, 0.645823f, + 0.089832f, 0.051901f, 0.810513f, 0.902557f, 0.882713f, 0.212831f, 0.893865f, 0.916849f, 0.286139f, 0.423106f, + 0.392118f, 0.059678f, 0.175452f, 0.849432f, 0.322959f, 0.630976f, 0.493685f, 0.893389f, 0.115618f, 0.250455f, + 0.612895f, 0.545068f, 0.681301f, 0.585937f, 0.083195f, 0.572457f, 0.723416f, 0.158960f, 0.691970f, 0.841670f, + 0.398186f, 0.762548f, 0.398876f, 0.666625f, 0.661564f, 0.384838f, 0.829113f, 0.521533f, 0.207636f, 0.983631f, + 0.394370f, 0.865864f, 0.204543f, 0.296861f, 0.753526f, 0.839243f, 0.306470f, 0.887857f, 0.765096f, 0.814844f, + 0.574064f, 0.821504f, 0.138416f, 0.122244f, 0.807938f, 0.582175f, 0.668384f, 0.744781f, 0.066774f, 0.573774f, + 0.049097f, 0.788187f, 0.321319f, 0.989035f, 0.091296f, 0.047340f, 0.807791f, 0.980582f, 0.484909f, 0.461909f, + 0.798846f, 0.410520f, 0.865460f, 0.067449f, 0.214012f, 0.634442f, 0.365719f, 0.293152f, 0.016119f, 0.710999f, + 0.958510f, 0.929764f, 0.846055f, 0.721184f, 0.875125f, 0.225463f, 0.418627f, 0.948252f, 0.480889f, 0.406779f, + 0.920149f, 0.811953f, 0.754797f, 0.908011f, 0.206115f, 0.822304f, 0.245033f, 0.092186f, 0.191336f, 0.741760f, + 0.693985f, 0.746698f, 0.464935f, 0.953697f, 0.204304f, 0.006028f, 0.491190f, 0.421200f, 0.279802f, 0.043418f, + 0.036323f, 0.617660f, 0.165066f, 0.296902f, 0.972098f, 0.405653f, 0.271480f, 0.102880f, 0.371992f, 0.918097f, + 0.109088f, 0.661717f, 0.024148f, 0.375109f, 0.232980f, 0.612682f, 0.277424f, 0.038700f, 0.648450f, 0.294307f, + 0.131115f, 0.222157f, 0.919472f, 0.392304f, 0.496508f, 0.517623f, 0.226851f, 0.980764f, 0.343178f, 0.398044f, + 0.531551f, 0.724455f, 0.361789f, 0.092105f, 0.425830f, 0.944160f, 0.317285f, 0.483034f, 0.120629f, 0.342764f, + 
0.875457f, 0.624904f, 0.763683f, 0.095713f, 0.016129f, 0.153071f, 0.554383f, 0.357398f, 0.704959f, 0.593177f, + 0.240856f, 0.105908f, 0.846506f, 0.316788f, 0.338671f, 0.002688f, 0.292489f, 0.257542f, 0.731073f, 0.983522f, + 0.450636f, 0.927584f, 0.741862f, 0.165938f, 0.665261f, 0.696311f, 0.565642f, 0.337066f, 0.751644f, 0.909872f, + 0.399379f, 0.201400f, 0.007426f, 0.206096f, 0.029320f, 0.472913f, 0.653365f, 0.102635f, 0.792299f, 0.411569f, + 0.845533f, 0.236600f, 0.463653f, 0.241686f, 0.894978f, 0.539505f, 0.988329f, 0.963004f, 0.208248f, 0.191007f, + 0.025242f, 0.993033f, 0.105446f, 0.281235f, 0.533886f, 0.632050f, 0.126958f, 0.971046f, 0.429813f, 0.148778f, + 0.123923f, 0.007738f, 0.363576f, 0.572147f, 0.453089f, 0.556035f, 0.927455f, 0.372396f, 0.680903f, 0.335544f, + 0.369291f, 0.348797f, 0.336340f, 0.709623f, 0.117398f, 0.602932f, 0.676906f, 0.559738f, 0.912132f, 0.777769f, + 0.779767f, 0.657815f, 0.470689f, 0.087408f, 0.270176f, 0.218035f, 0.932892f, 0.707115f, 0.040683f, 0.368875f, + 0.361817f, 0.950252f, 0.987351f, 0.257348f, 0.398639f, 0.625209f, 0.868315f, 0.864480f, 0.093427f, 0.943201f, + 0.709386f, 0.542860f, 0.774580f, 0.238783f, 0.998918f, 0.760210f, 0.732601f, 0.451088f, 0.612179f, 0.000082f, + 0.415504f, 0.851548f, 0.426096f, 0.804026f, 0.551315f, 0.684830f, 0.729050f, 0.737995f, 0.531828f, 0.611024f, + 0.228263f, 0.433701f, 0.312261f, 0.501837f, 0.414826f, 0.985560f, 0.826341f, 0.304121f, 0.510422f, 0.674689f, + 0.243666f, 0.885327f, 0.594432f, 0.695530f, 0.388951f, 0.043591f, 0.995358f, 0.691702f, 0.343456f, 0.130895f, + 0.553257f, 0.354265f, 0.552370f, 0.988345f, 0.520010f, 0.564359f, 0.161069f, 0.079366f, 0.547764f, 0.569004f, + 0.048579f, 0.780998f, 0.111392f, 0.440328f, 0.084904f, 0.927577f, 0.079149f, 0.128631f, 0.424307f, 0.811644f, + 0.551593f, 0.717758f, 0.635900f, 0.121754f, 0.996086f, 0.699834f, 0.518717f, 0.481026f, 0.186904f, 0.916832f, + 0.502216f, 0.095530f, 0.043223f, 0.626309f, 0.375186f, 0.341831f, 0.443368f, 0.967494f, 0.266906f, 0.236462f, + 0.232480f, 0.362277f, 0.162016f, 0.026197f, 0.777162f, 0.871683f, 0.872879f, 0.940029f, 0.596487f, 0.084822f, + 0.539960f, 0.660952f, 0.932351f, 0.500561f, 0.198366f, 0.857153f, 0.904226f, 0.349566f, 0.242220f, 0.634638f, + 0.327100f, 0.959345f, 0.301053f, 0.364187f, 0.215505f, 0.334836f, 0.580713f, 0.200401f, 0.237478f, 0.772878f, + 0.808964f, 0.346795f, 0.360424f, 0.845753f, 0.314351f, 0.483889f, 0.332754f, 0.611977f, 0.863353f, 0.815966f, + 0.408660f, 0.082653f, 0.184886f, 0.441697f, 0.280477f, 0.276902f, 0.837466f, 0.245131f, 0.924552f, 0.858917f, + 0.134613f, 0.704779f, 0.040616f, 0.230090f, 0.678095f, 0.287103f, 0.988215f, 0.980597f, 0.744615f, 0.127612f, + 0.305646f, 0.857652f, 0.922382f, 0.441324f, 0.617186f, 0.220603f, 0.779245f, 0.616006f, 0.135673f, 0.247513f, + 0.359867f, 0.546479f, 0.510376f, 0.167482f, 0.647433f, 0.875771f, 0.795605f, 0.284549f, 0.648163f, 0.697942f, + 0.717354f, 0.792651f, 0.402787f, 0.663393f, 0.701360f, 0.468060f, 0.376677f, 0.475468f, 0.298579f, 0.981118f, + 0.880607f, 0.326968f, 0.355065f, 0.336230f, 0.098184f, 0.016991f, 0.453990f, 0.115745f, 0.207050f, 0.163338f, + 0.587616f, 0.518773f, 0.952489f, 0.803843f, 0.844077f, 0.264328f, 0.097160f, 0.338377f, 0.995782f, 0.945237f, + 0.879142f, 0.501190f, 0.668073f, 0.043569f, 0.550953f, 0.663043f, 0.278687f, 0.244660f, 0.747326f, 0.768959f, + 0.756060f, 0.915355f, 0.280000f, 0.113509f, 0.430876f, 0.755474f, 0.205838f, 0.225924f, 0.265096f, 0.899392f, + 0.330582f, 0.158679f, 0.684826f, 0.544763f, 0.387195f, 0.921920f, 0.383194f, 
0.199158f, 0.220731f, 0.083348f, + 0.267982f, 0.416437f, 0.503247f, 0.229764f, 0.751615f, 0.979886f, 0.218682f, 0.785557f, 0.596404f, 0.673936f, + 0.045040f, 0.842387f, 0.478416f, 0.619851f, 0.758625f, 0.557799f, 0.428663f, 0.350140f, 0.081201f, 0.896426f, + 0.967501f, 0.668001f, 0.700871f, 0.894878f, 0.453728f, 0.157534f, 0.986580f, 0.426672f, 0.820969f, 0.739923f, + 0.352277f, 0.581123f, 0.889559f, 0.249954f, 0.012738f, 0.481115f, 0.453346f, 0.241041f, 0.520494f, 0.326236f, + 0.639721f, 0.954378f, 0.538658f, 0.690915f, 0.081894f, 0.499936f, 0.572204f, 0.182921f, 0.706266f, 0.645798f, + 0.303381f, 0.932843f, 0.404739f, 0.322655f, 0.522892f, 0.058939f, 0.563665f, 0.524866f, 0.797733f, 0.861912f, + 0.756946f, 0.534076f, 0.037392f, 0.520718f, 0.491976f, 0.965886f, 0.858428f, 0.805397f, 0.715750f, 0.242962f, + 0.121840f, 0.549413f, 0.707581f, 0.625907f, 0.103884f, 0.967437f, 0.941807f, 0.750748f, 0.391316f, 0.179390f, + 0.954144f, 0.995861f, 0.943181f, 0.225535f, 0.365521f, 0.952603f, 0.655552f, 0.984128f, 0.967362f, 0.764658f, + 0.498658f, 0.382370f, 0.076204f, 0.943615f, 0.206783f, 0.774136f, 0.219836f, 0.290086f, 0.063939f, 0.209334f, + 0.172612f, 0.684041f, 0.813314f, 0.710075f, 0.069982f, 0.338582f, 0.209592f, 0.618762f, 0.537080f, 0.754518f, + 0.657660f, 0.775365f, 0.624964f, 0.544813f, 0.650043f, 0.851819f, 0.127388f, 0.513679f, 0.920330f, 0.419923f, + 0.486112f, 0.347025f, 0.555860f, 0.550530f, 0.693655f, 0.966579f, 0.293974f, 0.196309f, 0.675409f, 0.918160f, + 0.348893f, 0.196346f, 0.473992f, 0.668433f, 0.455520f, 0.089096f, 0.405057f, 0.970099f, 0.672699f, 0.614172f, + 0.233294f, 0.329279f, 0.718766f, 0.744805f, 0.732767f, 0.195352f, 0.845798f, 0.223270f, 0.112540f, 0.858727f, + 0.458333f, 0.753204f, 0.021647f, 0.119070f, 0.378121f, 0.015745f, 0.458821f, 0.738294f, 0.802076f, 0.364342f, + 0.452341f, 0.350539f, 0.763269f, 0.449212f, 0.404651f, 0.508437f, 0.239293f, 0.483217f, 0.315162f, 0.086802f, + 0.146036f, 0.347146f, 0.495040f, 0.036045f, 0.104871f, 0.805327f, 0.475591f, 0.858913f, 0.339811f, 0.397564f, + 0.992478f, 0.147723f, 0.033954f, 0.661169f, 0.727080f, 0.537663f, 0.627922f, 0.567574f, 0.110105f, 0.385743f, + 0.760046f, 0.035033f, 0.441879f, 0.432969f, 0.852450f, 0.733128f, 0.040908f, 0.465148f, 0.525712f, 0.027543f, + 0.959939f, 0.457182f, 0.666527f, 0.031669f, 0.908842f, 0.539977f, 0.656343f, 0.466810f, 0.461138f, 0.658768f, + 0.944778f, 0.801277f, 0.274225f, 0.808626f, 0.764664f, 0.227802f, 0.657667f, 0.106055f, 0.328335f, 0.770169f, + 0.481128f, 0.905028f, 0.271492f, 0.476027f, 0.611671f, 0.727043f, 0.733395f, 0.594644f, 0.898713f, 0.196084f, + 0.859941f, 0.294517f, 0.519280f, 0.563628f, 0.251777f, 0.501324f, 0.897753f, 0.246321f, 0.324222f, 0.585902f, + 0.554412f, 0.174032f, 0.936472f, 0.827655f, 0.987936f, 0.114385f, 0.947582f, 0.246243f, 0.324910f, 0.391096f, + 0.014144f, 0.268021f, 0.689953f, 0.063691f, 0.828527f, 0.860373f, 0.081199f, 0.311536f, 0.647020f, 0.959900f, + 0.587540f, 0.239769f, 0.393420f, 0.952011f, 0.649501f, 0.701122f, 0.654753f, 0.098328f, 0.019756f, 0.307255f, + 0.101182f, 0.903178f, 0.662636f, 0.183807f, 0.383673f, 0.268124f, 0.722163f, 0.242447f, 0.870546f, 0.520290f, + 0.535141f, 0.449352f, 0.382109f, 0.030094f, 0.014841f, 0.754523f, 0.398138f, 0.080007f, 0.994005f, 0.343086f, + 0.416415f, 0.497471f, 0.518243f, 0.594622f, 0.404539f, 0.024741f, 0.205798f, 0.463358f, 0.634085f, 0.004168f, + 0.288890f, 0.318634f, 0.649971f, 0.068623f, 0.011161f, 0.617764f, 0.595074f, 0.477778f, 0.098851f, 0.284219f, + 0.982623f, 0.378369f, 0.671127f, 0.716803f, 
0.038332f, 0.175828f, 0.817099f, 0.248624f, 0.526941f, 0.143601f, + 0.318435f, 0.884003f, 0.956312f, 0.605227f, 0.516111f, 0.434986f, 0.446248f, 0.031918f, 0.876705f, 0.222946f, + 0.192030f, 0.151730f, 0.162001f, 0.931703f, 0.647385f, 0.263281f, 0.684891f, 0.196009f, 0.621328f, 0.875460f, + 0.116971f, 0.164779f, 0.810315f, 0.589415f, 0.584904f, 0.002092f, 0.368053f, 0.440462f, 0.466850f, 0.443596f, + 0.484220f, 0.870371f, 0.847502f, 0.015016f, 0.994610f, 0.624150f, 0.620991f, 0.027341f, 0.103521f, 0.971364f, + 0.694315f, 0.886678f, 0.523881f, 0.597125f, 0.947067f, 0.385271f, 0.754392f, 0.835389f, 0.975671f, 0.904114f, + 0.223580f, 0.351703f, 0.835343f, 0.052580f, 0.841164f, 0.205350f, 0.100214f, 0.310509f, 0.847647f, 0.990239f, + 0.434309f, 0.485149f, 0.367266f, 0.977029f, 0.723466f, 0.941467f, 0.249746f, 0.492914f, 0.584139f, 0.015198f, + 0.812326f, 0.527457f, 0.871326f, 0.821721f, 0.101746f, 0.594467f, 0.365567f, 0.751121f, 0.516166f, 0.369039f, + 0.557870f, 0.081583f, 0.060740f, 0.194498f, 0.932089f, 0.673928f, 0.694386f, 0.498688f, 0.422973f, 0.039913f, + 0.051126f, 0.339099f, 0.163220f, 0.351669f, 0.727191f, 0.116125f, 0.363897f, 0.637357f, 0.432239f, 0.345904f, + 0.623269f, 0.016948f, 0.826530f, 0.308751f, 0.290656f, 0.058387f, 0.264397f, 0.294895f, 0.639992f, 0.489059f, + 0.343698f, 0.929770f, 0.390125f, 0.397707f}; + + std::vector gamma_data = { + 0.447359f, 0.873295f, 0.351357f, 0.065158f, 0.442673f, 0.998459f, 0.379773f, 0.193055f, 0.045130f, 0.170969f, + 0.324064f, 0.574278f, 0.665588f, 0.042819f, 0.936180f, 0.235638f, 0.149062f, 0.530829f, 0.677586f, 0.307253f, + 0.669441f, 0.294294f, 0.902172f, 0.880695f, 0.071194f, 0.150403f, 0.698059f, 0.000120f, 0.821814f, 0.356240f, + 0.744620f, 0.044237f, 0.209264f, 0.070805f, 0.179824f, 0.384421f, 0.491552f, 0.916091f, 0.627174f, 0.706480f, + 0.082111f, 0.286787f, 0.991732f, 0.560422f, 0.787817f, 0.032482f, 0.084076f, 0.109233f, 0.015286f, 0.921979f, + 0.253635f, 0.996569f, 0.738130f, 0.250611f, 0.991805f, 0.868534f, 0.164998f, 0.185322f, 0.680186f, 0.078280f, + 0.584525f, 0.066603f, 0.221298f, 0.948440f, 0.498572f, 0.573713f, 0.269683f, 0.440062f, 0.133002f, 0.516616f, + 0.053956f, 0.048249f, 0.679648f, 0.054982f, 0.521284f, 0.266026f, 0.187694f, 0.573319f, 0.296463f, 0.456382f, + 0.138974f, 0.126486f, 0.106529f, 0.071560f, 0.553714f, 0.756005f, 0.792367f, 0.957845f, 0.168392f, 0.135619f, + 0.469955f, 0.861008f, 0.767069f, 0.558178f, 0.156783f, 0.391263f, 0.719346f, 0.373413f, 0.039119f, 0.583884f, + 0.720135f, 0.714771f, 0.164866f, 0.335992f, 0.409172f, 0.420481f, 0.114158f, 0.385532f, 0.506632f, 0.710561f, + 0.569448f, 0.404931f, 0.927597f, 0.598084f, 0.974791f, 0.867376f, 0.673626f, 0.899313f, 0.991240f, 0.220877f, + 0.691057f, 0.918779f, 0.017400f, 0.799489f, 0.089403f, 0.916554f, 0.612013f, 0.162069f}; + + std::vector beta_data = { + 0.039410f, 0.827821f, 0.139492f, 0.939541f, 0.090865f, 0.837978f, 0.423533f, 0.872735f, 0.768574f, 0.852882f, + 0.470242f, 0.713768f, 0.318668f, 0.047173f, 0.232400f, 0.001362f, 0.363028f, 0.493829f, 0.019407f, 0.007730f, + 0.686464f, 0.100436f, 0.073846f, 0.495598f, 0.718159f, 0.977165f, 0.295397f, 0.117518f, 0.068537f, 0.207511f, + 0.100055f, 0.003384f, 0.285074f, 0.164207f, 0.018250f, 0.354632f, 0.825916f, 0.303662f, 0.710100f, 0.728735f, + 0.025556f, 0.961785f, 0.139009f, 0.717465f, 0.379443f, 0.868223f, 0.994961f, 0.193323f, 0.819456f, 0.505503f, + 0.965431f, 0.658089f, 0.593238f, 0.229523f, 0.718700f, 0.288201f, 0.845759f, 0.977264f, 0.007793f, 0.954633f, + 0.358460f, 0.488316f, 0.924086f, 
0.775958f, 0.243222f, 0.096853f, 0.841226f, 0.747060f, 0.858339f, 0.384041f, + 0.492114f, 0.465019f, 0.314722f, 0.335672f, 0.718649f, 0.753071f, 0.863854f, 0.844902f, 0.753938f, 0.332778f, + 0.710046f, 0.972624f, 0.916240f, 0.971488f, 0.036208f, 0.611599f, 0.215343f, 0.246560f, 0.844061f, 0.750192f, + 0.328802f, 0.519915f, 0.188330f, 0.003827f, 0.899958f, 0.709642f, 0.528818f, 0.054099f, 0.420840f, 0.380042f, + 0.171547f, 0.156188f, 0.173178f, 0.596836f, 0.124704f, 0.238549f, 0.946272f, 0.219462f, 0.763857f, 0.598040f, + 0.413157f, 0.595286f, 0.133620f, 0.484188f, 0.972134f, 0.427721f, 0.242881f, 0.927507f, 0.610774f, 0.727857f, + 0.543405f, 0.011202f, 0.755700f, 0.978697f, 0.716188f, 0.808757f, 0.851587f, 0.999201f}; + + std::vector norm_data = { + 0.406306f, 1.632045f, 0.095849f, 0.919355f, -0.458834f, 1.632483f, 0.876482f, 0.729815f, 0.750835f, + 0.782631f, 0.590117f, 1.476163f, 0.183714f, 0.057787f, -0.474648f, 0.143954f, 0.561618f, 0.031635f, + 0.426744f, 0.118848f, 0.054676f, 0.526575f, -0.827396f, -0.206514f, 0.631899f, 1.033381f, -0.028056f, + 0.117742f, -0.928939f, 0.254703f, 1.002641f, 0.056505f, 0.502409f, 0.186869f, -0.032152f, -0.201724f, + 0.683548f, 0.900928f, 0.126877f, 1.073324f, -0.017409f, 0.957481f, 0.710492f, 1.254686f, -0.620889f, + 0.882544f, 1.003820f, 0.385277f, 0.814893f, -0.841305f, 1.028838f, 1.664626f, 0.982238f, 0.150513f, + -0.461095f, -0.012286f, 1.094831f, 0.900296f, -0.437987f, 0.919201f, -0.604762f, 0.398245f, 1.126501f, + 1.388226f, 0.740287f, 0.352386f, 0.833504f, 0.614170f, 0.687727f, 0.626510f, 0.563813f, 0.408836f, + 0.651389f, 0.307533f, 1.158524f, 0.360064f, 0.588918f, 0.904664f, 0.418446f, 0.420879f, 0.495571f, + 0.796672f, 0.759542f, 0.996513f, -0.328335f, 1.636925f, -0.644444f, 1.350502f, 0.891792f, 0.600690f, + 0.795602f, 0.142066f, -0.015730f, -0.867947f, 1.039989f, 0.261774f, 1.182381f, 0.375100f, 0.493101f, + -0.112225f, 0.136779f, 1.225890f, 0.158450f, 1.142486f, -0.296101f, 0.228868f, 0.873088f, 0.397857f, + 0.432766f, 1.815673f, 0.353312f, -0.006854f, 0.251850f, 0.343477f, -0.497336f, 0.382225f, 0.758787f, + 0.117172f, 0.342274f, 0.892228f, -0.293386f, -1.206122f, 0.772336f, 1.964310f, 0.807267f, -0.553660f, + 1.500599f, 1.184999f, -0.397960f, 0.498094f, -0.040874f, 0.811189f, -0.472885f, 2.600490f, 0.192458f, + 1.023374f, 0.762038f, 1.098150f, -0.060817f, 0.078648f, 0.518953f, 0.044398f, 0.859423f, 0.038016f, + 0.176986f, 0.714081f, 0.648229f, -0.296623f, 1.040141f, 0.429690f, -0.494966f, 1.206059f, 0.709146f, + 1.134488f, 1.010104f, 0.117495f, 0.857719f, 0.187595f, -0.713799f, 0.068448f, 0.201653f, 0.252268f, + -0.172338f, 0.070818f, 1.228964f, 1.349056f, 0.173722f, 1.663625f, 0.077027f, 1.191751f, 0.094092f, + 1.179984f, -0.462022f, 0.831658f, 1.105588f, 0.249245f, 0.829719f, 1.360636f, 0.624319f, 1.011229f, + -0.634975f, 0.475544f, 0.034839f, 1.765260f, 0.660444f, 0.743193f, 0.795097f, 1.088295f, 0.300263f, + 0.476736f, 1.126446f, 0.453643f, 1.137416f, -0.572652f, 0.673148f, 1.159168f, 0.829472f, 0.160861f, + 0.424527f, 0.503895f, 1.131327f, 0.397239f, 1.178295f, 0.893187f, 1.147333f, 0.005007f, 0.581892f, + 1.174909f, 0.703488f, 0.937277f, 1.057870f, 1.042361f, 0.414783f, 1.554468f, -0.841804f, 1.139242f, + 0.742398f, 0.564714f, -0.081046f, 2.137630f, 0.467942f, 0.330163f, 0.807147f, 1.276604f, -0.094400f, + -0.540889f, 0.428099f, 0.338536f, -0.296175f, -0.883684f, -0.070659f, 0.765356f, -0.351806f, -0.067355f, + 1.118756f, 0.077982f, 0.446440f, -0.258010f, 0.252682f, 1.239576f, -0.979634f, 0.825275f, -0.463020f, + 0.125961f, 
-0.208515f, 1.494655f, 0.097464f, 0.432844f, 0.867343f, -0.536458f, 0.736790f, 0.328702f, + 0.819557f, 0.061518f, 0.591131f, 0.943027f, -0.514167f, 2.631821f, -0.116213f, 0.907785f, 0.237840f, + 2.037882f, 0.258945f, 0.554331f, 0.749937f, 1.132450f, 0.197422f, 0.606424f, -1.247748f, -0.002311f, + 1.839518f, 0.087527f, 0.521764f, -0.146106f, -0.980738f, -0.302773f, 0.676835f, -0.132461f, 0.596699f, + 0.617694f, 0.659876f, 0.765150f, 1.575500f, 0.117460f, -0.473908f, -0.423069f, -0.505049f, -0.037672f, + 0.447086f, 0.281287f, -0.018190f, 0.915389f, 1.207481f, -0.962214f, 1.016921f, 1.156551f, 0.019404f, + 0.709657f, 0.713726f, 1.354227f, 0.270004f, 0.840813f, 0.865947f, 0.102975f, 0.796979f, 0.520542f, + 1.122967f, -0.562413f, 1.328713f, 0.137686f, 1.895931f, -0.574054f, 0.856002f, 0.857834f, 0.983861f, + 0.978393f, 1.250703f, 0.584533f, 0.704302f, -0.218810f, -0.416660f, 1.397336f, 0.564530f, 0.582539f, + 0.895726f, 0.679485f, 0.442242f, 0.541062f, 0.109609f, 0.275143f, 0.107930f, 0.353517f, 0.707609f, + 0.915281f, 0.628682f, 0.355126f, 0.897993f, 0.923647f, 0.977992f, 0.935513f, -0.370250f, -0.000332f, + -0.462323f, 0.742310f, 0.634979f, 0.910902f, 1.059454f, 1.354346f, 1.166715f, 0.402587f, 1.013271f, + 0.793169f, 0.608214f, -0.429466f, 0.396397f, -0.096419f, 1.306138f, 0.732527f, -0.068210f, 0.480573f, + -0.084915f, 0.843406f, 1.124528f, -0.111570f, 0.667385f, 1.014872f, 1.221989f, 1.165116f, -1.026174f, + 1.364619f, 1.676924f, 0.592108f, 1.041303f, 0.342757f, 2.547432f, 0.978781f, 1.042198f, -0.103338f, + 0.761959f, -0.205143f, 0.651057f, 1.635670f, 0.429972f, 1.116617f, 0.121796f, 1.499508f, 0.477808f, + 1.004834f, 0.238391f, 1.527244f, 0.030314f, 0.851662f, 0.729685f, 0.833627f, 0.322326f, 1.746771f, + 1.284994f, -0.011233f, -0.003167f, 0.150784f, 0.258291f, 1.278590f, 0.351162f, 0.263747f, 0.240001f, + -0.496733f, 1.630098f, 0.959952f, 0.691867f, 0.781609f, 0.678993f, 0.117479f, 0.106319f, 0.737017f, + 0.054679f, 0.007170f, 0.031594f, 0.058290f, 0.042648f, 0.435231f, -0.069486f, 1.149115f, -0.284731f, + 0.478892f, -0.119480f, 1.305503f, -1.632824f, -0.186219f, 0.339929f, 0.911391f, 1.028848f, 0.301977f, + 0.828055f, -0.564568f, 1.414307f, 1.432279f, 0.605853f, 0.199995f, -0.442368f, 1.540784f, 0.876256f, + 0.771619f, -0.860229f, 1.000022f, 0.093107f, 0.450905f, 0.872359f, 2.159872f, 0.030324f, -0.212928f, + 0.691624f, 0.714844f, 0.749217f, -0.247668f, 0.546163f, 0.526861f, 0.965844f, 0.398844f, 0.808378f, + 0.411826f, 0.859227f, 1.148453f, 1.279391f, 0.239180f, 0.580433f, 1.115814f, 1.052553f, 0.938658f, + -0.629015f, 0.794400f, -0.489248f, 1.621988f, 0.789545f, 0.749079f, -0.024277f, 0.386545f, 0.105109f, + -0.943201f, 0.658228f, 0.981167f, 1.500447f, -0.074319f, 0.409342f, 1.247461f, -0.211410f, 0.188935f, + 0.095841f, 0.758850f, 0.595416f, 0.656214f, 0.905517f, -0.334851f, 0.295979f, 0.567667f, 0.073956f, + 0.349117f, 1.184909f, 0.027066f, 2.348872f, 1.470366f, -0.435509f, 1.778383f, -0.706661f, 0.577661f, + 0.928943f, -0.556358f, 0.781633f, 2.151912f, 0.761094f, -0.845767f, 0.154289f, 1.149119f, 0.798352f, + 0.738550f, 0.334614f, 0.879107f, 0.544361f, 1.241048f, -0.116848f, 0.680143f, 0.749525f, 0.903967f, + 0.521876f, 0.044571f, 0.636013f, 0.101543f, 1.160492f, -0.183245f, 0.374511f, 0.647929f, 0.272462f, + 0.221596f, 0.477392f, 0.293685f, 0.793616f, 0.434287f, 0.685965f, 0.952972f, -0.181046f, 0.117678f, + 1.207595f, -0.074850f, -0.407813f, -0.030066f, 0.048566f, 0.067439f, 0.001379f, -0.060172f, 0.280818f, + -0.846823f, 0.816884f, 0.685541f, 0.148579f, 1.228496f, 1.213856f, 
0.082768f, -0.675884f, 0.850093f, + 1.127087f, 0.347097f, 0.837999f, 0.524987f, 1.104285f, -0.759169f, 0.635460f, 0.345593f, -0.202532f, + -0.625566f, 0.947332f, 1.108524f, 0.451968f, 1.059174f, -0.345810f, 0.363634f, 0.791247f, 1.440297f, + -0.205800f, -0.385072f, 0.646567f, 1.271695f, 0.794720f, -0.208087f, 0.540560f, 0.482087f, -0.006379f, + 0.408202f, 0.465379f, 0.459783f, 0.691498f, 0.068571f, 0.526139f, 0.197028f, 0.714096f, 0.845790f, + 1.019175f, 1.102269f, -0.858393f, 1.114611f, 0.139467f, 0.453660f, 0.601039f, 0.970108f, 0.433405f, + 1.179632f, 1.010146f, 0.152422f, 0.860524f, 0.488055f, -0.402708f, 0.493756f, 0.475332f, 0.663021f, + 0.781955f, 1.281084f, 0.160415f, 0.198334f, 0.888878f, 0.253592f, 1.097622f, 0.628367f, 0.510147f, + 0.872970f, 1.314789f, 0.231785f, -1.853665f, 0.667719f, 1.129714f, -0.395025f, 0.606279f, 0.720905f, + 1.613478f, 1.226488f, 1.121590f, 1.437285f, 0.732910f, 1.502128f, 0.744201f, -0.186752f, 1.338321f, + 1.092864f, -0.237060f, 2.343516f, 0.055512f, 0.903969f, 0.112821f, -0.874084f, 0.501252f, 0.883760f, + 0.825862f, 1.112298f, 0.830550f, 0.857847f, -0.774066f, 0.048746f, 0.173347f, 0.374310f, 0.508790f, + 0.892303f, 0.267807f, -0.501987f, -0.221902f, 0.168966f, 0.814484f, 0.951160f, 0.628689f, 1.171188f, + 1.143145f, 0.117596f, -0.374746f, -0.281733f, 1.347597f, 0.084762f, 0.563618f, 0.110448f, -0.044959f, + 0.876871f, 1.021856f, 1.680601f, 1.617434f, 1.269441f, 0.006120f, 0.766407f, -1.697397f, 1.538109f, + -0.396112f, 0.895946f, 0.915749f, 0.115267f, 0.798699f, -0.323668f, 0.707952f, 1.253857f, 1.336810f, + 0.388281f, -0.952190f, -0.330132f, 0.567214f, 0.989273f, -0.186156f, 1.005235f, 0.538783f, 0.540277f, + 0.963361f, 0.639664f, 0.462482f, 0.698468f, 0.534505f, 0.759512f, 1.099447f, 0.316237f, 0.498906f, + 0.445022f, 0.377702f, 0.339785f, 1.007626f, 1.143811f, 0.735108f, 0.274017f, 0.909311f, 0.923411f, + 0.633157f, 0.829842f, 0.907585f, 1.018174f, -0.330711f, -1.004036f, -0.470611f, 1.595124f, 1.007291f, + 0.851145f, -0.009376f, 0.217997f, 0.887566f, 0.570916f, 1.051678f, 0.245246f, 1.265329f, -0.268568f, + 0.373900f, 1.000752f, 0.128213f, 0.902059f, -0.106214f, 0.149962f, 0.074347f, -0.311712f, 0.962363f, + 0.626313f, 1.394106f, 0.275462f, 0.349021f, 0.389777f, 1.786896f, 0.567943f, 0.881495f, 0.817815f, + -0.143777f, 1.279913f, 0.339589f, 0.467706f, -0.153407f, -0.046937f, 0.766692f, -0.240678f, 0.593997f, + 1.864102f, 0.830787f, 1.217034f, -0.176123f, 0.595660f, 0.827656f, 0.861351f, -0.710248f, 1.412525f, + 0.737254f, 0.893155f, 0.796258f, 0.917900f, 0.020787f, 0.528776f, 0.896313f, -0.023476f, 0.010474f, + -0.061789f, 0.504972f, 0.727948f, -1.691226f, -0.209513f, 0.783358f, -0.402073f, 1.660988f, 0.398667f, + 0.746822f, 0.756122f, 1.075280f, 0.117522f, 0.482337f, 0.121187f, -0.097000f, 0.026081f, 0.564591f, + 0.229187f, -0.092778f, 0.715658f, 1.202137f, -0.648320f, 0.964561f, -0.294534f, -0.047344f, 1.191577f, + -0.162200f, 1.455440f, -0.230969f, 0.864577f, 1.022470f, 0.269888f, 0.830973f, 0.796731f, 1.288781f, + -0.279808f, 1.461457f, 0.011112f, 0.661665f, 0.377751f, 0.597034f, 0.896032f, 0.864871f, 0.834800f, + -0.242229f, 0.489711f, 0.900796f, -0.769517f, 0.893398f, 0.656636f, 1.234794f, 0.229293f, 1.113528f, + -0.032344f, 0.465116f, 0.453282f, -0.855888f, 0.287742f, 1.001159f, 0.339036f, 1.053392f, 1.481772f, + 0.350476f, 0.045156f, 0.789485f, 1.194247f, 0.953225f, 0.902432f, -0.469070f, 1.620967f, 0.308757f, + 0.558440f, 0.995676f, 0.582246f, -0.395102f, 0.145108f, -1.011727f, 0.925334f, 1.007595f, 0.227135f, + 0.257161f, -0.217773f, 
0.446225f, -0.090537f, 1.239298f, 0.278935f, 0.210654f, 0.565323f, 0.079686f, + -0.291973f, 0.796541f, 0.646783f, 0.600274f, -0.508244f, 1.545499f, 0.378070f, 0.104429f, 0.718873f, + 1.460520f, 1.208726f, 0.296987f, -0.352711f, -0.089663f, 0.797042f, 1.431477f, -1.527740f, 0.749836f, + 0.820989f, 0.769196f, -0.563369f, -0.191057f, 1.076530f, 0.250859f, 0.857584f, -0.346350f, 0.894605f, + 0.886723f, 0.338762f, 0.656447f, 1.024669f, 0.693469f, 0.659168f, 0.905854f, 0.224581f, 0.357502f, + -0.007332f, -0.390864f, 0.307303f, 0.571301f, 0.437075f, -0.311736f, -0.249217f, 0.585570f, -0.397286f, + 1.381788f, -0.368342f, 0.647196f, 0.809376f, -0.462215f, 0.117660f, 0.453367f, -0.164130f, 0.558638f, + -0.054476f, 0.367439f, 0.244487f, -0.175144f, -0.005267f, 1.277564f, 0.465179f, 0.811168f, -0.541415f, + -0.034877f, 0.830097f, -0.216335f, 0.466843f, 0.311936f, 0.906072f, 1.086701f, 0.017932f, 0.843566f, + 0.882529f, 1.066624f, -0.810174f, -0.560630f, 0.625432f, 1.290380f, 1.393908f, 0.789381f, 0.972095f, + 1.008577f, 0.881400f, 0.765357f, 0.556296f, 1.274361f, 2.005213f, -0.179109f, -0.167324f, 1.110618f, + 0.147224f, 1.058541f, -0.114418f, 0.418016f, 0.438177f, 1.042157f, 0.420788f, 0.554656f, 0.714696f, + 0.778748f, 1.693937f, 0.954506f, 0.957155f, 0.581167f, 0.971378f, 0.951858f, 0.841796f, 0.464267f, + 0.329466f, 1.016010f, 1.023249f, 0.637742f, 0.840873f, 0.229559f, 1.614064f, 0.264498f, -0.269998f, + 0.941741f, 0.066780f, -0.447346f, -0.301152f, 0.471130f, 0.673516f, 0.764475f, 0.221142f, 0.141437f, + 0.050416f, -0.363394f, 0.133119f, 0.851959f, 0.138650f, 1.246940f, -0.408690f, 0.153658f, 0.840290f, + 0.181189f, 0.244843f, 1.995885f, -1.411448f, 1.422581f, 0.658642f, 0.243404f, 0.442854f, 0.230959f, + -0.272532f, 0.778544f, 1.461264f, 0.670758f, 2.274148f, 0.642745f, 0.948315f}; + + std::vector swish_data = { + 0.243866f, 1.365124f, 0.050220f, 0.657257f, -0.177689f, 1.365588f, 0.618877f, 0.492453f, 0.510088f, + 0.537078f, 0.379677f, 1.201586f, 0.100271f, 0.029728f, -0.182035f, 0.077149f, 0.357653f, 0.016068f, + 0.258221f, 0.062951f, 0.028085f, 0.331049f, -0.251691f, -0.092633f, 0.412580f, 0.762192f, -0.013831f, + 0.062333f, -0.263020f, 0.143483f, 0.733510f, 0.029050f, 0.313013f, 0.102139f, -0.015817f, -0.090723f, + 0.454239f, 0.640686f, 0.067457f, 0.799871f, -0.008629f, 0.691892f, 0.476392f, 0.976283f, -0.217050f, + 0.624266f, 0.734605f, 0.229296f, 0.564844f, -0.253452f, 0.757935f, 1.399714f, 0.714629f, 0.080910f, + -0.178318f, -0.006105f, 0.820346f, 0.640120f, -0.171787f, 0.657118f, -0.213635f, 0.238256f, 0.850725f, + 1.111010f, 0.501217f, 0.206920f, 0.581032f, 0.398530f, 0.457656f, 0.408295f, 0.359337f, 0.245632f, + 0.428173f, 0.177226f, 0.881711f, 0.212098f, 0.378743f, 0.644037f, 0.252370f, 0.254082f, 0.307957f, + 0.549116f, 0.517441f, 0.727826f, -0.137456f, 1.370297f, -0.221845f, 1.072585f, 0.632512f, 0.387934f, + 0.548196f, 0.076070f, -0.007803f, -0.256636f, 0.768393f, 0.147922f, 0.904965f, 0.222318f, 0.306135f, + -0.052967f, 0.073060f, 0.947734f, 0.085489f, 0.866159f, -0.126290f, 0.127472f, 0.615866f, 0.237987f, + 0.262488f, 1.561563f, 0.207543f, -0.003415f, 0.141699f, 0.200946f, -0.188076f, 0.227198f, 0.516802f, + 0.062015f, 0.200143f, 0.632902f, -0.125327f, -0.277876f, 0.528298f, 1.722697f, 0.558246f, -0.202095f, + 1.226985f, 0.907526f, -0.159901f, 0.309820f, -0.020019f, 0.561637f, -0.181556f, 2.420778f, 0.105460f, + 0.752824f, 0.519554f, 0.823517f, -0.029484f, 0.040870f, 0.325333f, 0.022692f, 0.603779f, 0.019369f, + 0.096304f, 0.479364f, 0.425634f, -0.126475f, 0.768537f, 0.260306f, 
-0.187456f, 0.928184f, 0.475279f, + 0.858428f, 0.740447f, 0.062195f, 0.602277f, 0.102570f, -0.234669f, 0.035395f, 0.110958f, 0.141960f, + -0.078762f, 0.036662f, 0.950773f, 1.071117f, 0.094387f, 1.398649f, 0.039996f, 0.914138f, 0.049258f, + 0.902623f, -0.178574f, 0.579420f, 0.830635f, 0.140073f, 0.577730f, 1.082880f, 0.406556f, 0.741494f, + -0.219945f, 0.293266f, 0.017723f, 1.507298f, 0.435470f, 0.503657f, 0.547762f, 0.814111f, 0.172503f, + 0.294135f, 0.850672f, 0.277405f, 0.861257f, -0.206513f, 0.445763f, 0.882337f, 0.577514f, 0.086886f, + 0.256654f, 0.314115f, 0.855378f, 0.237559f, 0.900973f, 0.633758f, 0.870852f, 0.002510f, 0.373285f, + 0.897667f, 0.470606f, 0.673480f, 0.785239f, 0.770623f, 0.249797f, 1.283304f, -0.253514f, 0.863022f, + 0.502989f, 0.360029f, -0.038882f, 1.912126f, 0.287736f, 0.192089f, 0.558143f, 0.998140f, -0.044974f, + -0.199037f, 0.259179f, 0.197649f, -0.126316f, -0.258402f, -0.034082f, 0.522367f, -0.145276f, -0.032544f, + 0.843271f, 0.040511f, 0.272236f, -0.112454f, 0.142219f, 0.961279f, -0.267405f, 0.573859f, -0.178851f, + 0.066942f, -0.093427f, 1.220799f, 0.051105f, 0.262543f, 0.610777f, -0.197959f, 0.498287f, 0.191122f, + 0.568889f, 0.031705f, 0.380467f, 0.678707f, -0.192410f, 2.455177f, -0.054734f, 0.646839f, 0.132996f, + 1.802949f, 0.146142f, 0.352078f, 0.509331f, 0.856461f, 0.108424f, 0.392432f, -0.278360f, -0.001154f, + 1.587304f, 0.045678f, 0.327438f, -0.067726f, -0.267492f, -0.128642f, 0.448763f, -0.061851f, 0.384811f, + 0.401312f, 0.435012f, 0.522193f, 1.305406f, 0.062175f, -0.181835f, -0.167443f, -0.190078f, -0.018481f, + 0.272698f, 0.160295f, -0.009012f, 0.653681f, 0.929582f, -0.265990f, 0.746799f, 0.879794f, 0.009796f, + 0.475701f, 0.479070f, 1.076367f, 0.153117f, 0.587422f, 0.609541f, 0.054136f, 0.549380f, 0.326523f, + 0.847322f, -0.204150f, 1.050518f, 0.073575f, 1.648380f, -0.206833f, 0.600764f, 0.602378f, 0.716126f, + 0.711085f, 0.972323f, 0.375335f, 0.471277f, -0.097483f, -0.165546f, 1.120330f, 0.359888f, 0.373787f, + 0.636029f, 0.450923f, 0.269234f, 0.341984f, 0.057805f, 0.156379f, 0.056874f, 0.207681f, 0.474008f, + 0.653584f, 0.410021f, 0.208764f, 0.638058f, 0.661132f, 0.710716f, 0.671878f, -0.151240f, -0.000166f, + -0.178658f, 0.502916f, 0.415034f, 0.649641f, 0.786735f, 1.076488f, 0.889679f, 0.241274f, 0.743397f, + 0.546106f, 0.393839f, -0.169319f, 0.236975f, -0.045887f, 1.027756f, 0.494719f, -0.032942f, 0.296938f, + -0.040656f, 0.589695f, 0.848825f, -0.052676f, 0.441086f, 0.744888f, 0.943881f, 0.888123f, -0.270732f, + 1.086931f, 1.412803f, 0.381228f, 0.769628f, 0.200465f, 2.362491f, 0.711443f, 0.770470f, -0.049002f, + 0.519488f, -0.092087f, 0.427906f, 1.368965f, 0.260506f, 0.841215f, 0.064602f, 1.225849f, 0.294918f, + 0.735547f, 0.133336f, 1.254788f, 0.015386f, 0.596943f, 0.492345f, 0.581139f, 0.186914f, 1.487454f, + 1.006534f, -0.005585f, -0.001581f, 0.081065f, 0.145732f, 1.000125f, 0.206097f, 0.149164f, 0.134332f, + -0.187918f, 1.363060f, 0.694153f, 0.461047f, 0.536204f, 0.450521f, 0.062186f, 0.055983f, 0.498477f, + 0.028087f, 0.003598f, 0.016046f, 0.029994f, 0.021779f, 0.264239f, -0.033536f, 0.872580f, -0.122234f, + 0.295709f, -0.056175f, 1.027117f, -0.266875f, -0.084465f, 0.198578f, 0.650082f, 0.757945f, 0.173615f, + 0.576280f, -0.204651f, 1.137731f, 1.156216f, 0.391983f, 0.109964f, -0.173044f, 1.268957f, 0.618676f, + 0.527688f, -0.255739f, 0.731079f, 0.048719f, 0.275437f, 0.615220f, 1.936515f, 0.015392f, -0.095172f, + 0.460849f, 0.479997f, 0.508724f, -0.108577f, 0.345855f, 0.331264f, 0.699551f, 0.238672f, 0.559207f, + 0.247724f, 0.603606f, 
0.871938f, 1.000926f, 0.133824f, 0.372154f, 0.840444f, 0.780221f, 0.674734f, + -0.218730f, 0.547163f, -0.185949f, 1.354473f, 0.542996f, 0.508608f, -0.011991f, 0.230168f, 0.055314f, + -0.264336f, 0.433681f, 0.713642f, 1.226827f, -0.035779f, 0.245986f, 0.969103f, -0.094573f, 0.103365f, + 0.050215f, 0.516856f, 0.383809f, 0.432058f, 0.644802f, -0.139653f, 0.169732f, 0.362299f, 0.038345f, + 0.204724f, 0.907438f, 0.013716f, 2.144154f, 1.195574f, -0.171073f, 1.521402f, -0.233436f, 0.370009f, + 0.665922f, -0.202732f, 0.536225f, 1.927784f, 0.518755f, -0.254002f, 0.083084f, 0.872584f, 0.550561f, + 0.499761f, 0.195041f, 0.621209f, 0.344486f, 0.962738f, -0.055014f, 0.451459f, 0.508984f, 0.643412f, + 0.327522f, 0.022782f, 0.415858f, 0.053347f, 0.883625f, -0.083251f, 0.221916f, 0.425394f, 0.154676f, + 0.123024f, 0.294614f, 0.168252f, 0.546489f, 0.263568f, 0.456214f, 0.687772f, -0.082351f, 0.062297f, + 0.929695f, -0.036025f, -0.162895f, -0.014807f, 0.024873f, 0.034856f, 0.000690f, -0.029181f, 0.159995f, + -0.254131f, 0.566570f, 0.455867f, 0.079798f, 0.950309f, 0.935859f, 0.043096f, -0.227895f, 0.595564f, + 0.851290f, 0.203369f, 0.584960f, 0.329856f, 0.829387f, -0.242043f, 0.415417f, 0.202362f, -0.091046f, + -0.218020f, 0.682626f, 0.833448f, 0.276201f, 0.786472f, -0.143303f, 0.214515f, 0.544456f, 1.164481f, + -0.092349f, -0.155917f, 0.424301f, 0.993236f, 0.547438f, -0.093257f, 0.341603f, 0.298046f, -0.003180f, + 0.245189f, 0.285878f, 0.281830f, 0.460745f, 0.035461f, 0.330722f, 0.108188f, 0.479376f, 0.591785f, + 0.748902f, 0.827456f, -0.255522f, 0.839288f, 0.074588f, 0.277417f, 0.388207f, 0.703465f, 0.262941f, + 0.902279f, 0.740486f, 0.082008f, 0.604751f, 0.302422f, -0.161350f, 0.306618f, 0.293111f, 0.437553f, + 0.536501f, 1.002620f, 0.086627f, 0.108969f, 0.629911f, 0.142787f, 0.823012f, 0.409770f, 0.318761f, + 0.615761f, 1.036466f, 0.129264f, -0.251066f, 0.441357f, 0.853822f, -0.159001f, 0.392318f, 0.485029f, + 1.345469f, 0.948325f, 0.845997f, 1.161375f, 0.495040f, 1.228578f, 0.504504f, -0.084682f, 1.060236f, + 0.818468f, -0.104546f, 2.138265f, 0.028526f, 0.643413f, 0.059589f, -0.257335f, 0.312156f, 0.625349f, + 0.574370f, 0.837068f, 0.578454f, 0.602389f, -0.244295f, 0.024967f, 0.094167f, 0.221779f, 0.317751f, + 0.632969f, 0.151727f, -0.189286f, -0.098691f, 0.091604f, 0.564490f, 0.686118f, 0.410026f, 0.894037f, + 0.866797f, 0.062251f, -0.152669f, -0.121153f, 1.069637f, 0.044176f, 0.359187f, 0.058271f, -0.021974f, + 0.619223f, 0.751405f, 1.416720f, 1.349653f, 0.990985f, 0.003070f, 0.523259f, -0.262766f, 1.266156f, + -0.159335f, 0.636225f, 0.654005f, 0.060952f, 0.550860f, -0.135870f, 0.474291f, 0.975459f, 1.058707f, + 0.231365f, -0.265132f, -0.138064f, 0.361950f, 0.721125f, -0.084439f, 0.735919f, 0.340258f, 0.341388f, + 0.697275f, 0.418773f, 0.283780f, 0.466470f, 0.337023f, 0.517416f, 0.824758f, 0.182914f, 0.310421f, + 0.271221f, 0.224097f, 0.198482f, 0.738142f, 0.867442f, 0.496878f, 0.155663f, 0.648210f, 0.660919f, + 0.413581f, 0.577837f, 0.646659f, 0.747968f, -0.138260f, -0.269231f, -0.180937f, 1.326083f, 0.737830f, + 0.596488f, -0.004666f, 0.120832f, 0.628741f, 0.364801f, 0.779395f, 0.137584f, 0.986884f, -0.116360f, + 0.221499f, 0.731756f, 0.068210f, 0.641700f, -0.050289f, 0.080593f, 0.038555f, -0.131760f, 0.696361f, + 0.408138f, 1.117023f, 0.156582f, 0.204659f, 0.232396f, 1.530559f, 0.362511f, 0.623333f, 0.567378f, + -0.066730f, 1.001448f, 0.198351f, 0.287564f, -0.070832f, -0.022918f, 0.523501f, -0.105927f, 0.382702f, + 1.613891f, 0.578661f, 0.938992f, -0.080327f, 0.384000f, 0.575932f, 0.605480f, 
-0.234058f, 1.135901f, + 0.498675f, 0.633730f, 0.548760f, 0.655944f, 0.010501f, 0.332705f, 0.636554f, -0.011600f, 0.005264f, + -0.029940f, 0.314914f, 0.490896f, -0.263180f, -0.093822f, 0.537700f, -0.161157f, 1.395845f, 0.238550f, + 0.506708f, 0.514549f, 0.801729f, 0.062210f, 0.298229f, 0.064261f, -0.046150f, 0.013210f, 0.359935f, + 0.127668f, -0.044238f, 0.480672f, 0.924329f, -0.222613f, 0.698375f, -0.125735f, -0.023112f, 0.913967f, + -0.074537f, 1.180120f, -0.102207f, 0.608330f, 0.751979f, 0.153044f, 0.578823f, 0.549166f, 1.010328f, + -0.120458f, 1.186345f, 0.005587f, 0.436457f, 0.224131f, 0.385073f, 0.636303f, 0.608590f, 0.582164f, + -0.106517f, 0.303639f, 0.640568f, -0.243616f, 0.633947f, 0.432398f, 0.956541f, 0.127733f, 0.838249f, + -0.015911f, 0.285687f, 0.277146f, -0.255225f, 0.164428f, 0.732134f, 0.197982f, 0.781012f, 1.207408f, + 0.205636f, 0.023088f, 0.542946f, 0.916584f, 0.688003f, 0.642034f, -0.180515f, 1.353391f, 0.178024f, + 0.355219f, 0.727050f, 0.373560f, -0.159025f, 0.077809f, -0.269769f, 0.662657f, 0.738113f, 0.126410f, + 0.145023f, -0.097077f, 0.272082f, -0.043221f, 0.961003f, 0.158794f, 0.116380f, 0.360497f, 0.041430f, + -0.124825f, 0.549003f, 0.424474f, 0.387609f, -0.190899f, 1.273897f, 0.224349f, 0.054939f, 0.483341f, + 1.185376f, 0.930808f, 0.170383f, -0.145573f, -0.042823f, 0.549434f, 1.155390f, -0.272434f, 0.509246f, + 0.570132f, 0.525628f, -0.204372f, -0.086430f, 0.802915f, 0.141080f, 0.602158f, -0.143482f, 0.635026f, + 0.627989f, 0.197799f, 0.432245f, 0.754035f, 0.462362f, 0.434440f, 0.645105f, 0.124847f, 0.210367f, + -0.003652f, -0.157717f, 0.177076f, 0.365097f, 0.265550f, -0.131768f, -0.109161f, 0.376140f, -0.159695f, + 1.104433f, -0.150630f, 0.424806f, 0.560069f, -0.178628f, 0.062287f, 0.277207f, -0.075345f, 0.355371f, + -0.026496f, 0.217098f, 0.137113f, -0.079923f, -0.002627f, 0.999099f, 0.285733f, 0.561619f, -0.199164f, + -0.017134f, 0.578059f, -0.096513f, 0.286938f, 0.180099f, 0.645301f, 0.812592f, 0.009046f, 0.589835f, + 0.624253f, 0.793519f, -0.249415f, -0.203734f, 0.407439f, 1.011931f, 1.116821f, 0.542856f, 0.705290f, + 0.739027f, 0.623249f, 0.522368f, 0.353579f, 0.995898f, 1.767281f, -0.081556f, -0.076679f, 0.835456f, + 0.079021f, 0.785873f, -0.053940f, 0.252067f, 0.266335f, 0.770431f, 0.254018f, 0.352326f, 0.479874f, + 0.533761f, 1.430939f, 0.689173f, 0.691594f, 0.372724f, 0.704632f, 0.686755f, 0.588283f, 0.285072f, + 0.191627f, 0.745949f, 0.752707f, 0.417238f, 0.587475f, 0.127896f, 1.346089f, 0.149638f, -0.116884f, + 0.677536f, 0.034505f, -0.174461f, -0.128073f, 0.290052f, 0.446063f, 0.521620f, 0.122747f, 0.075711f, + 0.025843f, -0.149042f, 0.070983f, 0.597205f, 0.074123f, 0.968585f, -0.163160f, 0.082720f, 0.586964f, + 0.098779f, 0.137334f, 1.757106f, -0.276652f, 1.146234f, 0.434016f, 0.136440f, 0.269671f, 0.128756f, + -0.117812f, 0.533588f, 1.186146f, 0.443822f, 2.062000f, 0.421238f, 0.683523f}; + + // Test float16, without activation + int min_cuda_architecture = 530; + if (HasCudaEnvironment(min_cuda_architecture)) { + OpTester test("GroupNorm", 1, onnxruntime::kMSDomain); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("groups", 32); + test.AddAttribute("activation", 0); + + test.AddInput("X", dims, ToFloat16(input_data)); + test.AddInput("gamma", {C}, gamma_data); + + test.AddInput("beta", {C}, beta_data); + + constexpr float rel_error = 0.0f; + constexpr float abs_error = 0.02f; + test.AddOutput("Y", dims, ToFloat16(norm_data), false, rel_error, abs_error); + + std::vector> execution_providers; + 
execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + + // Test float32, with activation + if (HasCudaEnvironment(0)) { + OpTester test("GroupNorm", 1, onnxruntime::kMSDomain); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("groups", 32); + test.AddAttribute("activation", 1); + + test.AddInput("X", dims, input_data); + test.AddInput("gamma", {C}, gamma_data); + test.AddInput("beta", {C}, beta_data); + + constexpr float rel_error = 0.0f; + constexpr float abs_error = 0.01f; + test.AddOutput("Y", dims, swish_data, false, rel_error, abs_error); + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc new file mode 100644 index 0000000000000..6cffaa4d57bf4 --- /dev/null +++ b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc @@ -0,0 +1,223 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/common/cuda_op_test_utils.h" + +using namespace std; +namespace onnxruntime { +namespace test { + +namespace { + +struct NhwcConvOpAndTestAttributes { + string auto_pad; + vector dilations; + int64_t group; + vector kernel_shape; + vector pads; + vector strides; + std::unordered_set excluded_providers; +}; + +void TestNhwcConvOp(const NhwcConvOpAndTestAttributes& attributes, + const vector>& inputs, + const vector>& input_shapes, + const std::initializer_list& expected_output, + const vector& expected_output_shape, + bool use_float16, + bool weight_is_initializer = false) { + int min_cuda_architecture = use_float16 ? 
530 : 0; + bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + if (enable_cuda) { + OpTester test("NhwcConv", 1, onnxruntime::kMSDomain); + test.AddAttribute("group", attributes.group); + test.AddAttribute("kernel_shape", attributes.kernel_shape); + + if (!attributes.dilations.empty()) { + test.AddAttribute("dilations", attributes.dilations); + } + + // Only one of pads / auto_pad can be present + if (!attributes.pads.empty()) { + test.AddAttribute("pads", attributes.pads); + } else { + test.AddAttribute("auto_pad", attributes.auto_pad); + } + + if (!attributes.strides.empty()) { + test.AddAttribute("strides", attributes.strides); + } + + ORT_ENFORCE(inputs.size() <= 3, "Our name array is only setup to handle 3 inputs"); + const char* szNames[] = {"X", "W", "B"}; + + if (use_float16) { + test.AddInput(szNames[0], input_shapes[0], ToFloat16(inputs[0])); + test.AddInput(szNames[1], input_shapes[1], ToFloat16(inputs[1]), weight_is_initializer); + if (inputs.size() == 3) { + test.AddInput(szNames[2], input_shapes[2], ToFloat16(inputs[2])); + } + test.AddOutput("Y", expected_output_shape, ToFloat16(expected_output)); + } else { + test.AddInput(szNames[0], input_shapes[0], inputs[0]); + test.AddInput(szNames[1], input_shapes[1], inputs[1], weight_is_initializer); + if (inputs.size() == 3) { + test.AddInput(szNames[2], input_shapes[2], inputs[2]); + } + test.AddOutput("Y", expected_output_shape, expected_output); + } + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +void RunNhwcConv(const NhwcConvOpAndTestAttributes& attributes, + const vector>& inputs, + const vector>& input_shapes, + const std::initializer_list& expected_output, + const vector& expected_output_shape) { + bool use_float16 = true; + bool weight_is_initializer = true; + TestNhwcConvOp(attributes, inputs, input_shapes, expected_output, expected_output_shape, use_float16, weight_is_initializer); + + use_float16 = false; + weight_is_initializer = false; + TestNhwcConvOp(attributes, inputs, input_shapes, expected_output, expected_output_shape, use_float16, weight_is_initializer); +} + +} // namespace + +TEST(NhwcConvTest, Conv2D_2) { + NhwcConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + 0.45246148109436035f, 0.15498268604278564f, 0.11199361085891724f, -0.39421093463897705f, + 0.2626858949661255f, 0.13414543867111206f, -0.27184486389160156f, -0.43028733134269714f, + -0.26825493574142456f, 0.3893144130706787f, -0.13631996512413025f, -0.009590476751327515f, + -0.48771554231643677f, -0.25256502628326416f, -0.2812897562980652f, 0.4043201804161072f, + 0.07795023918151855f, 0.326981782913208f, 0.13114392757415771f, -0.4416425824165344f, + 0.12446999549865723f, 0.36739975214004517f, 0.1698915958404541f, 0.2008744478225708f, + 0.23339951038360596f, 0.38613730669021606f, 0.11117297410964966f, 0.3877097964286804f, + 0.20812749862670898f, -0.34297940135002136f, -0.029246658086776733f, -0.20483523607254028f, + -0.19244328141212463f, -0.11104947328567505f, -0.32830488681793213f, -0.01800677180290222f, + 0.3618946671485901f, -0.40949052572250366f, -0.18248388171195984f, -0.3349453806877136f, + -0.34091079235076904f, 0.006497859954833984f, 0.4537564516067505f, 0.08006560802459717f, + -0.14788749814033508f, 
0.034442365169525146f, -0.33322954177856445f, 0.06049239635467529f, + 0.42619407176971436f}; + vector X_shape = {1, 7, 7, 1}; + vector W = {-0.4406261742115021f}; + vector W_shape = {1, 1, 1, 1}; + vector Y_shape = {1, 7, 7, 1}; + auto expected_vals = { + -0.19936637580394745f, -0.06828942894935608f, -0.04934731498360634f, 0.17369966208934784f, + -0.11574628204107285f, -0.05910799279808998f, 0.1197819635272026f, 0.18959586322307587f, + 0.1182001456618309f, -0.17154212296009064f, 0.06006614491343498f, 0.0042258151806890965f, + 0.21490024030208588f, 0.11128675937652588f, 0.12394362688064575f, -0.17815405130386353f, + -0.034346915781497955f, -0.14407673478126526f, -0.05778544768691063f, 0.19459928572177887f, + -0.05484473705291748f, -0.16188594698905945f, -0.07485868036746979f, -0.08851054310798645f, + -0.10284193605184555f, -0.17014220356941223f, -0.04898572340607643f, -0.17083507776260376f, + -0.09170642495155334f, 0.1511256992816925f, 0.012886842712759972f, 0.09025576710700989f, + 0.08479554951190948f, 0.0489313043653965f, 0.14465972781181335f, 0.007934254594147205f, + -0.15946026146411896f, 0.1804322451353073f, 0.08040717244148254f, 0.1475857049226761f, + 0.15021422505378723f, -0.0028631272725760937f, -0.19993697106838226f, -0.03527900204062462f, + 0.06516310572624207f, -0.015176207758486271f, 0.14682966470718384f, -0.02665453404188156f, + -0.18779225647449493f}; + RunNhwcConv(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); +} + +TEST(NhwcConvTest, Conv2D_Bias_1) { + NhwcConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + vector X_shape = {1, 3, 3, 1}; + vector W = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; + vector W_shape = {2, 2, 2, 1}; + vector Y_shape = {1, 2, 2, 2}; + vector B = {1.0f, -1.0f}; + vector B_shape = {2}; + auto expected_vals = {13.0f, 11.0f, 17.0f, 15.0f, 25.0f, 23.0f, 29.0f, 27.0f}; + + RunNhwcConv(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); +} + +TEST(NhwcConvTest, Conv2D_AutoPad1) { + NhwcConvOpAndTestAttributes attrs = { + "SAME_UPPER", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{3, 3}, // kernel_shape + {}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = vector(25, 1.0f); + vector X_shape = {1, 5, 5, 1}; + vector W = {0.0f, 1.0f, 2.0f, + 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f}; + + vector W_shape = {1, 3, 3, 1}; + vector Y_shape = {1, 5, 5, 1}; + auto expected_vals = {24.0f, 33.0f, 33.0f, 33.0f, 20.0f, + 27.0f, 36.0f, 36.0f, 36.0f, 21.0f, + 27.0f, 36.0f, 36.0f, 36.0f, 21.0f, + 27.0f, 36.0f, 36.0f, 36.0f, 21.0f, + 12.0f, 15.0f, 15.0f, 15.0f, 8.0f}; + RunNhwcConv(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); +} + +TEST(NhwcConvTest, Conv2D_AutoPad2) { + NhwcConvOpAndTestAttributes attrs = { + "SAME_LOWER", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{3, 3}, // kernel_shape + {}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f, 0.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, + 1.0f, 0.0f, 1.0f, 0.0f, 1.0f}; + vector X_shape = {1, 5, 5, 1}; + vector W = {0.0f, 1.0f, 2.0f, + 3.0f, 4.0f, 5.0f, + 6.0f, 7.0f, 8.0f}; + + vector W_shape = {1, 3, 3, 1}; + vector Y_shape = {1, 5, 5, 1}; + auto expected_vals = {11.0f, 
22.0f, 11.0f, 22.0f, 11.0f, + 12.0f, 24.0f, 12.0f, 24.0f, 12.0f, + 12.0f, 24.0f, 12.0f, 24.0f, 12.0f, + 12.0f, 24.0f, 12.0f, 24.0f, 12.0f, + 5.0f, 10.0f, 5.0f, 10.0f, 5.0f}; + RunNhwcConv(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/python/transformers/test_attention_fusion.py b/onnxruntime/test/python/transformers/test_attention_fusion.py index 74d20295a0a63..657d52cc15a31 100644 --- a/onnxruntime/test/python/transformers/test_attention_fusion.py +++ b/onnxruntime/test/python/transformers/test_attention_fusion.py @@ -40,6 +40,7 @@ def test_multi_head_attention_fusion(self): onnx.save(model, model_path) options = FusionOptions("bert") options.use_multi_head_attention = True + options.use_raw_attention_mask(True) optimized_model = optimize_model(model_path, optimization_options=options) os.remove(model_path) self.verify_fusion(optimized_model, "attention_mha.onnx") @@ -49,7 +50,9 @@ def test_attention_fusion(self): dir = "." model_path = os.path.join(dir, "attention.onnx") onnx.save(model, model_path) - optimized_model = optimize_model(model_path) + options = FusionOptions("bert") + options.use_raw_attention_mask(True) + optimized_model = optimize_model(model_path, optimization_options=options) os.remove(model_path) self.verify_fusion(optimized_model, "attention_opt.onnx") @@ -64,7 +67,9 @@ def test_attention_fusion_pruned_model(self): dir = "." model_path = os.path.join(dir, "pruned_attention.onnx") onnx.save(model, model_path) - optimized_model = optimize_model(model_path) + options = FusionOptions("bert") + options.use_raw_attention_mask(True) + optimized_model = optimize_model(model_path, optimization_options=options) os.remove(model_path) self.verify_fusion(optimized_model, "pruned_attention_opt.onnx") @@ -80,7 +85,9 @@ def test_attention_fusion_reverse_add_order(self): dir = "." model_path = os.path.join(dir, "bert_attention_reverse_add_order.onnx") onnx.save(model, model_path) - optimized_model = optimize_model(model_path) + options = FusionOptions("bert") + options.use_raw_attention_mask(True) + optimized_model = optimize_model(model_path, optimization_options=options) os.remove(model_path) # reverse add input order will get same optimized model @@ -96,7 +103,9 @@ def test_attention_fusion_for_varied_qkv_dimensions(self): dir = "." model_path = os.path.join(dir, "attention_with_varied_qkv.onnx") onnx.save(model, model_path) - optimized_model = optimize_model(model_path) + options = FusionOptions("bert") + options.use_raw_attention_mask(True) + optimized_model = optimize_model(model_path, optimization_options=options) os.remove(model_path) self.verify_fusion(optimized_model, "attention_with_varied_qkv_opt.onnx") @@ -113,7 +122,9 @@ def test_attention_fusion_for_varied_qkv_dimensions_with_wrong_opt_parameters(se onnx.save(model, model_path) # wrong num_heads and hidden_size - optimized_model = optimize_model(model_path, "bert", num_heads=8, hidden_size=8) + options = FusionOptions("bert") + options.use_raw_attention_mask(True) + optimized_model = optimize_model(model_path, "bert", num_heads=8, hidden_size=8, optimization_options=options) os.remove(model_path) From c6c11039d76c95798238e358c898a562a200a239 Mon Sep 17 00:00:00 2001 From: pengwa Date: Fri, 3 Feb 2023 16:59:11 +0800 Subject: [PATCH 05/68] Fix sharing scalar bug (#14544) If an initializer is used as graph outputs, we should keep its name, instead of renaming it as constant sharing transformer did currently. 
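For illustration, the problematic pattern can be reproduced with a tiny model in which a scalar constant both feeds a node and is listed as a graph output. The sketch below is hypothetical (it is not the `scalar_const_not_share.py` test-data generator added in this patch); the name `y_scale` simply mirrors the new unit test, and the constant is modelled directly as an initializer rather than via Constant nodes:

```python
# Minimal repro sketch (assumption: the scalar is stored as an initializer;
# the real test data uses Constant nodes that ORT converts to initializers).
import numpy as np
import onnx
from onnx import TensorProto, helper, numpy_helper

# Scalar initializer that is consumed by Add *and* exposed as a graph output,
# so the constant-sharing transformer must not rename it.
y_scale = numpy_helper.from_array(np.array(1.0, dtype=np.float32), name="y_scale")

graph = helper.make_graph(
    [helper.make_node("Add", ["input", "y_scale"], ["output"])],
    "scalar_const_not_share",
    inputs=[helper.make_tensor_value_info("input", TensorProto.FLOAT, [2])],
    outputs=[
        helper.make_tensor_value_info("output", TensorProto.FLOAT, [2]),
        # the initializer itself is a graph output and must keep its name
        helper.make_tensor_value_info("y_scale", TensorProto.FLOAT, []),
    ],
    initializer=[y_scale],
)

onnx.save(helper.make_model(graph), "scalar_const_not_share.onnx")
```

Previously, optimizing such a model replaced `y_scale` with a renamed shared initializer and the original graph-output name was lost; the check added to `constant_sharing.cc` below skips any initializer that appears in the graph outputs.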
To fix https://github.com/microsoft/onnxruntime/issues/14488 --- .../core/optimizer/constant_sharing.cc | 12 ++- .../test/optimizer/graph_transform_test.cc | 74 ++++++++++++++++-- .../transform/scalar_const_not_share.onnx | Bin 0 -> 212 bytes .../transform/scalar_const_not_share.py | 19 +++++ 4 files changed, 98 insertions(+), 7 deletions(-) create mode 100644 onnxruntime/test/testdata/transform/scalar_const_not_share.onnx create mode 100644 onnxruntime/test/testdata/transform/scalar_const_not_share.py diff --git a/onnxruntime/core/optimizer/constant_sharing.cc b/onnxruntime/core/optimizer/constant_sharing.cc index 96c60bfd145d8..fa9a309098c76 100644 --- a/onnxruntime/core/optimizer/constant_sharing.cc +++ b/onnxruntime/core/optimizer/constant_sharing.cc @@ -129,7 +129,9 @@ struct GetOrAddValueInConstantStoreDispatcher { } // namespace Status ConstantSharing::ApplyImpl(Graph& graph, bool& modified, int /*graph_level*/, - const logging::Logger& /*logger*/) const { + const logging::Logger& logger) const { + int shared_count = 0; + // Accumulated map from type/value/rank to initializer: // > The key is a string representation of initializer's data type, value and rank. // > The value is newly created initializer NodeArg* to be shared. @@ -138,9 +140,11 @@ Status ConstantSharing::ApplyImpl(Graph& graph, bool& modified, int /*graph_leve InlinedVector original_initializer_names; original_initializer_names.reserve(initialized_tensor_set.size()); for (const auto& entry : initialized_tensor_set) { - // Ignore if the initializer already handled, or not a constant initializer. + // Ignore if the initializer exists in graph output, already handled, + // or not a constant initializer (implicitly excludes the graph input). if (IsSharedInitializer(entry.first) || !graph_utils::IsConstantInitializer(graph, entry.first) || + graph.IsOutput(graph.GetNodeArg(entry.first)) || excluded_initializers_.find(entry.first) != excluded_initializers_.end()) { continue; } @@ -191,6 +195,8 @@ Status ConstantSharing::ApplyImpl(Graph& graph, bool& modified, int /*graph_leve NodeArg& shared_scalar_initializer_node_arg = graph_utils::AddInitializer(graph, constant_tensor_proto_as_replacement); pattern_key_to_shared_arg_map[pattern_key] = &shared_scalar_initializer_node_arg; + } else { + shared_count += 1; } ReplaceInputsToUseSharedInitializer(graph, consumer_node_to_input_ports_map, origin_initializer_node_arg, @@ -199,6 +205,8 @@ Status ConstantSharing::ApplyImpl(Graph& graph, bool& modified, int /*graph_leve modified = true; } + LOGS(logger, INFO) << "Total shared scalar initializer count: " << shared_count; + return Status::OK(); } diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 3b4b793ffc00a..fde8392d943cd 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -4339,7 +4339,7 @@ TEST_F(GraphTransformationTests, BitmaskDropoutFusionTest) { /* This test build a graph like: - input0 input1 + input0 input1 \ / Add -----------------| @@ -4359,7 +4359,7 @@ This test build a graph like: After fusion, the graph become: input0 input1 - \ / + \ / Add (Constant Initializer) \ / Reshape @@ -4436,15 +4436,16 @@ TEST_F(GraphTransformationTests, ReshapeFusionOpsetTest) { builder.AddNode("Unsqueeze", {gather_out_1}, {unsqueeze_out_1}).AddAttribute("axes", std::vector{0}); } builder.AddNode("ConcatTraining", {unsqueeze_out_0, unsqueeze_out_1, single_value_1d_int_16, 
single_value_1d_int_64}, - {concattraining1_out, concattraining1_length}, "com.microsoft").AddAttribute("axis", static_cast(0)); + {concattraining1_out, concattraining1_length}, "com.microsoft") + .AddAttribute("axis", static_cast(0)); builder.AddNode("Reshape", {add_out, concattraining1_out}, {out}); }; std::unique_ptr transformer = std::make_unique(); if (opset_version == 15 && shape_test_for_opset15) { ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset_version, *logger_, std::move(transformer), TransformerLevel::Level1, 1, - pre_graph_checker, pre_graph_checker)); - } else{ + pre_graph_checker, pre_graph_checker)); + } else { ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset_version, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } @@ -6243,6 +6244,69 @@ TEST_F(GraphTransformationTests, ConstantSharing_ShareIntMaxOrFloatInfinityIniti } } +/* +Test graph as below. + graph input [2] (float) Constant (1.0float) Constant (1.0uint8) + \_______________ ________________/ | | + \/ | | + Add | | + | | | + graph output [2](float) graph output [](float) graph output [](int8) + +Be noted: expected result graph should maintain original graph outputs, + both float and unin8 constant values are not shared. +*/ +TEST_F(GraphTransformationTests, ConstantSharing_ShouldNotShareForGraphOutput) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "scalar_const_not_share.onnx"; + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_)); + Graph& graph = model->MainGraph(); + { + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Add"] == 1); + // Be noted, constant nodes are converted to initialized already. + ASSERT_TRUE(graph.GetAllInitializedTensors().size() == 2U); + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(transformer), TransformerLevel::Level1)); + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + { + const InitializedTensorSet& initialized_tensor_set = graph.GetAllInitializedTensors(); + ASSERT_TRUE(initialized_tensor_set.size() == 2U); + const NodeArg* add_initializer = nullptr; + for (auto& node : graph.Nodes()) { + if (node.OpType().compare("Add") == 0) { + add_initializer = node.InputDefs()[1]; + ASSERT_TRUE(add_initializer->Shape()->dim_size() == 0); + ASSERT_TRUE(add_initializer->Name().compare("y_scale") == 0); + } + } + ASSERT_TRUE(add_initializer != nullptr); + for (const auto& entry : initialized_tensor_set) { + if (entry.first.compare("y_scale") == 0) { + const ONNX_NAMESPACE::TensorProto* tensor_proto = entry.second; + onnxruntime::Initializer int64_const{*tensor_proto, graph.ModelPath()}; + ASSERT_TRUE(int64_const.size() == 1); + float float_const_value = *(int64_const.data()); + ASSERT_TRUE(float_const_value == 1); + } else { + const ONNX_NAMESPACE::TensorProto* tensor_proto = entry.second; + onnxruntime::Initializer uint8_const{*tensor_proto, graph.ModelPath()}; + ASSERT_TRUE(uint8_const.size() == 1); + uint8_t uint8_const_value = *(uint8_const.data()); + ASSERT_TRUE(uint8_const_value == static_cast(1)); + } + } + + auto op_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_count.size() == 1U); + ASSERT_TRUE(op_count["Add"] == 1); + } +} + TEST_F(GraphTransformationTests, GatherToSplitFusion) { auto build_test_case = [&](ModelTestBuilder& builder) { 
 auto* data_arg = builder.MakeInput({{54}});
diff --git a/onnxruntime/test/testdata/transform/scalar_const_not_share.onnx b/onnxruntime/test/testdata/transform/scalar_const_not_share.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..13e572950f86ef753f1e3cc807040e2cf2f6395f
GIT binary patch
literal 212
zcmd

Date: Fri, 3 Feb 2023 20:11:50 +0800
Subject: [PATCH 06/68] link mpi when either use_mpi or use_nccl enabled
 (#14467)

### Only link MPI when either use_mpi or use_nccl is enabled

To fix the issue https://github.com/microsoft/onnxruntime/issues/14278.

Talked with @askhade: we think that if users want to enable NCCL/MPI but MPI is
not found, it should be a failure instead of a warning, so this PR makes that
change. As a result, to make the CIs pass, NCCL/MPI would then have to be
disabled explicitly in the build command. This PR therefore takes an alternative
approach: since NCCL and MPI are not used by customers, NCCL is disabled by
default when "--disable_nccl" is not specified, and MPI is disabled by default
when "--use_mpi" is not specified.

### Motivation and Context

---
 cmake/CMakeLists.txt    | 31 ++++++++++++++++++-------------
 tools/ci_build/build.py |  6 ++++--
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 5c088aa8cddc4..1ff5760422177 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -1347,19 +1347,22 @@ if (onnxruntime_ENABLE_TRAINING)
   find_package(MPI)
 
-  if (MPI_CXX_FOUND)
-    message( STATUS "MPI Version: ${MPI_CXX_VERSION}")
-    message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" )
-    mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES)
-    list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
-  else ()
-    set(onnxruntime_USE_NCCL OFF)
-    set(onnxruntime_USE_MPI OFF)
-    message( WARNING "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." )
+  if (onnxruntime_USE_MPI OR onnxruntime_USE_NCCL)
+    if (MPI_CXX_FOUND)
+      message( STATUS "MPI Version: ${MPI_CXX_VERSION}")
+      message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" )
+      mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES)
+      list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS})
+    else ()
+      message(
+        FATAL_ERROR
+        "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled."
+      )
+    endif()
   endif()
 
   # Find NCCL and MPI
-  if (onnxruntime_USE_NCCL AND MPI_CXX_FOUND)
+  if (onnxruntime_USE_NCCL)
     if (onnxruntime_USE_CUDA)
       set(NCCL_LIBNAME "nccl")
     elseif (onnxruntime_USE_ROCM)
@@ -1417,13 +1420,15 @@ if (onnxruntime_ENABLE_TRAINING)
       add_definitions(-DORT_USE_NCCL=1)
       message( STATUS "NCCL is enabled in Linux GPU Build." )
     else ()
-      set(onnxruntime_USE_NCCL OFF)
-      message( WARNING "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." )
+      message(
+        FATAL_ERROR
+        "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled."
+ ) endif() endif() endif() - if (onnxruntime_USE_MPI AND MPI_CXX_FOUND) + if (onnxruntime_USE_MPI) add_definitions(-DUSE_MPI=1) endif() diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index d552fb71b6547..f421800523667 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -192,10 +192,12 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.") parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.") - parser.add_argument("--disable_nccl", action="store_true", help="Disable Nccl.") + parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.") parser.add_argument("--mpi_home", help="Path to MPI installation dir") parser.add_argument("--nccl_home", help="Path to NCCL installation dir") - parser.add_argument("--use_mpi", nargs="?", default=True, const=True, type=_str_to_bool) + parser.add_argument( + "--use_mpi", nargs="?", default=False, const=True, type=_str_to_bool, help="Disabled by default." + ) # enable ONNX tests parser.add_argument( From a5eb616819584d8f15ca21124e3892e9f209abc4 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Fri, 3 Feb 2023 09:01:30 -0800 Subject: [PATCH 07/68] Enable ability to control whether or not to quantize the bias (#14549) --- onnxruntime/python/tools/quantization/qdq_quantizer.py | 10 +++++++++- onnxruntime/python/tools/quantization/quantize.py | 5 +++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index ec1b9db3dfaaf..a970e72aa1f86 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -97,6 +97,13 @@ def __init__( False if "AddQDQPairToWeight" not in extra_options else extra_options["AddQDQPairToWeight"] ) + # Some scenarios do not need the bias quantized. For example, in the case of Quantization Aware Training, + # quantizing the bias is not needed. This is because in QAT, all model parameters are expected to be in + # floating point format. To that end, we can use the FakeQuant operator for weights and activations that + # can always have QDQ pairs (by using AddQDQPairToWeight). But for biases in a quantized model, we can't use + # FakeQuant because it only ever appears before a DQ (since it is quantized as int32). + self.quantize_bias = True if "QuantizeBias" not in extra_options else extra_options["QuantizeBias"] + # The default behavior is that multiple nodes can share a QDQ pair as their inputs. # In TRT, QDQ pair can’t be shared between nodes, so it will create dedicated QDQ pairs for each node. self.dedicated_qdq_pair = ( @@ -211,7 +218,8 @@ def quantize_model(self): self._quantize_normal_tensors() self._quantize_sharing_param_tensors() - self._quantize_bias_tensors() + if self.quantize_bias: + self._quantize_bias_tensors() self.remove_nodes() if not self.add_qdq_pair_to_weight: self.model.clean_initializers() diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 80ae592d49071..1d4aae268f79e 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -139,6 +139,11 @@ def __init__( Default is 0.01. 
Constant smoothing factor to use when computing the moving average of the minimum and maximum values. Effective only when the calibration method selected is MinMax and when CalibMovingAverage is set to True. + QuantizeBias = True/False : + Default is True which quantizes floating-point biases and it solely inserts + a DeQuantizeLinear node. If False, it remains floating-point bias and does not insert + any quantization nodes associated with biases. + This extra option is only effective when quant_format is QuantFormat.QDQ. execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc. Raises: ValueError: Raise ValueError if execution provider is unknown From 638f21b969143f65f4c47ce6b0f41f435e1d3ec3 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Fri, 3 Feb 2023 09:43:29 -0800 Subject: [PATCH 08/68] Upgrade doxygen to fix C API docs build issue (#13950) --- .github/workflows/publish-c-apidocs.yml | 16 +++--- docs/c_cxx/Doxyfile | 49 ------------------- docs/c_cxx/doxygen-header.html | 2 +- .../core/session/onnxruntime_c_api.h | 5 +- 4 files changed, 14 insertions(+), 58 deletions(-) diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 51378cfa5b87f..487336428ad3b 100644 --- a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -3,6 +3,10 @@ on: push: branches: - main + paths: + - include/onnxruntime/core/session + + workflow_dispatch: jobs: publish: @@ -13,10 +17,10 @@ jobs: - name: Install doxygen and dependencies run: | sudo apt update - sudo apt-get install libclang-9-dev - sudo apt-get install libclang-cpp9 - wget https://www.doxygen.nl/files/doxygen-1.9.2.linux.bin.tar.gz - tar xvzf doxygen-1.9.2.linux.bin.tar.gz + sudo apt-get install libclang-dev + sudo apt-get install libclang-cpp14 + wget https://www.doxygen.nl/files/doxygen-1.9.6.linux.bin.tar.gz + tar xvzf doxygen-1.9.6.linux.bin.tar.gz - name: Set commit ID id: vars run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" @@ -24,7 +28,7 @@ jobs: run: | mkdir -p build/doxygen cd docs/c_cxx - ../../doxygen-1.9.2/bin/doxygen + ../../doxygen-1.9.6/bin/doxygen - uses: actions/checkout@v2 with: ref: gh-pages @@ -36,7 +40,7 @@ jobs: - name: Create Pull Request uses: peter-evans/create-pull-request@v3 with: - branch: gh-pages-pr + branch: gh-pages-pr-c-docs base: gh-pages title: '[Automated]: Update C/C++ API docs' commit-message: 'Update C/C++ API docs to commit ${{ steps.vars.outputs.sha_short }}' diff --git a/docs/c_cxx/Doxyfile b/docs/c_cxx/Doxyfile index 888e0a36b34b2..aa59e5be1a8e1 100644 --- a/docs/c_cxx/Doxyfile +++ b/docs/c_cxx/Doxyfile @@ -1654,17 +1654,6 @@ HTML_FORMULA_FORMAT = png FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. 
@@ -2352,15 +2341,6 @@ EXTERNAL_PAGES = YES # Configuration options related to the dot tool #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = NO - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. @@ -2393,23 +2373,6 @@ HAVE_DOT = NO DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTSIZE = 10 - # By default doxygen will tell dot to use the default font as specified with # DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set # the path where dot can find it using this tag. @@ -2644,18 +2607,6 @@ DOT_GRAPH_MAX_NODES = 50 MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) support diff --git a/docs/c_cxx/doxygen-header.html b/docs/c_cxx/doxygen-header.html index cd2171adadc75..364f76f7f0580 100644 --- a/docs/c_cxx/doxygen-header.html +++ b/docs/c_cxx/doxygen-header.html @@ -1,4 +1,4 @@ - + diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 37aa6a7cd87b0..bd9eb64ae840e 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3703,7 +3703,6 @@ struct OrtApi { ORT_API2_STATUS(RegisterCustomOpsUsingFunction, _Inout_ OrtSessionOptions* options, _In_ const char* registration_func_name); - /// @} /// \name OrtKernelInfo /// Custom operator APIs. /// @{ @@ -3795,6 +3794,7 @@ struct OrtApi { * of an input during kernel/session creation. * * \param[in] info An instance of ::OrtKernelInfo. 
+ * \param[in] index Which input to get the type information for * \param[out] type_info Pointer set to the resulting ::OrtTypeInfo. Must be freed with OrtApi::ReleaseTypeInfo. * * \snippet{doc} snippets.dox OrtStatus Return Value @@ -3809,6 +3809,7 @@ struct OrtApi { * of an output during kernel/session creation. * * \param[in] info An instance of ::OrtKernelInfo. + * \param[in] index Which input to get the type information for * \param[out] type_info Pointer set to the resulting ::OrtTypeInfo. Must be freed with OrtApi::ReleaseTypeInfo. * * \snippet{doc} snippets.dox OrtStatus Return Value @@ -3893,7 +3894,7 @@ struct OrtApi { * If oneDNN is not available, this function will return failure. * * \param[in] options - * \param[in] cann_options + * \param[in] dnnl_options * * \snippet{doc} snippets.dox OrtStatus Return Value * From 999e5bf45ee9ccdfc6b30b93468e8c9d7ea0bc7f Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Fri, 3 Feb 2023 11:38:18 -0800 Subject: [PATCH 09/68] Add SLN support for t5 model with beam search (#14429) ### Description ### Motivation and Context --------- Co-authored-by: Ubuntu --- .../python/tools/symbolic_shape_infer.py | 3 + .../tools/transformers/convert_generation.py | 2 +- .../transformers/fusion_skiplayernorm.py | 22 ++++- .../transformers/models/t5/convert_to_onnx.py | 1 + .../tools/transformers/models/t5/t5_helper.py | 16 +++- .../tools/transformers/onnx_model_t5.py | 92 +++++++++++++++++++ .../python/tools/transformers/optimizer.py | 4 +- 7 files changed, 130 insertions(+), 10 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/onnx_model_t5.py diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index 689235b630d94..dbc939bce203d 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -198,6 +198,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "LayerNormalization": self._infer_LayerNormalization, "LongformerAttention": self._infer_LongformerAttention, "PythonOp": self._infer_PythonOp, + "SimplifiedLayerNormalization": self._infer_LayerNormalization, "SkipLayerNormalization": self._infer_SkipLayerNormalization, "SkipSimplifiedLayerNormalization": self._infer_SkipLayerNormalization, "GroupNorm": self._infer_GroupNorm, @@ -433,7 +434,9 @@ def _onnx_infer_single_node(self, node): "GemmFastGelu", "LayerNormalization", "LongformerAttention", + "SimplifiedLayerNormalization", "SkipLayerNormalization", + "SkipSimplifiedLayerNormalization", "PythonOp", "MultiHeadAttention", "GroupNorm", diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 122a574064b47..a106d906d052d 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -483,7 +483,7 @@ def t5_to_onnx(args: argparse.Namespace): Path(args.output).parent, use_gpu=args.use_gpu, use_external_data_format=args.use_external_data_format, - optimize_onnx=False, + optimize_onnx=(args.precision != Precision.FLOAT16), precision=args.precision, verbose=False, use_decoder_start_token=False, diff --git a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py index 5a32415aba3e3..7c54649553168 100644 --- a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py +++ 
b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py @@ -19,8 +19,13 @@ class FusionSkipLayerNormalization(Fusion): Note: This fusion does not check the input shape of Add and LayerNormalization. """ - def __init__(self, model: OnnxModel): - super().__init__(model, "SkipLayerNormalization", "LayerNormalization") + def __init__( + self, + model: OnnxModel, + fused_op_type: str = "SkipLayerNormalization", + search_op_types: str = "LayerNormalization", + ): + super().__init__(model, fused_op_type, search_op_types) # Update shape inference is needed since other fusions might add new edge which does not have shape info yet. self.shape_infer_helper = self.model.infer_runtime_shape({"batch_size": 4, "seq_len": 7}, update=True) @@ -44,6 +49,9 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): if len(self.model.get_parents(add)) != 2: return + # Root Mean Square Layer Normalization + simplified = node.op_type == "SimplifiedLayerNormalization" + if self.shape_infer_helper is not None: if not self.shape_infer_helper.compare_shape(add.input[0], add.input[1]): logger.debug( @@ -89,12 +97,16 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): ): self.nodes_to_remove.extend([add, node]) - inputs = [add.input[0], add.input[1], node.input[1], node.input[2]] + inputs = ( + [add.input[0], add.input[1], node.input[1], node.input[2]] + if not simplified + else [add.input[0], add.input[1], node.input[1]] + ) normalize_node = helper.make_node( - "SkipLayerNormalization", + self.fused_op_type, inputs=inputs, outputs=outputs, - name=self.model.create_node_name("SkipLayerNormalization", name_prefix="SkipLayerNorm"), + name=self.model.create_node_name(self.fused_op_type, name_prefix="SkipLayerNorm"), ) normalize_node.domain = "com.microsoft" diff --git a/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py index ae6995dd770af..eff24f58a0d97 100644 --- a/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/t5/convert_to_onnx.py @@ -203,6 +203,7 @@ def export_onnx_models( config.hidden_size, use_external_data_format, auto_mixed_precision=not disable_auto_mixed_precision, + use_gpu=use_gpu, ) else: logger.info(f"Skip optimizing: existed ONNX model {onnx_path}") diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index 4d853a6544ef0..c91c0da178e13 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -228,15 +228,25 @@ def optimize_onnx( hidden_size: int, use_external_data_format: bool = False, auto_mixed_precision: bool = True, + use_gpu: bool = False, ): """Optimize ONNX model with an option to convert it to use mixed precision.""" + + from fusion_options import FusionOptions + + optimization_options = None + if not use_gpu: + # Currently there is no SkipSimplifiedLayerNorm cpu kernel + optimization_options = FusionOptions("t5") + optimization_options.enable_skip_layer_norm = False + m = optimize_model( onnx_model_path, - model_type="bert", # TODO: support optimization for t5 + model_type="t5", num_heads=num_attention_heads, hidden_size=hidden_size, - opt_level=0, - optimization_options=None, + opt_level=2 if not is_float16 and not use_external_data_format else 0, + optimization_options=optimization_options, use_gpu=False, ) if is_float16: diff --git 
a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py new file mode 100644 index 0000000000000..528467b9f256a --- /dev/null +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -0,0 +1,92 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import logging +from typing import Union + +from fusion_attention import AttentionMask, FusionAttention +from fusion_base import Fusion +from fusion_skiplayernorm import FusionSkipLayerNormalization +from onnx import NodeProto +from onnx_model import OnnxModel +from onnx_model_bert import BertOnnxModel + +logger = logging.getLogger(__name__) + +# TODO: Support decoder self/cross attention fusion and encoder self attention fusion +class FusionT5Attention(FusionAttention): + """ + Fuse T5 Attention subgraph into one Attention node. + """ + + def __init__( + self, + model: OnnxModel, + hidden_size: int, + num_heads: int, + attention_mask: AttentionMask, + ): + super().__init__(model, hidden_size, num_heads, attention_mask) + + def create_attention_node( + self, + mask_index: str, + matmul: NodeProto, + add: NodeProto, + num_heads: int, + hidden_size: int, + input: str, + output: str, + add_qk_str: str, + ) -> Union[NodeProto, None]: + # Not implemented yet + return None + + def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + # Not implemented yet + return + + +# It's much easier to export it with the custom op. TODO: revisit later +class FusionRelativePositionBiasBlock(Fusion): + def __init__(self, model: OnnxModel, max_distance: int, is_bidirectional: bool): + super().__init__(model, "RelativePositionBias", "Add") + self.max_distance = max_distance + self.is_bidirectional = is_bidirectional + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + # Not implemented yet + return + + +class FusionSkipSimplifiedLayerNormalization(FusionSkipLayerNormalization): + def __init__(self, model: OnnxModel): + super().__init__(model, "SkipSimplifiedLayerNormalization", "SimplifiedLayerNormalization") + self.shape_infer_helper = self.model.infer_runtime_shape( + {"batch_size": 2, "seq_len": 1, "encode_sequence_length": 8, "past_decode_sequence_length": 4}, update=True + ) + + def fuse(self, node, input_name_to_nodes, output_name_to_node): + super().fuse(node, input_name_to_nodes, output_name_to_node) + + +class T5OnnxModel(BertOnnxModel): + def __init__(self, model, num_heads, hidden_size): + super().__init__(model, num_heads, hidden_size) + self.attention_mask = AttentionMask(self) + self.attention_fusion = FusionT5Attention(self, self.hidden_size, self.num_heads, self.attention_mask) + self.skip_layer_norm_fusion = FusionSkipSimplifiedLayerNormalization(self) + # TODO: hardcode for now. 
double check later + self.rpb_fusion = FusionRelativePositionBiasBlock(self, 32, True) + + def fuse_attention(self): + self.attention_fusion.apply() + + def fuse_skip_layer_norm(self): + self.skip_layer_norm_fusion.apply() + + def postprocess(self): + self.rpb_fusion.apply() + self.clean_graph() + self.prune_graph() diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 56076eedda78a..a18535c10591d 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -30,6 +30,7 @@ from onnx_model_bert_keras import BertOnnxModelKeras from onnx_model_bert_tf import BertOnnxModelTF from onnx_model_gpt2 import Gpt2OnnxModel +from onnx_model_t5 import T5OnnxModel from onnx_model_tnlr import TnlrOnnxModel from onnx_model_unet import UnetOnnxModel @@ -49,6 +50,7 @@ ), # might add a class for GPT2OnnxModel for TF later. "tnlr": (TnlrOnnxModel, "pytorch", 1), "unet": (UnetOnnxModel, "pytorch", 1), + "t5": (T5OnnxModel, "pytorch", 2), } @@ -248,7 +250,7 @@ def optimize_model( else [ "MatMulScaleFusion", "MatMulAddFusion", - "SimplifiedLayerNormFusion", + "MatmulTransposeFusion", "GemmActivationFusion", "BiasSoftmaxFusion", ] From c1a0fc55e7a5d685917a966f4059970869962986 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Fri, 3 Feb 2023 22:35:45 -0500 Subject: [PATCH 10/68] [ROCm][MIGraphX EP]Add back in support for gfx1030 (#14565) Adds back in proper build support for the Navi gen cards (gfx1030) Co-authored-by: Ted Themistokleous --- cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 1ff5760422177..57abcb04ba0e8 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -245,7 +245,7 @@ if (onnxruntime_USE_ROCM) endif() if (NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a") + set(CMAKE_HIP_ARCHITECTURES "gfx906;gfx908;gfx90a;gfx1030") endif() file(GLOB rocm_cmake_components ${onnxruntime_ROCM_HOME}/lib/cmake/*) From 3d7518762ace6929be98e1203174c2dbf1ac094e Mon Sep 17 00:00:00 2001 From: Vincent Wang Date: Sat, 4 Feb 2023 15:20:18 +0800 Subject: [PATCH 11/68] [ORTModule] ATen Support for upsample_bilinear (#14519) It's required by model MobileViT. 
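For context, a minimal sketch of the pattern this change enables under ORTModule. It mirrors the new `test_aten_upsample_bilinear` unit test added in this patch; the module name and tensor shapes are illustrative only, and the import assumes a training-enabled onnxruntime package.

```python
import copy

import torch
from onnxruntime.training.ortmodule import ORTModule


class UpsampleBilinearNet(torch.nn.Module):
    def forward(self, x):
        # Exported through the ATen fallback as
        # org.pytorch.aten::ATen(operator="upsample_bilinear2d", overload_name="vec").
        return torch.nn.functional.interpolate(x, size=(8, 12), mode="bilinear")


pt_model = UpsampleBilinearNet().to("cuda")
ort_model = ORTModule(copy.deepcopy(pt_model))

x = torch.randn(2, 4, 6, 8, device="cuda", requires_grad=True)
out = ort_model(x)    # forward runs upsample_bilinear2d via the ATen op
out.sum().backward()  # backward uses the newly registered upsample_bilinear2d_backward gradient
```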
--- .../python/tools/symbolic_shape_infer.py | 11 ++++---- .../ortmodule/_custom_gradient_registry.py | 15 +++++++--- .../ortmodule/_custom_op_symbolic_registry.py | 13 +++++++++ .../python/orttraining_test_ortmodule_api.py | 28 +++++++++++++++++++ 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index dbc939bce203d..ae320279d724a 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -218,9 +218,10 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "_adaptive_avg_pool2d": self._infer_aten_pool2d, "numpy_T": self._infer_Transpose, "native_group_norm": self._infer_aten_group_norm, - "upsample_nearest1d": self._infer_aten_upsample_nearest, - "upsample_nearest2d": self._infer_aten_upsample_nearest, - "upsample_nearest3d": self._infer_aten_upsample_nearest, + "upsample_nearest1d": self._infer_aten_upsample, + "upsample_nearest2d": self._infer_aten_upsample, + "upsample_nearest3d": self._infer_aten_upsample, + "upsample_bilinear2d": self._infer_aten_upsample, } self.run_ = True self.suggested_merge_ = {} @@ -1389,14 +1390,14 @@ def _infer_aten_group_norm(self, node): ) ) - def _infer_aten_upsample_nearest(self, node): + def _infer_aten_upsample(self, node): new_shape = None input_shape = self._get_shape(node, 0) if input_shape is not None: new_shape = input_shape[:2] output_size = self._try_get_value(node, 1) if output_size is not None: - new_shape += [dim_size.item() for dim_size in output_size] + new_shape += [dim_size.item() if type(dim_size) == np.int64 else dim_size for dim_size in output_size] else: rank = len(input_shape) new_shape += [str(self._new_symbolic_dim_from_output(node, 0, i)) for i in range(2, rank)] diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index e1d3d5fcf591b..89a766bd36c29 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -239,8 +239,10 @@ def native_group_norm_gradient(): # PyTorch removed related backward functions with "vec" overload name since 1.13. The functions with no overload name # are available for all versions, though they are not that convienent to use. 
-def _upsample_nearest_gradient(backward_fn, dims): +def _upsample_gradient(backward_fn, dims): scales = ["" for _ in range(dims)] + if "bilinear" in backward_fn: + scales = ["I(2)"] + scales return [ ("Shape", ["I(0)"], ["Shape_X"]), ("Shape", ["O(0)"], ["Shape_Y"]), @@ -258,14 +260,19 @@ def _upsample_nearest_gradient(backward_fn, dims): @register_gradient("org.pytorch.aten", "ATen", "upsample_nearest1d", "vec") def upsample_nearest1d_gradient(): - return _upsample_nearest_gradient("upsample_nearest1d_backward", 1) + return _upsample_gradient("upsample_nearest1d_backward", 1) @register_gradient("org.pytorch.aten", "ATen", "upsample_nearest2d", "vec") def upsample_nearest2d_gradient(): - return _upsample_nearest_gradient("upsample_nearest2d_backward", 2) + return _upsample_gradient("upsample_nearest2d_backward", 2) @register_gradient("org.pytorch.aten", "ATen", "upsample_nearest3d", "vec") def upsample_nearest3d_gradient(): - return _upsample_nearest_gradient("upsample_nearest3d_backward", 3) + return _upsample_gradient("upsample_nearest3d_backward", 3) + + +@register_gradient("org.pytorch.aten", "ATen", "upsample_bilinear2d", "vec") +def upsample_bilinear2d_gradient(): + return _upsample_gradient("upsample_bilinear2d_backward", 2) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 17076d862ab3e..7cd889a1565ad 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -799,3 +799,16 @@ def upsample_nearest2d(g, input, output_size, scale_factors): @register_symbolic("upsample_nearest3d") def upsample_nearest3d(g, input, output_size, scale_factors): return _upsample_nearest(g, input, output_size, scale_factors, "upsample_nearest3d") + + +@register_symbolic("upsample_bilinear2d") +def upsample_bilinear2d(g, input, output_size, align_corners, scale_factors): + return g.op( + "org.pytorch.aten::ATen", + input, + output_size, + align_corners, + scale_factors, + operator_s="upsample_bilinear2d", + overload_name_s="vec", + ) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 7758603c484fc..3cbdbd9139c88 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -1782,6 +1782,34 @@ def run_step(model, input): _test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad) +def test_aten_upsample_bilinear(): + class _NeuralNetUpsampleBilinear(torch.nn.Module): + def __init__(self): + super(_NeuralNetUpsampleBilinear, self).__init__() + + def forward(self, input): + return torch.nn.functional.interpolate(input, size=(8, 12), mode="bilinear") + + device = "cuda" + pt_model = _NeuralNetUpsampleBilinear().to(device) + ort_model = ORTModule(copy.deepcopy(pt_model)) + + def run_step(model, input): + prediction = model(input) + prediction.sum().backward() + return prediction + + # reset manual seed to reset the generator + torch.manual_seed(2333) + pt_input = torch.randn([2, 4, 6, 8], dtype=torch.float, device=device, requires_grad=True) + ort_input = copy.deepcopy(pt_input) + pt_prediction = run_step(pt_model, pt_input) + ort_prediction = run_step(ort_model, ort_input) + + _test_helpers.assert_values_are_close(ort_prediction, 
pt_prediction) + _test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad) + + def test_gradient_correctness_cast_chain(): class NeuralNetCast(torch.nn.Module): def __init__(self, D): From 4bb95d7690bac6b25622dbef5b711c15ffb00eee Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Mon, 6 Feb 2023 13:40:26 +0800 Subject: [PATCH 12/68] Change the return type of softmax function to Status (#14559) ### Description Change the return type of Softmax function(`dispatch_warpwise_softmax_forward `and `dispatch_blockwise_softmax_forward`) from `void ` to `Status`. ### Motivation and Context Softmax function will call TunableOp which return Status. It's necessary to pass the `Status` from inner function to outer function. --- .../contrib_ops/cuda/bert/attention_softmax.h | 12 ++++---- .../transformers/generation_device_helper.cc | 4 +-- .../cuda/transformers/sampling_cuda_helper.h | 30 +++++++++---------- .../contrib_ops/rocm/bert/attention_softmax.h | 12 ++++---- .../core/providers/cuda/math/softmax.cc | 11 +++---- .../core/providers/cuda/math/softmax.h | 8 ++--- .../core/providers/cuda/math/softmax_impl.cu | 28 +++++++++-------- .../core/providers/rocm/math/softmax.cc | 11 +++---- .../core/providers/rocm/math/softmax_impl.cu | 24 ++++++++------- 9 files changed, 69 insertions(+), 71 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h index 953a45e15b32e..16b3cf053b586 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h @@ -714,12 +714,12 @@ Status ComputeSoftmaxWithRawMask(cudaStream_t stream, } if (use_persistent_softmax) { - dispatch_warpwise_softmax_forward(stream, - output, - persistent_softmax_workspace, - all_sequence_length, - all_sequence_length, - batch_size * num_heads * sequence_length); + return dispatch_warpwise_softmax_forward(stream, + output, + persistent_softmax_workspace, + all_sequence_length, + all_sequence_length, + batch_size * num_heads * sequence_length); } return CUDA_CALL(cudaGetLastError()); diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 1a5a9ac5d97b2..f6be2179dfdbf 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -337,11 +337,11 @@ Status ProcessLogits(const OrtValue& logits, // const CudaT* X_data = is_reuse_logits_buffer ? logits_data : reinterpret_cast(next_token_logits.data()); - dispatch_blockwise_softmax_forward( + ORT_RETURN_IF_ERROR((dispatch_blockwise_softmax_forward( cuda_stream, Y_data, X_data, vocab_size, is_reuse_logits_buffer ? 
padded_vocab_size : vocab_size, vocab_size, - batch_size * num_beams); + batch_size * num_beams))); #ifdef DEBUG_GENERATION dumper->Print("next_token_scores after softmax", next_token_scores.data(), batch_size, num_beams, vocab_size); diff --git a/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h b/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h index d82648890f94f..753aea9d38089 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h @@ -88,14 +88,14 @@ Status Sample(AllocatorPtr& allocator, #endif gsl::span& d_sorted_softmaxed_score = sampling_state->d_sorted_softmaxed_score; - dispatch_blockwise_softmax_forward(cuda_stream, - d_sorted_softmaxed_score.data(), - reinterpret_cast(d_sorted_score.data()), - parameters->vocab_size, - parameters->vocab_size, - parameters->vocab_size, - parameters->batch_size); - + ORT_RETURN_IF_ERROR((dispatch_blockwise_softmax_forward(cuda_stream, + d_sorted_softmaxed_score.data(), + reinterpret_cast(d_sorted_score.data()), + parameters->vocab_size, + parameters->vocab_size, + parameters->vocab_size, + parameters->batch_size))); + #ifdef DEBUG_GENERATION dumper->Print("d_sorted_softmaxed_score_buffer", d_sorted_softmaxed_score.data(), @@ -122,13 +122,13 @@ Status Sample(AllocatorPtr& allocator, #endif gsl::span& d_softmaxed_score = sampling_state->d_softmaxed_score; - dispatch_blockwise_softmax_forward(cuda_stream, - d_softmaxed_score.data(), - reinterpret_cast(next_token_scores.data()), - parameters->vocab_size, - parameters->vocab_size, - parameters->vocab_size, - parameters->batch_size); + ORT_RETURN_IF_ERROR((dispatch_blockwise_softmax_forward(cuda_stream, + d_softmaxed_score.data(), + reinterpret_cast(next_token_scores.data()), + parameters->vocab_size, + parameters->vocab_size, + parameters->vocab_size, + parameters->batch_size))); #ifdef DEBUG_GENERATION dumper->Print("d_softmaxed_score_buffer", diff --git a/onnxruntime/contrib_ops/rocm/bert/attention_softmax.h b/onnxruntime/contrib_ops/rocm/bert/attention_softmax.h index 27ecdf253ecdb..7c99fc05ec9ee 100644 --- a/onnxruntime/contrib_ops/rocm/bert/attention_softmax.h +++ b/onnxruntime/contrib_ops/rocm/bert/attention_softmax.h @@ -513,12 +513,12 @@ Status ComputeSoftmaxWithRawMask(hipStream_t stream, } if (use_persistent_softmax) { - dispatch_warpwise_softmax_forward(stream, - output, - persistent_softmax_workspace, - all_sequence_length, - all_sequence_length, - batch_size * num_heads * sequence_length); + return dispatch_warpwise_softmax_forward(stream, + output, + persistent_softmax_workspace, + all_sequence_length, + all_sequence_length, + batch_size * num_heads * sequence_length); } return HIP_CALL(hipPeekAtLastError()); diff --git a/onnxruntime/core/providers/cuda/math/softmax.cc b/onnxruntime/core/providers/cuda/math/softmax.cc index dc1830a192945..5047a70242a5c 100644 --- a/onnxruntime/core/providers/cuda/math/softmax.cc +++ b/onnxruntime/core/providers/cuda/math/softmax.cc @@ -26,15 +26,12 @@ Status SoftMaxComputeHelper( auto X_data = reinterpret_cast(X); if (D <= 1024 && D * sizeof(T) <= 4096) { - dispatch_warpwise_softmax_forward, is_log_softmax>( + return dispatch_warpwise_softmax_forward, is_log_softmax>( stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); - } else { - dispatch_blockwise_softmax_forward, is_log_softmax>( - stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(D), - gsl::narrow_cast(N)); } 
- - return Status::OK(); + return dispatch_blockwise_softmax_forward, is_log_softmax>( + stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(D), + gsl::narrow_cast(N)); } #define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \ diff --git a/onnxruntime/core/providers/cuda/math/softmax.h b/onnxruntime/core/providers/cuda/math/softmax.h index b2528bb0c8855..b66ad32517458 100644 --- a/onnxruntime/core/providers/cuda/math/softmax.h +++ b/onnxruntime/core/providers/cuda/math/softmax.h @@ -18,12 +18,12 @@ Status SoftMaxComputeHelper( int64_t axis); template -void dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, - int softmax_elements, int softmax_elements_stride, int batch_count); +Status dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, + int softmax_elements, int softmax_elements_stride, int batch_count); template -void dispatch_blockwise_softmax_forward(cudaStream_t stream, output_t* output, const input_t* input, - int softmax_elements, int input_stride, int output_stride, int batch_count); +Status dispatch_blockwise_softmax_forward(cudaStream_t stream, output_t* output, const input_t* input, + int softmax_elements, int input_stride, int output_stride, int batch_count); template class Softmax final : public CudaKernel { diff --git a/onnxruntime/core/providers/cuda/math/softmax_impl.cu b/onnxruntime/core/providers/cuda/math/softmax_impl.cu index dafc3a17900ac..4c097f714beb9 100644 --- a/onnxruntime/core/providers/cuda/math/softmax_impl.cu +++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cu @@ -29,9 +29,9 @@ namespace onnxruntime { namespace cuda { template -void dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { +Status dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { - return; + return Status::OK(); } else { int log2_elements = log2_ceil(softmax_elements); const int next_power_of_two = 1 << log2_elements; @@ -99,15 +99,16 @@ void dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t* dst, const break; } } + return CUDA_CALL(cudaGetLastError()); } -#define SPECIALIZED_WRAPWISE_SOFTMAX_IMPL(input_t, output_t, acc_t) \ - template void dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t * dst, \ - const input_t* src, int softmax_elements, \ - int softmax_elements_stride, int batch_count); \ - template void dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t * dst, \ - const input_t* src, int softmax_elements, \ - int softmax_elements_stride, int batch_count); +#define SPECIALIZED_WRAPWISE_SOFTMAX_IMPL(input_t, output_t, acc_t) \ + template Status dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t * dst, \ + const input_t* src, int softmax_elements, \ + int softmax_elements_stride, int batch_count); \ + template Status dispatch_warpwise_softmax_forward(cudaStream_t stream, output_t * dst, \ + const input_t* src, int softmax_elements, \ + int softmax_elements_stride, int batch_count); SPECIALIZED_WRAPWISE_SOFTMAX_IMPL(float, float, float) SPECIALIZED_WRAPWISE_SOFTMAX_IMPL(half, half, float) @@ -115,8 +116,8 @@ SPECIALIZED_WRAPWISE_SOFTMAX_IMPL(double, double, double) SPECIALIZED_WRAPWISE_SOFTMAX_IMPL(BFloat16, BFloat16, float) template -void dispatch_blockwise_softmax_forward(cudaStream_t stream, 
output_t* output, const input_t* input, int softmax_elements, - int input_stride, int output_stride, int batch_count) { +Status dispatch_blockwise_softmax_forward(cudaStream_t stream, output_t* output, const input_t* input, int softmax_elements, + int input_stride, int output_stride, int batch_count) { dim3 grid(batch_count); constexpr int ILP = sizeof(float4) / sizeof(input_t); dim3 block = SoftMax_getBlockSize(ILP, softmax_elements); @@ -129,13 +130,14 @@ void dispatch_blockwise_softmax_forward(cudaStream_t stream, output_t* output, c <<>>(output, const_cast(input), softmax_elements, input_stride, output_stride); } + return CUDA_CALL(cudaGetLastError()); } #define SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(input_t, output_t, acc_t) \ - template void dispatch_blockwise_softmax_forward( \ + template Status dispatch_blockwise_softmax_forward( \ cudaStream_t stream, output_t * output, const input_t* src, int softmax_elements, \ int input_stride, int output_stride, int batch_count); \ - template void dispatch_blockwise_softmax_forward( \ + template Status dispatch_blockwise_softmax_forward( \ cudaStream_t stream, output_t * output, const input_t* src, int softmax_elements, \ int input_stride, int output_stride, int batch_count); diff --git a/onnxruntime/core/providers/rocm/math/softmax.cc b/onnxruntime/core/providers/rocm/math/softmax.cc index 275c8ad3978f5..22bcaecf34f65 100644 --- a/onnxruntime/core/providers/rocm/math/softmax.cc +++ b/onnxruntime/core/providers/rocm/math/softmax.cc @@ -26,15 +26,12 @@ Status SoftMaxComputeHelper( auto X_data = reinterpret_cast(X); if (D <= 1024 && D * sizeof(T) <= 4096) { - dispatch_warpwise_softmax_forward, is_log_softmax>( + return dispatch_warpwise_softmax_forward, is_log_softmax>( stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(N)); - } else { - dispatch_blockwise_softmax_forward, is_log_softmax>( - stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(D), - gsl::narrow_cast(N)); } - - return Status::OK(); + return dispatch_blockwise_softmax_forward, is_log_softmax>( + stream, Y_data, X_data, gsl::narrow_cast(D), gsl::narrow_cast(D), gsl::narrow_cast(D), + gsl::narrow_cast(N)); } #define SPECIALIZED_SOFTMAX_HELPER_IMPL(T) \ diff --git a/onnxruntime/core/providers/rocm/math/softmax_impl.cu b/onnxruntime/core/providers/rocm/math/softmax_impl.cu index f5a26ef045881..d37235acfa0e1 100644 --- a/onnxruntime/core/providers/rocm/math/softmax_impl.cu +++ b/onnxruntime/core/providers/rocm/math/softmax_impl.cu @@ -30,9 +30,9 @@ namespace onnxruntime { namespace rocm { template -void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { +Status dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count) { if (softmax_elements == 0) { - return; + return Status::OK(); } else { int log2_elements = log2_ceil(softmax_elements); const int next_power_of_two = 1 << log2_elements; @@ -88,11 +88,12 @@ void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t* dst, const break; } } + return HIP_CALL(hipGetLastError()); } #define SPECIALIZED_SOFTMAX_IMPL(input_t, output_t, acc_t) \ -template void dispatch_warpwise_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ -template void 
dispatch_warpwise_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); +template Status dispatch_warpwise_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); \ +template Status dispatch_warpwise_softmax_forward(hipStream_t stream, output_t * dst, const input_t* src, int softmax_elements, int softmax_elements_stride, int batch_count); SPECIALIZED_SOFTMAX_IMPL(float, float, float) SPECIALIZED_SOFTMAX_IMPL(half, half, float) @@ -100,7 +101,7 @@ SPECIALIZED_SOFTMAX_IMPL(double, double, double) SPECIALIZED_SOFTMAX_IMPL(BFloat16, BFloat16, float) template -void dispatch_blockwise_softmax_forward(hipStream_t stream, output_t* output, const input_t* input, int softmax_elements, +Status dispatch_blockwise_softmax_forward(hipStream_t stream, output_t* output, const input_t* input, int softmax_elements, int input_stride, int output_stride, int batch_count) { dim3 grid(batch_count); constexpr int ILP = sizeof(float4) / sizeof(input_t); @@ -114,14 +115,15 @@ void dispatch_blockwise_softmax_forward(hipStream_t stream, output_t* output, co <<>>(output, const_cast(input), softmax_elements, input_stride, output_stride); } + return HIP_CALL(hipGetLastError()); } -#define SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(input_t, output_t, acc_t) \ - template void dispatch_blockwise_softmax_forward( \ - hipStream_t stream, output_t * output, const input_t* src, int softmax_elements, \ - int input_stride, int output_stride, int batch_count); \ - template void dispatch_blockwise_softmax_forward( \ - hipStream_t stream, output_t * output, const input_t* src, int softmax_elements, \ +#define SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(input_t, output_t, acc_t) \ + template Status dispatch_blockwise_softmax_forward( \ + hipStream_t stream, output_t * output, const input_t* src, int softmax_elements, \ + int input_stride, int output_stride, int batch_count); \ + template Status dispatch_blockwise_softmax_forward( \ + hipStream_t stream, output_t * output, const input_t* src, int softmax_elements, \ int input_stride, int output_stride, int batch_count); SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(float, float, float) From 20684021dad9339941e62618ec241ddc6ecc0c43 Mon Sep 17 00:00:00 2001 From: cao lei Date: Mon, 6 Feb 2023 09:53:48 -0800 Subject: [PATCH 13/68] do not use raw pointer for CpuBuffersInfo::buffers (#14574) ### Description Do not use raw pointer for CpuBuffersInfo::buffers object ### Motivation and Context This PR is to fix the bug 11159: https://dev.azure.com/aiinfra/ONNX%20Runtime/_workitems/edit/11159/ --- onnxruntime/core/providers/cuda/cuda_stream_handle.cc | 5 ++--- onnxruntime/core/providers/rocm/rocm_stream_handle.cc | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index b818e9b57a7b2..81d0070f1aeaf 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -102,7 +102,7 @@ struct CpuBuffersInfo { // should contain all values in // deferred_release_buffer_pool_[my_stream] // when release my_stream's buffers. - void** buffers; + std::unique_ptr buffers; // CPU buffer buffers[i]. // Number of buffer points in "buffers". 
size_t n_buffers; @@ -117,7 +117,6 @@ static void CUDART_CB ReleaseCpuBufferCallback(void* raw_info) { for (size_t i = 0; i < info->n_buffers; ++i) { info->allocator->Free(info->buffers[i]); } - delete[] info->buffers; } Status CudaStream::CleanUpOnRunEnd() { @@ -128,7 +127,7 @@ Status CudaStream::CleanUpOnRunEnd() { if (release_cpu_buffer_on_cuda_stream_ && cpu_allocator_->Info().alloc_type == OrtArenaAllocator) { std::unique_ptr cpu_buffers_info = std::make_unique(); cpu_buffers_info->allocator = cpu_allocator_; - cpu_buffers_info->buffers = new void*[deferred_cpu_buffers_.size()]; + cpu_buffers_info->buffers = std::make_unique(deferred_cpu_buffers_.size()); for (size_t i = 0; i < deferred_cpu_buffers_.size(); ++i) { cpu_buffers_info->buffers[i] = deferred_cpu_buffers_.at(i); } diff --git a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc index c87fa6983425c..fb6eeb6746376 100644 --- a/onnxruntime/core/providers/rocm/rocm_stream_handle.cc +++ b/onnxruntime/core/providers/rocm/rocm_stream_handle.cc @@ -83,7 +83,7 @@ void RocmStream::EnqueDeferredCPUBuffer(void* cpu_buffer) { struct CpuBuffersInfo { // TODO: should be moved to base class AllocatorPtr allocator; - void** buffers; + std::unique_ptr buffers; // CPU buffer buffers[i]. // Number of buffer points in "buffers". size_t n_buffers; @@ -95,7 +95,6 @@ static void ReleaseCpuBufferCallback(hipStream_t /*stream*/, hipError_t /*status for (size_t i = 0; i < info->n_buffers; ++i) { info->allocator->Free(info->buffers[i]); } - delete[] info->buffers; } Status RocmStream::CleanUpOnRunEnd() { @@ -106,7 +105,7 @@ Status RocmStream::CleanUpOnRunEnd() { if (release_cpu_buffer_on_rocm_stream_ && cpu_allocator_->Info().alloc_type == OrtArenaAllocator) { std::unique_ptr cpu_buffers_info = std::make_unique(); cpu_buffers_info->allocator = cpu_allocator_; - cpu_buffers_info->buffers = new void*[deferred_cpu_buffers_.size()]; + cpu_buffers_info->buffers = std::make_unique(deferred_cpu_buffers_.size()); for (size_t i = 0; i < deferred_cpu_buffers_.size(); ++i) { cpu_buffers_info->buffers[i] = deferred_cpu_buffers_.at(i); } From b8fb9320ac55c32fa8b5bfb1a4a0335537fac048 Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Mon, 6 Feb 2023 10:01:02 -0800 Subject: [PATCH 14/68] [DML EP] Fix ScatterElements registration (#14560) --- docs/OperatorKernels.md | 2 +- .../DmlExecutionProvider/src/Operators/OperatorRegistration.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 7e4eb38be780b..286cad61d599f 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -1090,7 +1090,7 @@ Do not modify directly.* |Scatter|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |||9+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| -|ScatterElements|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|ScatterElements|*in* data:**T**
*in* indices:**Tind**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |||13+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |||11+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**Tind** = tensor(int32), tensor(int64)| |ScatterND|*in* data:**T**
*in* indices:**tensor(int64)**
*in* updates:**T**
*out* output:**T**|16+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 93894983533ce..09a7c923235cd 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -487,7 +487,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO_VER( 13, Scatter, typeNameListScatterGather, supportedTypeListScatterGather, DmlGraphSupport::Supported)}, {REG_INFO( 11, ScatterElements, typeNameListScatterGather, supportedTypeListScatterGather, DmlGraphSupport::Supported)}, {REG_INFO( 13, ScatterElements, typeNameListScatterGather, supportedTypeListScatterGather, DmlGraphSupport::Supported)}, - {REG_INFO( 16, ScatterElements, typeNameListScatterGatherND, supportedTypeListScatterGather, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryScatter)}, + {REG_INFO( 16, ScatterElements, typeNameListScatterGather, supportedTypeListScatterGather, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryScatter)}, {REG_INFO( 11, ScatterND, typeNameListScatterGatherND, supportedTypeListScatterGatherND, DmlGraphSupport::Supported)}, {REG_INFO( 13, ScatterND, typeNameListScatterGatherND, supportedTypeListScatterGatherND, DmlGraphSupport::Supported)}, {REG_INFO( 16, ScatterND, typeNameListScatterGatherND, supportedTypeListScatterGatherND, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryScatter)}, From 6f2dd10d5278a85f85e552b37b3b469ab198d31b Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 6 Feb 2023 15:36:42 -0800 Subject: [PATCH 15/68] IdentityBuilder should add Delimit for each input (#14592) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …("####") should append for each input_def, not only on the last one else branch of this if should return ignore_identity https://github.com/microsoft/onnxruntime/blob/3d7518762ace6929be98e1203174c2dbf1ac094e/onnxruntime/core/optimizer/identical_children_consolidation.cc#L66 identity.append("####") should append for each input_def, not only on the last one ### Description ### Motivation and Context --- .../core/optimizer/identical_children_consolidation.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/optimizer/identical_children_consolidation.cc b/onnxruntime/core/optimizer/identical_children_consolidation.cc index 17f01cebcdb6c..07dc25dabde5f 100644 --- a/onnxruntime/core/optimizer/identical_children_consolidation.cc +++ b/onnxruntime/core/optimizer/identical_children_consolidation.cc @@ -117,8 +117,11 @@ string_view IdenticalChildrenConsolidation::IdentityBuilder(const Graph& graph, } else { identity.append(name); } + } else { + return ignore_identity; } + identity.append("####"); } - return {identity.append("####")}; + return {identity}; } } // namespace onnxruntime \ No newline at end of file From a5dab850b8dfa64d8818eefa01e0df40cf3e4400 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 7 Feb 2023 01:38:00 +0000 Subject: [PATCH 16/68] 
Bump jszip from 3.7.1 to 3.8.0 in /js/web (#14536) --- js/web/package-lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 186fee0dcfd54..3ae7d9a814c5b 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -3654,9 +3654,9 @@ } }, "node_modules/jszip": { - "version": "3.7.1", - "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.7.1.tgz", - "integrity": "sha512-ghL0tz1XG9ZEmRMcEN2vt7xabrDdqHHeykgARpmZ0BiIctWxM47Vt63ZO2dnp4QYt/xJVLLy5Zv1l/xRdh2byg==", + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.8.0.tgz", + "integrity": "sha512-cnpQrXvFSLdsR9KR5/x7zdf6c3m8IhZfZzSblFEHSqBaVwD2nvJ4CuCKLyvKvwBgZm08CgfSoiTBQLm5WW9hGw==", "dev": true, "dependencies": { "lie": "~3.3.0", @@ -9947,9 +9947,9 @@ } }, "jszip": { - "version": "3.7.1", - "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.7.1.tgz", - "integrity": "sha512-ghL0tz1XG9ZEmRMcEN2vt7xabrDdqHHeykgARpmZ0BiIctWxM47Vt63ZO2dnp4QYt/xJVLLy5Zv1l/xRdh2byg==", + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.8.0.tgz", + "integrity": "sha512-cnpQrXvFSLdsR9KR5/x7zdf6c3m8IhZfZzSblFEHSqBaVwD2nvJ4CuCKLyvKvwBgZm08CgfSoiTBQLm5WW9hGw==", "dev": true, "requires": { "lie": "~3.3.0", From d632f9a3fa126a64bcab7be2000220e2a73765be Mon Sep 17 00:00:00 2001 From: ytaous <4484531+ytaous@users.noreply.github.com> Date: Mon, 6 Feb 2023 20:52:06 -0800 Subject: [PATCH 17/68] [ROCm] Enable Sampling Op UT on AMD (#14581) Making basic porting effort to run Sampling UT on ROCm ep, based on the commits: https://github.com/microsoft/onnxruntime/pull/13426 https://github.com/microsoft/onnxruntime/pull/14218 1. enabling EmbedLayerNorm op 2. enabling Sampling op 3. enabling helpers to copy data from CPU->GPU for subgraph This task is the first checkpoint. There could be other missing ops when testing a real model. We will migrate more code onto ROCm as needed. 
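For reference, a minimal sketch (not part of this change) of pointing an inference session at the ROCm EP once these kernels are available; the model path is a placeholder for a generation graph that contains the com.microsoft Sampling operator.

```python
import onnxruntime as ort

# Placeholder model: a GPT-2 generation graph wrapped with the com.microsoft Sampling op.
sess = ort.InferenceSession(
    "gpt2_sampling.onnx",
    providers=["ROCMExecutionProvider", "CPUExecutionProvider"],
)
print(sess.get_providers())  # on a ROCm build with an AMD GPU, ROCMExecutionProvider is listed first
```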
Co-authored-by: Ubuntu --- cmake/onnxruntime_rocm_hipify.cmake | 13 --- .../cpu/transformers/subgraph_base.cc | 4 +- .../cuda/transformers/dump_cuda_tensor.cc | 17 +-- .../cuda/transformers/generation_cuda_impl.cu | 108 +++++++++--------- .../transformers/generation_device_helper.cc | 36 +++--- .../cuda/transformers/sampling_cuda_helper.h | 3 + .../contrib_ops/rocm/rocm_contrib_kernels.cc | 6 +- onnxruntime/core/framework/session_state.cc | 2 +- .../core/providers/rocm/math/softmax_impl.cu | 3 + .../contrib_ops/embed_layer_norm_op_test.cc | 7 +- onnxruntime/test/contrib_ops/sampling_test.cc | 45 ++++---- tools/ci_build/amd_hipify.py | 6 + 12 files changed, 137 insertions(+), 113 deletions(-) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 92a3260714a36..6eb315c59bc80 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -11,10 +11,6 @@ set(contrib_ops_excluded_files "bert/attention_softmax.h" "bert/multihead_attention.cc" "bert/multihead_attention.h" - "bert/embed_layer_norm.cc" - "bert/embed_layer_norm.h" - "bert/embed_layer_norm_impl.cu" - "bert/embed_layer_norm_impl.h" "bert/fast_gelu_impl.cu" "bert/fast_gelu_impl.h" "bert/fast_gelu.cc" @@ -85,17 +81,8 @@ set(contrib_ops_excluded_files "tensor/image_scaler_impl.h" "transformers/beam_search.cc" "transformers/beam_search.h" - "transformers/generation_device_helper.cc" - "transformers/generation_device_helper.h" - "transformers/generation_cuda_impl.cu" - "transformers/generation_cuda_impl.h" "transformers/greedy_search.cc" "transformers/greedy_search.h" - "transformers/sampling.cc" - "transformers/sampling.h" - "transformers/sampling_cuda_helper.h" - "transformers/dump_cuda_tensor.cc" - "transformers/dump_cuda_tensor.h" "conv_transpose_with_dynamic_pads.cc" "conv_transpose_with_dynamic_pads.h" "cuda_contrib_kernels.cc" diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc index c7a2b8f0c0fc1..c8be36a41e944 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc @@ -116,7 +116,9 @@ const IExecutionProvider* Subgraph::GetProvider() const { const ExecutionProviders& providers = session_state_->GetExecutionProviders(); const IExecutionProvider* cpu_provider = providers.Get(onnxruntime::kCpuExecutionProvider); const IExecutionProvider* cuda_provider = providers.Get(onnxruntime::kCudaExecutionProvider); - const IExecutionProvider* provider = cuda_provider ? cuda_provider : cpu_provider; + const IExecutionProvider* rocm_provider = providers.Get(onnxruntime::kRocmExecutionProvider); + const IExecutionProvider* gpu_provider = cuda_provider ? cuda_provider : rocm_provider; + const IExecutionProvider* provider = gpu_provider ? 
gpu_provider : cpu_provider; return provider; } diff --git a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc index 741f9ac259da1..3046a58040635 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/dump_cuda_tensor.cc @@ -17,12 +17,12 @@ class PinnedHostBuffer { public: PinnedHostBuffer(size_t length) : buffer_(nullptr) { - cudaHostAlloc(&buffer_, length * sizeof(T), cudaHostAllocDefault); + CUDA_CALL_THROW(cudaHostAlloc((void**)&buffer_, length * sizeof(T), cudaHostAllocDefault)); } virtual ~PinnedHostBuffer() { if (buffer_) { - cudaFreeHost(buffer_); + CUDA_CALL_THROW(cudaFreeHost(buffer_)); } } @@ -46,8 +46,9 @@ void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, bool i // In that case, we copy tensor data as well. It is not needed, but it keeps code simple. int num_items = dim0 * dim1; auto data = std::make_shared>(num_items); - cudaDeviceSynchronize(); - cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost); + CUDA_CALL_THROW(cudaDeviceSynchronize()); + CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost)); + if (nullptr != name) { std::cout << std::string(name) << std::endl; @@ -64,8 +65,8 @@ template void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, bool is_gpu_tensor) { int num_items = dim0 * dim1 * dim2; auto data = std::make_shared>(num_items); - cudaDeviceSynchronize(); - cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost); + CUDA_CALL_THROW(cudaDeviceSynchronize()); + CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost)); if (nullptr != name) { std::cout << std::string(name) << std::endl; @@ -82,8 +83,8 @@ template void DumpGpuTensor(const char* name, const T* tensor, int dim0, int dim1, int dim2, int dim3, bool is_gpu_tensor) { int num_items = dim0 * dim1 * dim2 * dim3; auto data = std::make_shared>(num_items); - cudaDeviceSynchronize(); - cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost); + CUDA_CALL_THROW(cudaDeviceSynchronize()); + CUDA_CALL_THROW(cudaMemcpy(*data, tensor, num_items * sizeof(T), is_gpu_tensor ? 
cudaMemcpyDeviceToHost : cudaMemcpyHostToHost)); if (nullptr != name) { std::cout << std::string(name) << std::endl; diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index 523603a550be9..90c91228204b6 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -320,33 +320,33 @@ void GetTempStorageSize(const T* d_keys_in, bool is_descending, size_t& temp_storage_bytes) { if (is_descending) { - cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, - temp_storage_bytes, - d_keys_in, - (T*)nullptr, - d_values_in, - (int*)nullptr, - num_items, - num_segments, - d_offsets, - d_offsets + 1, - 0, - sizeof(T) * 8, - stream); + CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(nullptr, + temp_storage_bytes, + d_keys_in, + (T*)nullptr, + d_values_in, + (int*)nullptr, + num_items, + num_segments, + d_offsets, + d_offsets + 1, + 0, + sizeof(T) * 8, + stream)); } else { - cub::DeviceSegmentedRadixSort::SortPairs(nullptr, - temp_storage_bytes, - d_keys_in, - (T*)nullptr, - d_values_in, - (int*)nullptr, - num_items, - num_segments, - d_offsets, - d_offsets + 1, - 0, - sizeof(T) * 8, - stream); + CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(nullptr, + temp_storage_bytes, + d_keys_in, + (T*)nullptr, + d_values_in, + (int*)nullptr, + num_items, + num_segments, + d_offsets, + d_offsets + 1, + 0, + sizeof(T) * 8, + stream)); } } @@ -412,33 +412,33 @@ void LaunchSortPairs(void* d_temp_storage, cudaStream_t stream, bool is_descending) { if (is_descending) { - cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_offsets, - d_offsets + 1, - 0, - sizeof(T) * 8, - stream); + CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + num_segments, + d_offsets, + d_offsets + 1, + 0, + sizeof(T) * 8, + stream)); } else { - cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - num_items, - num_segments, - d_offsets, - d_offsets + 1, - 0, - sizeof(T) * 8, - stream); + CUDA_CALL_THROW(cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + num_items, + num_segments, + d_offsets, + d_offsets + 1, + 0, + sizeof(T) * 8, + stream)); } } @@ -721,9 +721,9 @@ void TorchMultinomialKernelLauncher(float* d_input, cudaStream_t stream) { // Store the props in class variables int device; - cudaGetDevice(&device); + CUDA_CALL_THROW(cudaGetDevice(&device)); cudaDeviceProp props; - cudaGetDeviceProperties(&props, device); + CUDA_CALL_THROW(cudaGetDeviceProperties(&props, device)); int numSM = props.multiProcessorCount; int maxThreads = props.maxThreadsPerBlock; diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index f6be2179dfdbf..703bd6a0e90ff 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -17,14 +17,23 @@ #include "contrib_ops/cpu/transformers/subgraph_gpt.h" #include 
"contrib_ops/cuda/transformers/beam_search_topk.h" #include "contrib_ops/cuda/transformers/greedy_search_top_one.h" + +// the includes would be dummy for ROCm, we will ignore them for now +#ifdef ENABLE_NVTX_PROFILE #include "core/providers/cuda/nvtx_profile.h" #include "core/providers/cuda/nvtx_profile_context.h" +#endif + #include "sampling_cuda_helper.h" #ifdef DEBUG_GENERATION #include #endif +using onnxruntime::cuda::ToCudaType; +using onnxruntime::cuda::TArray; +using onnxruntime::cuda::TopKImpl; + namespace onnxruntime { namespace concurrency { class ThreadPool; @@ -203,12 +212,13 @@ void InitBeamState(transformers::IBeamSearchState* beam_state, // TODO(tianleiwu): we can use another stream to avoid blocking subgraph execution. cudaStream_t cuda_stream = ort_stream ? static_cast(ort_stream->GetHandle()) : nullptr; - cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream); - cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream); - cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream); - cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream); - cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream); - cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream); + CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_logits.data(), 0, beam_state->next_token_logits.size_bytes(), cuda_stream)); + CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_token_scores.data(), 0, beam_state->next_token_scores.size_bytes(), cuda_stream)); + CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_tokens.data(), 0, beam_state->next_tokens.size_bytes(), cuda_stream)); + CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_indices.data(), 0, beam_state->next_indices.size_bytes(), cuda_stream)); + CUDA_CALL_THROW(cudaMemsetAsync(beam_state->next_scores.data(), 0, beam_state->next_scores.size_bytes(), cuda_stream)); + CUDA_CALL_THROW(cudaMemsetAsync(beam_state->topk_buffer.data(), 0, beam_state->topk_buffer.size_bytes(), cuda_stream)); + // Initialize score of first beam of each group with 0 and the rest with -1e9. cuda::LaunchInitKernel(beam_state->beam_scores.data(), batch_size, num_beams, cuda_stream); @@ -216,8 +226,8 @@ void InitBeamState(transformers::IBeamSearchState* beam_state, // copy sequence lengths to GPU // since next_positions is only needed to update feeds after subgraph execution, so it is fine to use Async here. if (!beam_state->next_positions.empty()) { // next_positions is empty for T5 - cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(), - cudaMemcpyHostToDevice, cuda_stream); + CUDA_CALL_THROW(cudaMemcpyAsync(beam_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(), + cudaMemcpyHostToDevice, cuda_stream)); } #ifdef ENABLE_NVTX_PROFILE @@ -234,12 +244,12 @@ void InitGreedyState(transformers::IGreedySearchState* greedy_state, initStateRange.Begin(); #endif - cudaStream_t cuda_stream = ort_stream ? 
reinterpret_cast(ort_stream->GetHandle()) : nullptr; - cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream); - cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream); + cudaStream_t cuda_stream = ort_stream ? reinterpret_cast(ort_stream->GetHandle()) : nullptr; + CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_token_scores.data(), 0, greedy_state->next_token_scores.size_bytes(), cuda_stream)); + CUDA_CALL_THROW(cudaMemsetAsync(greedy_state->next_positions.data(), 0, greedy_state->next_positions.size_bytes(), cuda_stream)); - cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(), - cudaMemcpyHostToDevice, cuda_stream); + CUDA_CALL_THROW(cudaMemcpyAsync(greedy_state->next_positions.data(), sequence_lengths.data(), sequence_lengths.size_bytes(), + cudaMemcpyHostToDevice, cuda_stream)); #ifdef ENABLE_NVTX_PROFILE initStateRange.End(); diff --git a/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h b/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h index 753aea9d38089..2a5875aba5fa1 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/sampling_cuda_helper.h @@ -11,6 +11,9 @@ #include #endif +using onnxruntime::cuda::ToCudaType; +using onnxruntime::cuda::dispatch_blockwise_softmax_forward; + namespace onnxruntime { namespace contrib { namespace SamplingCudaHelper { diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index b92efc3a6109a..e056c8cbfb64d 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -69,6 +69,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ParametricSoftplus); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ParametricSoftplus); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ParametricSoftplus); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Sampling); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, float, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, double, ScaledTanh); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, MLFloat16, ScaledTanh); @@ -166,8 +167,8 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { 1, MLFloat16, DecoderAttention)>, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, @@ -178,6 +179,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index facce93cde798..d9c02702d733e 100644 --- a/onnxruntime/core/framework/session_state.cc +++ 
b/onnxruntime/core/framework/session_state.cc @@ -1004,7 +1004,7 @@ Status SessionState::CreateSubgraphSessionState() { for (auto& node : graph_.Nodes()) { for (auto& entry : node.GetAttributeNameToMutableSubgraphMap()) { const auto& ep = node.GetExecutionProviderType(); - if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider) { + if (!ep.empty() && ep != kCpuExecutionProvider && ep != kCudaExecutionProvider && ep != kRocmExecutionProvider) { // SessionState is only used when ORT is executing the subgraph. If a non-ORT EP has taken the control flow // node containing the subgraph it will create whatever state it needs internally. continue; diff --git a/onnxruntime/core/providers/rocm/math/softmax_impl.cu b/onnxruntime/core/providers/rocm/math/softmax_impl.cu index d37235acfa0e1..1948878e7bb3f 100644 --- a/onnxruntime/core/providers/rocm/math/softmax_impl.cu +++ b/onnxruntime/core/providers/rocm/math/softmax_impl.cu @@ -131,6 +131,9 @@ SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, half, float) SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(double, double, double) SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(BFloat16, BFloat16, float) +#ifndef DISABLE_CONTRIB_OPS +SPECIALIZED_BLOCKWISE_SOFTMAX_IMPL(half, float, float) // used by BeamSearch op +#endif } } diff --git a/onnxruntime/test/contrib_ops/embed_layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/embed_layer_norm_op_test.cc index be384a20a3190..884f4422d5d8b 100644 --- a/onnxruntime/test/contrib_ops/embed_layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/embed_layer_norm_op_test.cc @@ -17,10 +17,11 @@ static void RunTest(const embedlayernorm::OpData& data, int min_cuda_architecture = use_float16 ? 530 : 0; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + bool enable_rocm = DefaultRocmExecutionProvider().get() != nullptr; bool enable_dml = DefaultDmlExecutionProvider().get() != nullptr; bool enable_cpu = !use_float16; - if (enable_cpu || enable_cuda || enable_dml) { + if (enable_cpu || enable_cuda || enable_dml || enable_rocm) { // Input and output shapes // Input 0 - input_ids : (batch_size, sequence_size) // Input 1 - segment_ids : (batch_size, sequence_size) @@ -143,6 +144,10 @@ static void RunTest(const embedlayernorm::OpData& data, std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } else if (enable_rocm) { + std::vector> execution_providers; + execution_providers.push_back(DefaultRocmExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } else if (enable_dml) { std::vector> execution_providers; execution_providers.push_back(DefaultDmlExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/sampling_test.cc b/onnxruntime/test/contrib_ops/sampling_test.cc index 48992f24a3234..e0ad20415ca5f 100644 --- a/onnxruntime/test/contrib_ops/sampling_test.cc +++ b/onnxruntime/test/contrib_ops/sampling_test.cc @@ -14,8 +14,8 @@ namespace onnxruntime { namespace test { #if defined(__linux__) && !defined(__ANDROID__) -#ifdef USE_CUDA -TEST(SamplingTest, Gpt2Sampling_CUDA) { +#if defined(USE_CUDA) || defined(USE_ROCM) +TEST(SamplingTest, Gpt2Sampling_GPU) { std::vector input_ids{ 0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, @@ -25,7 +25,6 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) { std::vector min_length{1}; std::vector repetition_penalty{1.0f}; - std::vector 
expected_output{ 0, 0, 0, 0, 0, 52, 195, 731, 321, 301, 734, 620, 125, 543, 668, 41, 554, 74, 622, 206, 222, 75, 223, 221, 198, 224, 572, 776, 213, 697, @@ -35,9 +34,7 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) { const int64_t sequence_length = 12; std::vector input_ids_shape{batch_size, sequence_length}; - std::vector parameter_shape{1}; - std::vector expected_output_shape{input_ids_shape[0], max_length[0]}; Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); @@ -62,28 +59,36 @@ TEST(SamplingTest, Gpt2Sampling_CUDA) { const char* const output_names[] = {"sequences"}; Ort::SessionOptions session_options; +#ifdef USE_CUDA constexpr int min_cuda_architecture = 530; - if (HasCudaEnvironment(min_cuda_architecture)) { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + if (!HasCudaEnvironment(min_cuda_architecture)) { + LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture"; + return; + } + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); +#else // USE_ROCM + OrtROCMProviderOptions rocm_options; + // TODO - verify the default settings + session_options.AppendExecutionProvider_ROCM(rocm_options); +#endif - Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options); + Ort::Session session(*ort_env, ORT_TSTR("testdata/transformers/tiny_gpt2_sampling.onnx"), session_options); - auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(), - output_names, 1); + auto ort_outputs = session.Run(Ort::RunOptions{}, input_names, ort_inputs.data(), ort_inputs.size(), + output_names, 1); - ASSERT_EQ(ort_outputs.size(), 1U); - const auto& sequences = ort_outputs[0]; - ASSERT_TRUE(sequences.IsTensor()); + ASSERT_EQ(ort_outputs.size(), 1U); + const auto& sequences = ort_outputs[0]; + ASSERT_TRUE(sequences.IsTensor()); - auto result_ts = sequences.GetTensorTypeAndShapeInfo(); - ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType()); + auto result_ts = sequences.GetTensorTypeAndShapeInfo(); + ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, result_ts.GetElementType()); - ASSERT_EQ(expected_output_shape, result_ts.GetShape()); - const auto* result_vals = sequences.GetTensorData(); - auto result_span = gsl::make_span(result_vals, expected_output.size()); + ASSERT_EQ(expected_output_shape, result_ts.GetShape()); + const auto* result_vals = sequences.GetTensorData(); + auto result_span = gsl::make_span(result_vals, expected_output.size()); - ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end())); - } + ASSERT_TRUE(std::equal(expected_output.cbegin(), expected_output.cend(), result_span.begin(), result_span.end())); } #endif diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index e6a5f8f8cc38a..08b258406ffd3 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -59,6 +59,8 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path): s = s.replace("GPU_WARP_SIZE = 32", "GPU_WARP_SIZE = 64") s = s.replace("std::exp", "expf") s = s.replace("std::log", "logf") + s = s.replace("WaitCudaNotificationOnDevice", "WaitRocmNotificationOnDevice") + s = s.replace("hipHostAlloc", "hipHostMalloc") s = s.replace( "#include ", "#include \n#include ", @@ -67,6 +69,10 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path): '#include "cub/device/device_radix_sort.cuh"', "#include \n#include ", ) + s = s.replace( + 
"#include ", + "#include ", + ) s = s.replace( "#include ", "#include " ) From cf8bad7f19830567bde7725835501c0d0dda7a71 Mon Sep 17 00:00:00 2001 From: Chun-Wei Chen Date: Mon, 6 Feb 2023 21:44:04 -0800 Subject: [PATCH 18/68] Fix CI failure: temporarily disable real model tests from onnx repo (#14606) ### Description To faster unblock pipeline failure globally, disable these real models tests from onnx repo for now. Meanwhile, we are trying to move these models to Azure. ### Motivation and Context https://github.com/onnx/onnx/issues/4857 these models in onnx repo are broken. They are setup 4 years ago and the owner of these AWS instances is unfound. --- .../testdata/onnx_backend_test_series_filters.jsonc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 428910849f8db..7a9f09c1851dc 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -131,7 +131,18 @@ "^test_edge_pad_cuda", "^test_reflect_pad_cuda", "^test_softplus_example_expanded_cuda", - "^test_softplus_expanded_cuda" + "^test_softplus_expanded_cuda", + + // TODO: Recover these real model tests from onnx + "^test_vgg19", + "^test_zfnet512", + "^test_bvlc_alexnet", + "^test_densenet121", + "^test_inception_v1", + "^test_inception_v2", + "^test_resnet50", + "^test_shufflenet", + "^test_squeezenet" ], "current_failing_tests_x86": [ "^test_vgg19", From f88a4646cdb47f106fc0838426dabe362353321d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 7 Feb 2023 17:53:53 +0800 Subject: [PATCH 19/68] try VS 2022 in windowsAI pipeline (#14608) ### Description update VS2019 to VS 2022 in onnxruntime-Nuget-WindowsAI-Pipeline-Official ### Motivation and Context --- .pipelines/windowsai-steps.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.pipelines/windowsai-steps.yml b/.pipelines/windowsai-steps.yml index aa9ec339c7b2a..d246b326764a2 100644 --- a/.pipelines/windowsai-steps.yml +++ b/.pipelines/windowsai-steps.yml @@ -3,12 +3,12 @@ parameters: displayName: BuildArch type: string default: 'x64' - + - name: Runtime displayName: MSVC Runtime, should be 'dynamic' or 'static'. 
type: string default: 'dynamic' - + - name: PythonPackageName displayName: PythonPackageName on nuget.org to use type: string @@ -17,8 +17,8 @@ parameters: jobs: - job: Windows_Packaging_${{ parameters.BuildArch }}_${{ parameters.Runtime }} pool: - type: windows - + type: windows + variables: ob_outputDirectory: '$(Build.ArtifactStagingDirectory)' ob_sdl_binskim_break: true @@ -40,7 +40,7 @@ jobs: restoreSolution: $(Build.SourcesDirectory)\.pipelines\nuget_config\x64\packages.config ${{ if eq(parameters.BuildArch, 'arm64') }}: restoreSolution: $(Build.SourcesDirectory)\.pipelines\nuget_config\x64\packages.config - + - script: | @echo off set vswherepath="%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" @@ -50,10 +50,10 @@ jobs: set vsdevcmd="%%i\Common7\Tools\vsdevcmd.bat" ) ) - + @echo vslatest %vslatest% @echo vsdevcmd %vsdevcmd% - + @echo ##vso[task.setvariable variable=vslatest]%vslatest% @echo ##vso[task.setvariable variable=vsdevcmd]%vsdevcmd% -arch=${{ parameters.BuildArch }} displayName: 'locate vsdevcmd via vswhere' @@ -80,7 +80,7 @@ jobs: 7z x cmake-3.24.3-windows-x86_64.zip set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools - $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\ctest.exe + $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\ctest.exe workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Generate cmake config' @@ -97,7 +97,7 @@ jobs: maximumCpuCount: true logProjectEvents: true workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - createLogFile: true + createLogFile: true - ${{ if eq(parameters.Runtime, 'dynamic') }}: - script: | @@ -114,8 +114,8 @@ jobs: copy $(Build.SourcesDirectory)\onnxruntime\test\testdata\sequence_length.onnx $(Build.ArtifactStagingDirectory)\test_artifact\ copy $(Build.SourcesDirectory)\onnxruntime\test\testdata\sequence_construct.onnx $(Build.ArtifactStagingDirectory)\test_artifact\ displayName: 'Copy WinML test collateral to artifact directory' - - + + - ${{ if eq(parameters.BuildArch, 'x64') }}: - script: | call $(vsdevcmd) @@ -129,8 +129,8 @@ jobs: signing_profile: 'external_distribution' files_to_sign: '**/*.exe;**/*.dll' search_root: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - displayName: 'Sign runtime DLLs' - + displayName: 'Sign runtime DLLs' + - ${{ if 
eq(parameters.BuildArch, 'x64') }}: - script: | call $(vsdevcmd) From 742658d171fdabb9032f4f2e5a847687188231b6 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 7 Feb 2023 07:49:15 -0800 Subject: [PATCH 20/68] Stable Diffusion CUDA optimizations Part 2 (#14597) ### Description This is a follow-up of https://github.com/microsoft/onnxruntime/pull/14428 for Stable Diffusion CUDA optimizations: (1) use NchwConv to replace Conv in onnx graph and add Tranpose nodes accordingly (2) reduce sequential Transpose nodes to at most one. (3) symbolic shape infer of NchwConv (4) fix add bias transpose which causes CUDA error (launching more than 1024 threads per block) in inferencing fp32 model. (5) add models (bert, bart, stable_diffusion subdirectories) to package; (6) remove option --disable_channels_last Note that (1) We can add a few graph transformations to reduce Transpose nodes further. It is not done in this PR due to time limit. (2) Stable diffusion 2.1 model outputs black images. It seems that forcing Attention to float32 could avoid the issue. However it is much slow to use float32 Attention. ### Motivation and Context --- cmake/onnxruntime_python.cmake | 21 +++++ .../cuda/bert/add_bias_transpose.cu | 24 ++--- .../python/tools/symbolic_shape_infer.py | 33 +++++-- .../tools/transformers/fusion_nhwc_conv.py | 90 +++++++++++++++++++ .../tools/transformers/fusion_reshape.py | 7 +- .../tools/transformers/fusion_transpose.py | 81 +++++++++++++++++ .../python/tools/transformers/fusion_utils.py | 31 ++++++- .../models/stable_diffusion/benchmark.py | 24 ++--- .../stable_diffusion/optimize_pipeline.py | 24 ++--- .../models/stable_diffusion/requirements.txt | 14 +++ .../python/tools/transformers/onnx_model.py | 12 ++- .../tools/transformers/onnx_model_unet.py | 83 ++++++++++++++++- setup.py | 3 + 13 files changed, 386 insertions(+), 61 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/fusion_nhwc_conv.py create mode 100644 onnxruntime/python/tools/transformers/fusion_transpose.py create mode 100644 onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 809a076443609..c24b6b9be548a 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -467,12 +467,21 @@ file(GLOB onnxruntime_python_quantization_cal_table_flatbuffers_src CONFIGURE_DE file(GLOB onnxruntime_python_transformers_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/*.py" ) +file(GLOB onnxruntime_python_transformers_models_bart_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/bart/*.py" +) +file(GLOB onnxruntime_python_transformers_models_bert_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/bert/*.py" +) file(GLOB onnxruntime_python_transformers_models_gpt2_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/gpt2/*.py" ) file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py" ) +file(GLOB onnxruntime_python_transformers_models_stable_diffusion_src CONFIGURE_DEPENDS + "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/stable_diffusion/*.py" +) file(GLOB onnxruntime_python_transformers_models_t5_src CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/python/tools/transformers/models/t5/*.py" ) @@ -526,8 +535,11 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory 
$/onnxruntime/tools/ort_format_model/ort_flatbuffers_py COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/bart + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/bert COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/gpt2 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/longformer + COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/stable_diffusion COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/transformers/models/t5 COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/quantization/operators @@ -606,12 +618,21 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_src} $/onnxruntime/transformers/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_models_bart_src} + $/onnxruntime/transformers/models/bart/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_models_bert_src} + $/onnxruntime/transformers/models/bert/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_gpt2_src} $/onnxruntime/transformers/models/gpt2/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_longformer_src} $/onnxruntime/transformers/models/longformer/ + COMMAND ${CMAKE_COMMAND} -E copy + ${onnxruntime_python_transformers_models_stable_diffusion_src} + $/onnxruntime/transformers/models/stable_diffusion/ COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_transformers_models_t5_src} $/onnxruntime/transformers/models/t5/ diff --git a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu index e86736726c224..8f271ecfcbfa8 100644 --- a/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu +++ b/onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu @@ -519,6 +519,7 @@ void InvokeAddBiasTranspose( cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block, const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size, const T* input, const T* biases, T* output, T* qkv_add_bias, const int v_head_size, int total_matrix_count) { + assert(num_heads <= max_threads_per_block); const dim3 grid(sequence_length, batch_size, num_matrices); if (qk_head_size * num_heads <= max_threads_per_block) { const dim3 block(qk_head_size, num_heads, 1); @@ -544,7 +545,7 @@ void InvokeAddBiasTranspose( AddBiasTranspose<<>>(input, biases, output); } } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); if (format == 2) { AddBiasTransposeTrtLarge<<>>(qk_head_size, input, biases, output); } else if (format == 1) { @@ -577,7 +578,7 @@ void LaunchAddBiasTranspose( const half* input, const half* biases, half* output, bool enable_half4, const int v_head_size, half* qkv_add_bias, int total_matrix_count) { total_matrix_count = std::max(num_matrices, total_matrix_count); - if (enable_half4 && 0 == (qk_head_size % 4) && 0 == (v_head_size % 4)) { + if (enable_half4 && 0 == (qk_head_size % 4) && (v_head_size == -1 || 0 == (v_head_size % 4))) { const int H = qk_head_size / 4; const int H_v = v_head_size / 4; const Half4* input2 = 
reinterpret_cast(input); @@ -587,7 +588,7 @@ void LaunchAddBiasTranspose( InvokeAddBiasTranspose(stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, H, input2, biases2, output2, qkv_add_bias2, H_v, total_matrix_count); - } else if (0 == (qk_head_size & 1) && 0 == (v_head_size & 1)) { + } else if (0 == (qk_head_size & 1) && (v_head_size == -1 || 0 == (v_head_size & 1))) { const int H = qk_head_size / 2; const int H_v = v_head_size / 2; const half2* input2 = reinterpret_cast(input); @@ -612,7 +613,7 @@ void LaunchAddBiasTranspose( const float* input, const float* biases, float* output, bool /*enable_half4*/, const int v_head_size, float* qkv_add_bias, int total_matrix_count) { total_matrix_count = std::max(num_matrices, total_matrix_count); - if (0 == (qk_head_size % 4) && 0 == (v_head_size % 4)) { + if (0 == (qk_head_size % 4) && (v_head_size == -1 || 0 == (v_head_size % 4))) { const int H = qk_head_size / 4; const float4* input2 = reinterpret_cast(input); const float4* biases2 = reinterpret_cast(biases); @@ -622,7 +623,7 @@ void LaunchAddBiasTranspose( stream, num_matrices, format, max_threads_per_block, batch_size, sequence_length, num_heads, H, input2, biases2, output2, qkv_add_bias2, v_head_size / 4, total_matrix_count); - } else if (0 == (qk_head_size & 1) && 0 == (v_head_size & 1)) { + } else if (0 == (qk_head_size & 1) && (v_head_size == -1 || 0 == (v_head_size & 1))) { const int H = qk_head_size / 2; const float2* input2 = reinterpret_cast(input); const float2* biases2 = reinterpret_cast(biases); @@ -654,7 +655,7 @@ void InvokeAddBiasTransposeTrt( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(query, key, value, biases, output); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, query, key, value, biases, output); } } else { // cross attention @@ -666,7 +667,7 @@ void InvokeAddBiasTransposeTrt( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(query, biases, output); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, query, biases, output); } } @@ -680,7 +681,7 @@ void InvokeAddBiasTransposeTrt( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrtKV<<>>(key, value, biases, packed_kv); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtKVLarge<<>>(head_size, key, value, biases, packed_kv); } } @@ -737,6 +738,7 @@ void InvokeAddBias( const int batch_size, const int sequence_length, const int kv_sequence_length, const int num_heads, const int head_size, const int v_head_size, const T* biases, const T* query, const T* key, const T* value, T* q, T* k, T* v) { + assert(num_heads <= max_threads_per_block); constexpr int num_matrices = 1; // Q { @@ -745,7 +747,7 @@ void InvokeAddBias( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(query, biases, q); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, query, biases, q); } } @@ -758,7 +760,7 @@ void InvokeAddBias( const dim3 block(head_size, num_heads, 1); AddBiasTransposeTrt<<>>(key, 
biases_k, k); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(head_size, key, biases_k, k); } } @@ -772,7 +774,7 @@ void InvokeAddBias( const dim3 block(v_head_size, num_heads, 1); AddBiasTransposeTrt<<>>(value, biases_v, v); } else { - const dim3 block(CeilDiv(max_threads_per_block, num_heads), num_heads, 1); + const dim3 block(max_threads_per_block / num_heads, num_heads, 1); AddBiasTransposeTrtLarge<<>>(v_head_size, value, biases_v, v); } } diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index ae320279d724a..842d5cd943226 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -203,6 +203,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): "SkipSimplifiedLayerNormalization": self._infer_SkipLayerNormalization, "GroupNorm": self._infer_GroupNorm, "BiasSplitGelu": self._infer_BiasSplitGelu, + "NhwcConv": self._infer_NhwcConv, } self.aten_op_dispatcher_ = { "embedding": self._infer_Gather, @@ -442,6 +443,7 @@ def _onnx_infer_single_node(self, node): "MultiHeadAttention", "GroupNorm", "BiasSplitGelu", + "NhwcConv", ] if not skip_infer: @@ -623,13 +625,13 @@ def _new_symbolic_dim_from_output(self, node, out_idx=0, dim=0): def _new_symbolic_shape(self, rank, node, out_idx=0): return [self._new_symbolic_dim_from_output(node, out_idx, i) for i in range(rank)] - def _compute_conv_pool_shape(self, node): + def _compute_conv_pool_shape(self, node, channels_last=False): sympy_shape = self._get_sympy_shape(node, 0) if len(node.input) > 1: W_shape = self._get_sympy_shape(node, 1) rank = len(W_shape) - 2 # number of spatial axes - kernel_shape = W_shape[-rank:] - sympy_shape[1] = W_shape[0] + kernel_shape = W_shape[-rank - 1 : -1] if channels_last else W_shape[-rank:] + sympy_shape[3 if channels_last else 1] = W_shape[0] else: W_shape = None kernel_shape = get_attribute(node, "kernel_shape") @@ -638,13 +640,17 @@ def _compute_conv_pool_shape(self, node): assert len(sympy_shape) == rank + 2 # only need to symbolic shape inference if input has symbolic dims in spatial axes - is_symbolic_dims = [not is_literal(i) for i in sympy_shape[-rank:]] + spatial_shape = sympy_shape[-rank - 1 : -1] if channels_last else sympy_shape[-rank:] + is_symbolic_dims = [not is_literal(i) for i in spatial_shape] if not any(is_symbolic_dims): shape = get_shape_from_value_info(self.known_vi_[node.output[0]]) if len(shape) > 0: assert len(sympy_shape) == len(shape) - sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] + if channels_last: + sympy_shape[-rank - 1 : -1] = [sympy.Integer(d) for d in shape[-rank - 1 : -1]] + else: + sympy_shape[-rank:] = [sympy.Integer(d) for d in shape[-rank:]] return sympy_shape dilations = get_attribute(node, "dilations", [1] * rank) @@ -675,7 +681,7 @@ def _compute_conv_pool_shape(self, node): ceil_mode = get_attribute(node, "ceil_mode", 0) for i in range(rank): - effective_input_size = sympy_shape[-rank + i] + effective_input_size = sympy_shape[-rank + i + (-1 if channels_last else 0)] if len(total_pads) > 0: effective_input_size = effective_input_size + total_pads[i] if ceil_mode: @@ -684,7 +690,7 @@ def _compute_conv_pool_shape(self, node): ) else: strided_kernel_positions = (effective_input_size - effective_kernel_shape[i]) // strides[i] - sympy_shape[-rank + i] = 
strided_kernel_positions + 1 + sympy_shape[-rank + i + (-1 if channels_last else 0)] = strided_kernel_positions + 1 return sympy_shape def _check_merged_dims(self, dims, allow_broadcast=True): @@ -918,6 +924,18 @@ def _infer_Conv(self, node): ) ) + def _infer_NhwcConv(self, node): + sympy_shape = self._compute_conv_pool_shape(node, channels_last=True) + self._update_computed_dims(sympy_shape) + vi = self.known_vi_[node.output[0]] + vi.CopyFrom( + helper.make_tensor_value_info( + node.output[0], + self.known_vi_[node.input[0]].type.tensor_type.elem_type, + get_shape_from_sympy_shape(sympy_shape), + ) + ) + def _infer_Einsum(self, node): # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 equation = get_attribute(node, "equation") @@ -2459,6 +2477,7 @@ def infer_shapes(in_mp, int_max=2**31 - 1, auto_merge=False, guess_output_rank=F all_shapes_inferred = symbolic_shape_inference._infer_impl() symbolic_shape_inference._update_output_from_vi() if not all_shapes_inferred: + onnx.save_model(symbolic_shape_inference.out_mp_, "sym_shape_infer_temp.onnx", save_as_external_data=True) raise Exception("Incomplete symbolic shape inference") return symbolic_shape_inference.out_mp_ diff --git a/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py new file mode 100644 index 0000000000000..d8ecb652800f6 --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_nhwc_conv.py @@ -0,0 +1,90 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import List + +from fusion_base import Fusion +from onnx import TensorProto, helper, numpy_helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionNhwcConv(Fusion): + """Convert Conv to NhwcConv""" + + def __init__(self, model: OnnxModel, update_weight=False): + super().__init__(model, "NhwcConv", ["Conv"], "NhwcConv") + self.update_weight = update_weight + + def create_transpose_node(self, input_name: str, perm: List[int], output_name=None): + """Append a Transpose node after an input""" + node_name = self.model.create_node_name("Transpose") + + if output_name is None: + output_name = node_name + "_out" + "-" + input_name + + transpose_node = helper.make_node("Transpose", inputs=[input_name], outputs=[output_name], name=node_name) + transpose_node.attribute.extend([helper.make_attribute("perm", perm)]) + + return transpose_node + + def fuse(self, conv, input_name_to_nodes, output_name_to_node): + # Add Transpose node to convert input from NCHW to NHWC + input_transpose_node = self.create_transpose_node(conv.input[0], [0, 2, 3, 1]) + + nhwc_conv_input = input_transpose_node.output[0] + + # Create a tensor for transposed weights (already in NHWC format). 
+ node_name = self.model.create_node_name("NhwcConv") + + # Make sure the weights is 4D + weight_tensor = self.model.get_initializer(conv.input[1]) + if weight_tensor is None: + return + weight = numpy_helper.to_array(weight_tensor) + if len(weight.shape) != 4: + return + + if self.update_weight: + # Transpose weights from NCHW to NHWC + weight = weight.transpose(0, 2, 3, 1) + + weight_name = node_name + "_weight_NHWC" + nhwc_weight = helper.make_tensor( + name=weight_name, + data_type=TensorProto.FLOAT, + dims=list(weight.shape), + vals=weight.flatten().tolist(), + ) + self.model.add_initializer(nhwc_weight, self.this_graph_name) + weight_transpose_node = None + else: + weight_transpose_node = self.create_transpose_node(conv.input[1], [0, 2, 3, 1]) + weight_name = weight_transpose_node.output[0] + + nhwc_output_name = node_name + "_out" + "-" + conv.output[0] + nhwc_conv = helper.make_node( + "NhwcConv", + inputs=[nhwc_conv_input, weight_name] + conv.input[2:], + outputs=[nhwc_output_name], + name=node_name + "-" + conv.name, + ) + nhwc_conv.attribute.extend(conv.attribute) + nhwc_conv.domain = "com.microsoft" + + output_transpose_node = self.create_transpose_node(nhwc_conv.output[0], [0, 3, 1, 2], conv.output[0]) + + self.nodes_to_remove.append(conv) + + nodes_to_add = [input_transpose_node, nhwc_conv, output_transpose_node] + if weight_transpose_node: + nodes_to_add.append(weight_transpose_node) + for node in nodes_to_add: + self.node_name_to_graph_name[node.name] = self.this_graph_name + self.nodes_to_add.extend(nodes_to_add) + + self.increase_counter("NhwcConv") diff --git a/onnxruntime/python/tools/transformers/fusion_reshape.py b/onnxruntime/python/tools/transformers/fusion_reshape.py index 75caa255b1c24..853038f7460d7 100644 --- a/onnxruntime/python/tools/transformers/fusion_reshape.py +++ b/onnxruntime/python/tools/transformers/fusion_reshape.py @@ -119,16 +119,15 @@ def fuse(self, reshape_node, input_name_to_nodes, output_name_to_node): shape_nodes.extend([path2[-1], path3[-1]]) shape.append(-1) elif len(concat_node.input) > 2: - concat_2 = self.model.get_initializer(concat_node.input[2]) - if concat_2 is None: + concat_value = self.model.get_constant_value(concat_node.input[2]) + if concat_value is None: return - concat_value = numpy_helper.to_array(concat_2) if isinstance(concat_value, np.ndarray): shape.extend(concat_value.tolist()) else: shape.append(concat_value) - if len(concat_node.input) == 4 and self.model.get_initializer(concat_node.input[3]) is None: + if len(concat_node.input) == 4 and self.model.get_constant_value(concat_node.input[3]) is None: if -1 in shape: return diff --git a/onnxruntime/python/tools/transformers/fusion_transpose.py b/onnxruntime/python/tools/transformers/fusion_transpose.py new file mode 100644 index 0000000000000..d92ddd5f8e678 --- /dev/null +++ b/onnxruntime/python/tools/transformers/fusion_transpose.py @@ -0,0 +1,81 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- + +from logging import getLogger +from typing import Dict, List + +from fusion_base import Fusion +from fusion_utils import FusionUtils +from onnx import NodeProto, helper +from onnx_model import OnnxModel + +logger = getLogger(__name__) + + +class FusionTranspose(Fusion): + def __init__(self, model: OnnxModel): + super().__init__(model, "Transpose", "Transpose") + + def fuse( + self, + transpose_node: NodeProto, + input_name_to_nodes: Dict[str, List[NodeProto]], + output_name_to_node: Dict[str, NodeProto], + ): + """ + Case 1: + (input)-->Transpose(perm=a)-->Transpose(perm=b)--> + After: + (input)-->Transpose(perm=a)--> (this path can be removed if the output is not used anymore) + | + +----->Transpose(perm=a*b)--> + + Case 2 (Cast has only one child): + (input)-->Transpose(perm=a)--> Cast -->Transpose(perm=b)--> + After: + (input)-->Transpose(perm=a)--> (this path can be removed if the output is not used anymore) + | + +----->Cast --> Transpose(perm=a*b)--> + + + """ + transpose_b = transpose_node + if transpose_b.input[0] not in output_name_to_node: + return + + transpose_a = output_name_to_node[transpose_b.input[0]] + if transpose_a.op_type != "Cast": + cast_node = None + else: + cast_node = transpose_a + + cast_children = self.model.get_children(cast_node, input_name_to_nodes) + if cast_children and len(cast_children) > 1: + return + transpose_a = output_name_to_node[cast_node.input[0]] + + if transpose_a.op_type != "Transpose": + return + + permutation = OnnxModel.get_node_attribute(transpose_b, "perm") + assert isinstance(permutation, list) + + parent_permutation = OnnxModel.get_node_attribute(transpose_a, "perm") + assert isinstance(parent_permutation, list) + + assert len(parent_permutation) == len(permutation) + + output_permutation = [] + for j, index in enumerate(permutation): + output_permutation.append(parent_permutation[index]) + + if cast_node is None: + if FusionUtils.skip_parent(self.model, transpose_b, transpose_a, input_name_to_nodes): + self.nodes_to_remove.append(transpose_a) + else: + if FusionUtils.skip_parent(self.model, cast_node, transpose_a, input_name_to_nodes): + self.nodes_to_remove.append(transpose_a) + transpose_b.ClearField("attribute") + transpose_b.attribute.extend([helper.make_attribute("perm", output_permutation)]) diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 8363f2674cd40..07fdf490337a4 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -73,6 +73,32 @@ def remove_cast_int32(self, input_name: str): self.model.remove_node(node) self.model.replace_input_of_all_nodes(output_name, input_name) + @staticmethod + def skip_parent(model: OnnxModel, node, parent_node, input_name_to_nodes): + """ + Before: + (input)-->parent-->node-->(output) + After: + (input)-->parent--> + | + +----->node-->(output) + + This function returns a flag about whether the parent node can be removed. + Note that this function assumes the node has first input links from parent! + """ + parent_can_be_removed = False + input_name_to_nodes[node.input[0]].remove(node) + # We can remove the first Transpose if its output is not used (linked to graph output or other nodes) anymore. + if len(input_name_to_nodes[node.input[0]]) == 0 and not model.find_graph_output( + node.input[0] + ): # checks main graph output. 
TODO: deal with subgraph + parent_can_be_removed = True + # self.nodes_to_remove.append(transpose_a) + + input_name_to_nodes[parent_node.input[0]].append(node) + node.input[0] = parent_node.input[0] + return parent_can_be_removed + @staticmethod def check_node_attribute(node, attribute_name: str, expected_value, default_value=None): """Verify that a node has expected value for an attribute. @@ -228,7 +254,10 @@ def remove_useless_reshape_nodes(self): graph_output_names = set(self.model.get_graphs_output_names()) for node in nodes_to_remove: if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): + if ( + not bool(set(node.input) & graph_input_names) + and len(self.model.input_name_to_nodes()[node.input[0]]) == 1 # parent has only one child + ): self.model.replace_output_of_all_nodes(node.input[0], node.output[0]) else: continue diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py index 580c5ef4c3cca..9a00dc8684f32 100755 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py @@ -62,7 +62,7 @@ def get_ort_pipeline(model_name: str, directory: str, provider: str, disable_saf return pipe -def get_torch_pipeline(model_name: str, disable_channels_last: bool, disable_safety_checker: bool): +def get_torch_pipeline(model_name: str, disable_safety_checker: bool): from diffusers import StableDiffusionPipeline from torch import channels_last, float16 @@ -70,8 +70,7 @@ def get_torch_pipeline(model_name: str, disable_channels_last: bool, disable_saf model_name, torch_dtype=float16, revision="fp16", use_auth_token=True ).to("cuda") - if not disable_channels_last: - pipe.unet.to(memory_format=channels_last) # in-place operation + pipe.unet.to(memory_format=channels_last) # in-place operation if disable_safety_checker: pipe.safety_checker = None @@ -144,7 +143,7 @@ def run_ort(model_name: str, directory: str, provider: str, batch_size: int, dis run_ort_pipeline(pipe, batch_size, image_filename_prefix) -def run_torch(model_name: str, batch_size: int, disable_channels_last: bool, disable_safety_checker: bool): +def run_torch(model_name: str, batch_size: int, disable_safety_checker: bool): import torch torch.backends.cudnn.enabled = True @@ -154,13 +153,11 @@ def run_torch(model_name: str, batch_size: int, disable_channels_last: bool, dis torch.set_grad_enabled(False) load_start = time.time() - pipe = get_torch_pipeline(model_name, disable_channels_last, disable_safety_checker) + pipe = get_torch_pipeline(model_name, disable_safety_checker) load_end = time.time() print(f"Model loading took {load_end - load_start} seconds") - image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker) + ( - "" if disable_channels_last else "_channels_last" - ) + image_filename_prefix = get_image_filename_prefix("torch", model_name, batch_size, disable_safety_checker) with torch.inference_mode(): run_torch_pipeline(pipe, batch_size, image_filename_prefix) @@ -196,15 +193,6 @@ def parse_arguments(): help="Directory of saved onnx pipeline. It could be output directory of optimize_pipeline.py.", ) - parser.add_argument( - "-c", - "--disable_channels_last", - required=False, - action="store_true", - help="Disable channels last for torch. 
It will be ignored for onnxruntime engine", - ) - parser.set_defaults(disable_channels_last=False) - parser.add_argument( "--enable_safety_checker", required=False, @@ -237,7 +225,7 @@ def main(): provider = "CUDAExecutionProvider" # TODO: use ["CUDAExecutionProvider", "CPUExecutionProvider"] in diffuers run_ort(sd_model, args.pipeline, provider, args.batch_size, not args.enable_safety_checker) else: - run_torch(sd_model, args.batch_size, args.disable_channels_last, not args.enable_safety_checker) + run_torch(sd_model, args.batch_size, not args.enable_safety_checker) if __name__ == "__main__": diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 0979f0d2ddcb5..932be4a19ae6b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -11,18 +11,15 @@ # huggingface-cli login # wget https://raw.githubusercontent.com/huggingface/diffusers/v0.12.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py # python convert_stable_diffusion_checkpoint_to_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path $ONNX_ROOT/stable-diffusion-v1-5-fp32 -# python convert_stable_diffusion_checkpoint_to_onnx.py --model_path stabilityai/stable-diffusion-2-1 --output_path $ONNX_ROOT/stable-diffusion-v2-1-fp32 -# Note that this script might not be compatible with older or newer version of diffusers/transformers. It is because fusion script need change accordingly when onnx graph is changed. +# Note that this script might not be compatible with older or newer version of diffusers. # Then you can use this script to convert them to float16 like the following: # python optimize_pipeline.py -i $ONNX_ROOT/stable-diffusion-v1-5-fp32 -o $ONNX_ROOT/stable-diffusion-v1-5-fp16 --float16 -# python optimize_pipeline.py -i $ONNX_ROOT/stable-diffusion-v2-1-fp32 -o $ONNX_ROOT/stable-diffusion-v2-1-fp16 --float16 # Or -# pip install -U onnxruntime-gpu >= 1.14 # python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i $ONNX_ROOT/stable-diffusion-v1-5-fp32 -o $ONNX_ROOT/stable-diffusion-v1-5-fp16 --float16 -# python -m onnxruntime.transformers.models.stable_diffusion.optimize_pipeline -i $ONNX_ROOT/stable-diffusion-v2-1-fp32 -o $ONNX_ROOT/stable-diffusion-v2-1-fp16 --float16 - -# Note that float16 model is for CUDA Execution Provider. It might not run in CPU Execution Provider. +# +# Note that output model is for CUDA Execution Provider. It might not run in CPU Execution Provider. +# Stable diffusion 2.1 model will get black images using float16 Attention. It is a known issue that we are working on. import argparse import logging @@ -40,7 +37,7 @@ logger = logging.getLogger(__name__) -def optimize_stable_diffusion_onnx_pipeline( +def optimize_sd_pipeline( source_dir: Path, target_dir: Path, overwrite: bool, use_external_data_format: bool, float16: bool ): """Optimize onnx models used in stable diffusion onnx pipeline and optionally convert to float16. @@ -66,23 +63,18 @@ def optimize_stable_diffusion_onnx_pipeline( raise RuntimeError(message) continue - num_heads = 0 - hidden_size = 0 - # Graph fusion before fp16 conversion, otherwise they cannot be fused later. # Right now, onnxruntime does not save >2GB model so we use script to optimize unet instead. 
logger.info(f"optimize {onnx_model_path}...") fusion_options = FusionOptions("unet") - # packed kv requires compute capacity >= 7.5 (like T4, A100, RTX 2060~4090. See https://developer.nvidia.com/cuda-gpus) - # Suggest to disable it if you are using older GPU like V100, RTX 1060/1070/1080, or using float32 model. fusion_options.enable_packed_kv = float16 m = optimize_model( str(onnx_model_path), model_type="unet", - num_heads=num_heads, - hidden_size=hidden_size, + num_heads=0, # will be deduced from graph + hidden_size=0, # will be deduced from graph opt_level=0, optimization_options=fusion_options, use_gpu=False, @@ -211,7 +203,7 @@ def main(): coloredlogs.install(fmt="%(funcName)20s: %(message)s") args = parse_arguments() copy_extra_directory(Path(args.input), Path(args.output), args.overwrite) - optimize_stable_diffusion_onnx_pipeline( + optimize_sd_pipeline( Path(args.input), Path(args.output), args.overwrite, args.use_external_data_format, args.float16 ) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt new file mode 100644 index 0000000000000..8b57df8852765 --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt @@ -0,0 +1,14 @@ +# Install the following package in python 3.10 +diffusers==0.12.1 +transformers==4.26.0 +numpy==1.24.1 +accelerate==0.15.0 +onnxruntime-gpu>=1.14 +onnx==1.13.0 +coloredlogs +packaging==23.0 +protobuf==3.20.3 +psutil==5.9.4 +sympy==1.11.1 +--extra-index-url https://download.pytorch.org/whl/cu117 +torch==1.13.1+cu117 diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 96c22b5894c60..42fd4d5909a30 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -128,6 +128,8 @@ def remove_node(self, node): for graph in self.graphs(): if node in graph.node: graph.node.remove(node) + return + logger.warning("Failed to remove node %s", node) # It might be a bug to hit this line. def remove_nodes(self, nodes_to_remove): for node in nodes_to_remove: @@ -182,6 +184,12 @@ def replace_node_output(node, old_output_name, new_output_name): node.output[j] = new_output_name def replace_output_of_all_nodes(self, old_output_name, new_output_name): + # This function shall be used carefully. For example: + # Add --[old_name]--> Cast ---> [new_name] + # | + # +----[old_name]--> Transpose --> + # If we want to remove the Cast node: replace output of Add to new_name is not enough; + # The input of Transpose shall also be updated to new_name. 
for node in self.model.graph.node: OnnxModel.replace_node_output(node, old_output_name, new_output_name) @@ -553,7 +561,9 @@ def get_data_type(input_or_output_name): graph_output_names = set(self.get_graphs_output_names()) for node in nodes_to_remove: if bool(set(node.output) & graph_output_names): - if not bool(set(node.input) & graph_input_names): + if (not bool(set(node.input) & graph_input_names)) and len( + self.input_name_to_nodes()[node.input[0]] + ) == 1: self.replace_output_of_all_nodes(node.input[0], node.output[0]) else: continue diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py index feba717bd8f6f..32a98149825c3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_unet.py +++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py @@ -9,8 +9,11 @@ from fusion_attention_unet import FusionAttentionUnet from fusion_biassplitgelu import FusionBiasSplitGelu from fusion_group_norm import FusionGroupNorm +from fusion_nhwc_conv import FusionNhwcConv from fusion_options import FusionOptions +from fusion_transpose import FusionTranspose from onnx import ModelProto +from onnx_model import OnnxModel from onnx_model_bert import BertOnnxModel logger = getLogger(__name__) @@ -30,10 +33,61 @@ def __init__(self, model: ModelProto, num_heads: int = 0, hidden_size: int = 0): super().__init__(model, num_heads=num_heads, hidden_size=hidden_size) def preprocess(self): - return + self.remove_useless_div() def postprocess(self): + self.merge_sequential_transpose() self.prune_graph() + self.remove_unused_constant() + + def remove_useless_div(self): + """Remove Div by 1""" + div_nodes = [node for node in self.nodes() if node.op_type == "Div"] + + nodes_to_remove = [] + for div in div_nodes: + if self.find_constant_input(div, 1.0) == 1: + nodes_to_remove.append(div) + + for node in nodes_to_remove: + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + + if nodes_to_remove: + self.remove_nodes(nodes_to_remove) + logger.info("Removed %d useless Div (by 1) nodes", len(nodes_to_remove)) + + def convert_conv_to_nhwc(self): + # Do not update weight here since save external data has a bug + conv_to_nhwc_conv = FusionNhwcConv(self, update_weight=False) + conv_to_nhwc_conv.apply() + + def merge_sequential_transpose(self): + fusion_transpose = FusionTranspose(self) + fusion_transpose.apply() + + remove_count = 0 + nodes = self.get_nodes_by_op_type("Transpose") + for node in nodes: + permutation = OnnxModel.get_node_attribute(node, "perm") + assert isinstance(permutation, list) + if permutation != list(range(len(permutation))): + continue + assert not ( + self.find_graph_output(node.output[0]) + or self.find_graph_input(node.input[0]) + or self.find_graph_output(node.input[0]) + ) + + # Let all children nodes skip current Transpose node and link to its parent + # Note that we cannot update parent node output since parent node might have more than one children. + self.replace_input_of_all_nodes(node.output[0], node.input[0]) + + self.remove_node(node) + remove_count += 1 + + total = len(fusion_transpose.nodes_to_remove) + remove_count + if total: + logger.info("Removed %d Transpose nodes", total) def optimize(self, options: Optional[FusionOptions] = None): if (options is not None) and not options.enable_shape_inference: @@ -78,7 +132,7 @@ def optimize(self, options: Optional[FusionOptions] = None): # Remove reshape nodes that having same shape of input and output based on symbolic shape inference. 
self.utils.remove_useless_reshape_nodes() - self.postprocess() + self.convert_conv_to_nhwc() if (options is None) or options.enable_bias_skip_layer_norm: # Fuse SkipLayerNormalization and Add Bias before it. @@ -87,6 +141,29 @@ def optimize(self, options: Optional[FusionOptions] = None): if options is not None and options.enable_gelu_approximation: self.gelu_approximation() - self.remove_unused_constant() + self.postprocess() logger.info(f"opset version: {self.get_opset_version()}") + + def get_fused_operator_statistics(self): + """ + Returns node count of fused operators. + """ + op_count = {} + ops = [ + "Attention", + "MultiHeadAttention", + "Gelu", + "FastGelu", + "LayerNormalization", + "SkipLayerNormalization", + "BiasSplitGelu", + "GroupNorm", + "NhwcConv", + ] + for op in ops: + nodes = self.get_nodes_by_op_type(op) + op_count[op] = len(nodes) + + logger.info(f"Optimized operators:{op_count}") + return op_count diff --git a/setup.py b/setup.py index 0c10195dc3b62..294b975a56595 100644 --- a/setup.py +++ b/setup.py @@ -481,9 +481,12 @@ def finalize_options(self): "onnxruntime.quantization.operators", "onnxruntime.quantization.CalTableFlatBuffers", "onnxruntime.transformers", + "onnxruntime.transformers.models.bart", + "onnxruntime.transformers.models.bert", "onnxruntime.transformers.models.gpt2", "onnxruntime.transformers.models.longformer", "onnxruntime.transformers.models.t5", + "onnxruntime.transformers.models.stable_diffusion", ] package_data = {"onnxruntime.tools.mobile_helpers": ["*.md", "*.config"]} From 8de885fdb1b070dc85e19c92f1dccdeaee482131 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 7 Feb 2023 09:03:14 -0800 Subject: [PATCH 21/68] reduce cuda library binary size (#14555) ### Description Reduce the cuda library size by: 1. refactoring beam_search_top_k to reduce template instantiation. It saves ~56MB 2. opt out TopK for type uint*, int8_t and int16_t. It saves ~50MB. 
### Motivation and Context --- cmake/CMakeLists.txt | 1 + docs/OperatorKernels.md | 6 +-- .../cuda/transformers/beam_search_topk.cu | 53 +------------------ .../cuda/transformers/beam_search_topk.h | 12 ----- .../transformers/generation_device_helper.cc | 47 +++++++++------- onnxruntime/core/providers/cuda/math/topk.cc | 41 +++++++++----- .../core/providers/cuda/math/topk_impl_i16.cu | 5 -- .../core/providers/cuda/math/topk_impl_i8.cu | 5 -- .../core/providers/cuda/math/topk_impl_u16.cu | 5 -- .../core/providers/cuda/math/topk_impl_u32.cu | 5 -- .../core/providers/cuda/math/topk_impl_u64.cu | 5 -- .../core/providers/cuda/math/topk_impl_u8.cu | 5 -- 12 files changed, 61 insertions(+), 129 deletions(-) delete mode 100644 onnxruntime/core/providers/cuda/math/topk_impl_i16.cu delete mode 100644 onnxruntime/core/providers/cuda/math/topk_impl_i8.cu delete mode 100644 onnxruntime/core/providers/cuda/math/topk_impl_u16.cu delete mode 100644 onnxruntime/core/providers/cuda/math/topk_impl_u32.cu delete mode 100644 onnxruntime/core/providers/cuda/math/topk_impl_u64.cu delete mode 100644 onnxruntime/core/providers/cuda/math/topk_impl_u8.cu diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 57abcb04ba0e8..9eab5a2a0fe3b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -603,6 +603,7 @@ if (onnxruntime_USE_CUDA) list(APPEND ORT_PROVIDER_FLAGS -DUSE_FLASH_ATTENTION=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_FLASH_ATTENTION=1) endif() + endif() if (onnxruntime_USE_VITISAI) list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 286cad61d599f..618214acae75d 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -768,9 +768,9 @@ Do not modify directly.* |||1+|**T** = tensor(double), tensor(float), tensor(float16)| |Tile|*in* input:**T**
*in* repeats:**T1**
*out* output:**T**

or

*in* input:**T**
*in* tiles:**T**
*in* axis:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)
**T1** = tensor(int64)| |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)
**T1** = tensor(int64)| -|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||10|**I** = tensor(int64)
**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|||[1, 9]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|TopK|*in* X:**T**
*in* K:**tensor(int64)**
*out* Values:**T**
*out* Indices:**I**

or

*in* X:**T**
*out* Values:**T**
*out* Indices:**I**|11+|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| +|||10|**I** = tensor(int64)
**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| +|||[1, 9]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| |Transpose|*in* data:**T**
*out* transposed:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |Trilu|*in* input:**T**
*in* k:**tensor(int64)**
*out* output:**T**|14+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu index 5c54c03a05d1a..dcbc733f2acb2 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.cu @@ -291,7 +291,7 @@ void LaunchBatchTopKKernel(const T* topk_scores, int32_t num_beams, int32_t k, cudaStream_t stream) { - ORT_ENFORCE(k <= 256, "LaunchBatchTopKKernel doesn't support k >= 256"); + ORT_ENFORCE(k <= 64, "LaunchBatchTopKKernel doesn't support k >= 64"); #define BatchTopKKernelLauncher(K) \ BatchTopKKernel<<>>(topk_scores, \ @@ -311,12 +311,8 @@ void LaunchBatchTopKKernel(const T* topk_scores, BatchTopKKernelLauncher(16); } else if (k <= 32) { BatchTopKKernelLauncher(32); - } else if (k <= 64) { - BatchTopKKernelLauncher(64); - } else if (k <= 128) { - BatchTopKKernelLauncher(128); } else { - BatchTopKKernelLauncher(256); + BatchTopKKernelLauncher(64); } } @@ -330,36 +326,6 @@ template void LaunchBatchTopKKernel(const float* topk_scores, int32_t k, cudaStream_t stream); -template void LaunchBatchTopKKernel(const float* topk_scores, - const int64_t* topk_tokens, - int32_t* next_indices, - int32_t* next_tokens, - float* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - -template void LaunchBatchTopKKernel(const half* topk_scores, - const int32_t* topk_tokens, - int32_t* next_indices, - int32_t* next_tokens, - half* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - -template void LaunchBatchTopKKernel(const half* topk_scores, - const int64_t* topk_tokens, - int32_t* next_indices, - int32_t* next_tokens, - half* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - template void BeamSearchTopK( const T* input, @@ -426,21 +392,6 @@ template void BeamSearchTopK( int32_t* output_indices, cudaStream_t stream); -template void BeamSearchTopK( - const half* input, - int32_t batch_size, - int32_t num_beams, - int32_t vocab_size, - int32_t k, - half* tmp_values_1st_stage, - int32_t* tmp_indices_1st_stage, - half* tmp_values_2st_stage, - int32_t* tmp_indices_2st_stage, - half* output_values, - int32_t* output_tokens, - int32_t* output_indices, - cudaStream_t stream); - } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h index 5e338b417e8a5..096448c002e36 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h +++ b/onnxruntime/contrib_ops/cuda/transformers/beam_search_topk.h @@ -11,18 +11,6 @@ namespace onnxruntime { namespace contrib { namespace cuda { -template -void LaunchBatchTopKKernel( - const T* topk_scores, - const I* topk_indices, - int32_t* next_indices, - int32_t* next_tokens, - T* next_scores, - int32_t batch_size, - int32_t num_beams, - int32_t k, - cudaStream_t stream); - template void BeamSearchTopK( const T* input, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index 703bd6a0e90ff..3895d16d4deec 100644 --- 
a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -440,12 +440,16 @@ Status ProcessLogits(const OrtValue& logits, // dumper->Print("next_indices before scorer", beam_state->next_indices.data(), batch_size, 2 * num_beams); dumper->Print("next_scores before scorer", beam_state->next_scores.data(), batch_size, 2 * num_beams); #endif + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_scores.data(), + beam_state->next_scores.data(), + beam_state->next_scores.size_bytes(), + cudaMemcpyDeviceToHost, + cuda_stream)); } else { // Apply top-k selection like the following: // next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) // next_token_scores, next_tokens = torch.topk(next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True) - // int64_t next_token_scores_dims[] = {batch_size, num_beams * vocab_size}; - int64_t next_token_scores_dims[] = {batch_size * num_beams, vocab_size}; + int64_t next_token_scores_dims[] = {batch_size, num_beams * vocab_size}; TensorShape next_token_scores_shape(&next_token_scores_dims[0], 2); auto element_type = DataTypeImpl::GetType(); @@ -460,31 +464,36 @@ Status ProcessLogits(const OrtValue& logits, // constexpr bool sorted = true; // results returned in sorted order. std::unique_ptr topk_scores = Tensor::CreateDefault(); - std::unique_ptr topk_tokens = Tensor::CreateDefault(); + std::unique_ptr topk_indices = Tensor::CreateDefault(); ORT_RETURN_IF_ERROR(TopK(&input, axis, top_k, largest, sorted, allocator, ort_stream, thread_pool, - *topk_scores, *topk_tokens)); + *topk_scores, *topk_indices)); #ifdef DEBUG_GENERATION dumper->Print("topk_scores", *(topk_scores.get())); - dumper->Print("topk_tokens", *(topk_tokens.get())); + dumper->Print("topk_indices", *(topk_indices.get())); +#endif + + // Convert indices in range [0, num_beams * vocab_size) to token ID of range [0, vocab_size) like the following: + // next_indices = (next_tokens / vocab_size).long() + // next_tokens = next_tokens % vocab_size + const int64_t* next_token_indices = topk_indices->Data(); + cuda::LaunchNextTokenKernel(next_token_indices, beam_state->next_indices.data(), beam_state->next_tokens.data(), + batch_size, top_k, vocab_size, cuda_stream); + + const float* data = topk_scores->Data(); +#ifdef DEBUG_GENERATION + dumper->Print("next_scores before scorer", data, batch_size, top_k); + dumper->Print("next_tokens before scorer", beam_state->next_tokens.data(), batch_size, top_k); + dumper->Print("next_indices before scorer", beam_state->next_indices.data(), batch_size, top_k); #endif - cuda::LaunchBatchTopKKernel(topk_scores->Data(), - topk_tokens->Data(), - beam_state->next_indices.data(), - beam_state->next_tokens.data(), - beam_state->next_scores.data(), - batch_size, - num_beams, - 2 * num_beams, - cuda_stream); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_scores.data(), + data, + topk_scores->SizeInBytes(), + cudaMemcpyDeviceToHost, + cuda_stream)); } - CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_scores.data(), - beam_state->next_scores.data(), - beam_state->next_scores.size_bytes(), - cudaMemcpyDeviceToHost, - cuda_stream)); CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(cpu_state->topk_tokens.data(), beam_state->next_tokens.data(), beam_state->next_tokens.size_bytes(), diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc index 7ea165c611cb9..3b0edaa559ce9 100644 --- 
a/onnxruntime/core/providers/cuda/math/topk.cc +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -12,7 +12,12 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( kOnnxDomain, 1, 9, kCudaExecutionProvider, - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + (*KernelDefBuilder::Create()) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), TopK); ONNX_OPERATOR_VERSIONED_KERNEL_EX( @@ -20,7 +25,14 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( kOnnxDomain, 10, 10, kCudaExecutionProvider, - (*KernelDefBuilder::Create()).InputMemoryType(OrtMemTypeCPUInput, 1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()).TypeConstraint("I", DataTypeImpl::GetTensorType()), + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("I", DataTypeImpl::GetTensorType()), TopK); ONNX_OPERATOR_KERNEL_EX( @@ -28,7 +40,14 @@ ONNX_OPERATOR_KERNEL_EX( kOnnxDomain, 11, kCudaExecutionProvider, - (*KernelDefBuilder::Create()).InputMemoryType(OrtMemTypeCPUInput, 1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()).TypeConstraint("I", DataTypeImpl::GetTensorType()), + (*KernelDefBuilder::Create()) + .InputMemoryType(OrtMemTypeCPUInput, 1) + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("I", DataTypeImpl::GetTensorType()), TopK); template @@ -42,11 +61,11 @@ TopK::TopK(const OpKernelInfo& info) : CudaKernel(info) { } #define IS_PRIM_TYPE(T) utils::IsPrimitiveDataType(prim_type) -#define TOPKIMPL(T) TopKImpl(this, ctx->GetComputeStream(), tensor_X->Data(), \ - static_cast(tensor_V->MutableDataRaw()), \ - static_cast(tensor_I->MutableDataRaw()), \ - elem_nums_cuda, \ - elem_nums.size(), \ +#define TOPKIMPL(T) TopKImpl(this, ctx->GetComputeStream(), tensor_X->Data(), \ + static_cast(tensor_V->MutableDataRaw()), \ + static_cast(tensor_I->MutableDataRaw()), \ + elem_nums_cuda, \ + elem_nums.size(), \ axis, K_, largest_, sorted_, N, dimension) template @@ -87,12 +106,6 @@ Status TopK::ComputeInternal(OpKernelContext* ctx) const { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for TopK operator"); } - if (IS_PRIM_TYPE(uint8_t)) return TOPKIMPL(uint8_t); - if (IS_PRIM_TYPE(uint16_t)) return TOPKIMPL(uint16_t); - if (IS_PRIM_TYPE(uint32_t)) return TOPKIMPL(uint32_t); - if (IS_PRIM_TYPE(uint64_t)) return TOPKIMPL(uint64_t); - if (IS_PRIM_TYPE(int8_t)) return TOPKIMPL(int8_t); - if (IS_PRIM_TYPE(int16_t)) return TOPKIMPL(int16_t); if (IS_PRIM_TYPE(int32_t)) return TOPKIMPL(int32_t); if (IS_PRIM_TYPE(int64_t)) return TOPKIMPL(int64_t); if (IS_PRIM_TYPE(MLFloat16)) return TOPKIMPL(MLFloat16); diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_i16.cu b/onnxruntime/core/providers/cuda/math/topk_impl_i16.cu deleted file mode 100644 index e194bd1bfd15a..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_i16.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#define TOPK_IMPL_TYPE int16_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_i8.cu b/onnxruntime/core/providers/cuda/math/topk_impl_i8.cu deleted file mode 100644 index db32e9e43392f..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_i8.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE int8_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u16.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u16.cu deleted file mode 100644 index c9ed54e832e9e..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u16.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint16_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u32.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u32.cu deleted file mode 100644 index fceb367e7eb03..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u32.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint32_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u64.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u64.cu deleted file mode 100644 index 1a7b3f2aed878..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u64.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint64_t -#include "topk_impl.cuh" diff --git a/onnxruntime/core/providers/cuda/math/topk_impl_u8.cu b/onnxruntime/core/providers/cuda/math/topk_impl_u8.cu deleted file mode 100644 index 7fcd4b81b3bf9..0000000000000 --- a/onnxruntime/core/providers/cuda/math/topk_impl_u8.cu +++ /dev/null @@ -1,5 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#define TOPK_IMPL_TYPE uint8_t -#include "topk_impl.cuh" From 585f43e31ddd70ce0cae77cd1207e2752999a12e Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 7 Feb 2023 09:22:30 -0800 Subject: [PATCH 22/68] Remove Identical Children Consolidation from default transformer uitil. (#14602) ### Description ### Motivation and Context Co-authored-by: Scott McKay --- onnxruntime/core/optimizer/graph_transformer_utils.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index fdee3c19f2e8e..53545c66508c5 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -200,7 +200,6 @@ InlinedVector> GenerateTransformers( // CSE. For example, if A and B nodes both do Add operation with a same value but different initializers, by // default, CSE will not merge them, because the different initializers are represented by different NodeArg. 
if (session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableDoubleQDQRemover, "0") == "0"){ - transformers.emplace_back(std::make_unique()); transformers.emplace_back(std::make_unique()); } transformers.emplace_back(std::make_unique()); From b6bec543415f16eaf8b687d016e425f5c470cd48 Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Tue, 7 Feb 2023 09:58:25 -0800 Subject: [PATCH 23/68] Revert mimalloc from v2.0.9 to v2.0.3 (#14603) Revert mimalloc from v2.0.9 to v2.0.3 to silence build error in [post-merge ](https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=273075&view=logs&j=f019f681-ae8f-5ee4-d119-02530df66a84&t=6c90c65c-2ab2-56af-633f-b5631256a8e1&l=351) pipeline. New dependency version was generated [here](https://aiinfra.visualstudio.com/Lotus/_artifacts/feed/Lotus/UPack/onnxruntime_build_dependencies/overview/1.0.29). Co-authored-by: Randy Shuai Co-authored-by: rui-ren --- ThirdPartyNotices.txt | 28 ------------------- cgmanifests/generated/cgmanifest.json | 12 +------- cmake/deps.txt | 3 +- .../templates/download-deps.yml | 4 +-- 4 files changed, 4 insertions(+), 43 deletions(-) diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index d1aeed4f51a16..e925f75090a46 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -5239,34 +5239,6 @@ PERFORMANCE OF THIS SOFTWARE. _____ -microsoft/vcpkg, https://github.com/microsoft/vcpkg - -Copyright (c) Microsoft Corporation - -All rights reserved. - -MIT License - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
- -_____ - openssl/openssl, https://github.com/openssl/openssl Apache License diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 567fe2255df46..378647f273ab9 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -282,7 +282,7 @@ "component": { "type": "git", "git": { - "commitHash": "28cf67e5b64c704cad993c71f29a24e781bee544", + "commitHash": "f412df7a2b64421e1f1d61fde6055a6ea288e8f5", "repositoryUrl": "https://github.com/microsoft/mimalloc.git" }, "comments": "mimalloc" @@ -408,16 +408,6 @@ "comments": "cutlass" } }, - { - "component": { - "type": "git", - "git": { - "commitHash": "6f7ffeb18f99796233b958aaaf14ec7bd4fb64b2", - "repositoryUrl": "https://github.com/microsoft/vcpkg.git" - }, - "comments": "vcpkg" - } - }, { "component": { "type": "git", diff --git a/cmake/deps.txt b/cmake/deps.txt index 3a1a691985ea1..d16245ba833cb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -21,7 +21,7 @@ googlexnnpack;https://github.com/google/XNNPACK/archive/003c580e696a774afdc98499 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14 microsoft_wil;https://github.com/microsoft/wil/archive/5f4caba4e7a9017816e47becdd918fcc872039ba.zip;fd119887d0d17c37adf1fc227b054befa28158ad -mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.0.9.zip;9d4205c93805b5525de57c6c7ed7f60e770ffdac +mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.0.3.zip;e4f37b93b2da78a5816c2495603a4188d316214b mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.79.0.zip;c8f04e378535ededbe5af52c8f969d2dedbe73d5 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.13.0.zip;8dda5079cdb5a134b08b0c73f4592a6404fc2dc6 #use the commit where it's several commits after 8.5-GA branch (https://github.com/onnx/onnx-tensorrt/commit/369d6676423c2a6dbf4a5665c4b5010240d99d3c) @@ -36,7 +36,6 @@ safeint;https://github.com/dcleblanc/SafeInt/archive/ff15c6ada150a5018c5ef217240 tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v2.11.0.zip;be70c559f07251ba7f33c789dba98872b444c10f # below are deps introduced by triton client, might remove after 1.14 release -vcpkg;https://github.com/microsoft/vcpkg/archive/refs/tags/2022.11.14.zip;3f983141351af5db2d6c3ca965959845f27d5d51 openssl;https://github.com/openssl/openssl/archive/refs/tags/openssl-3.0.7.zip;dda8fc81308555410505eb4a9eab3e1da0436a1d rapidjson;https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.zip;0fe7b4f7b83df4b3d517f4a202f3a383af7a0818 boost;https://github.com/boostorg/boost/archive/refs/tags/boost-1.81.0.zip;f6ab0da855f825b4eb1abd949967d01a4c5e4e1b diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index baace703ca28a..35ff58ba6f7ee 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: 
'517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.28 + version: 1.0.29 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.28 + version: 1.0.29 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. From b539c364eec10cf238ffaf2e323ababaa05e433b Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Tue, 7 Feb 2023 11:51:06 -0800 Subject: [PATCH 24/68] Some kernel changes for TULR (#14517) ### Description 1. fix a bug in relative position bias kernel where seq_len > 32 2. rename extra_add_qk to relative_position_bias 3. support relative_position_bias in multihead attention (B, N, S, S*) 4. gru_gate support by Lei ### Motivation and Context --------- Co-authored-by: Ubuntu Co-authored-by: Lei Zhang --- cmake/onnxruntime_rocm_hipify.cmake | 4 + docs/ContribOperators.md | 63 ++++- docs/OperatorKernels.md | 11 +- onnxruntime/contrib_ops/cpu/bert/attention.cc | 6 +- .../contrib_ops/cpu/bert/attention_base.cc | 48 ++-- .../contrib_ops/cpu/bert/attention_base.h | 4 +- .../contrib_ops/cpu/bert/attention_cpu_base.h | 38 +-- .../cpu/bert/multihead_attention_helper.h | 32 +++ .../cpu/quantization/attention_quant.cc | 2 +- .../contrib_ops/cuda/bert/attention.cc | 12 +- .../contrib_ops/cuda/bert/attention_impl.cu | 6 +- .../contrib_ops/cuda/bert/attention_impl.h | 2 +- .../contrib_ops/cuda/bert/attention_softmax.h | 10 +- .../cuda/bert/multihead_attention.cc | 7 +- .../cuda/bert/relative_attn_bias.cc | 121 +++++++- .../cuda/bert/relative_attn_bias.h | 12 + .../cuda/bert/relative_attn_bias_impl.cu | 118 +++++++- .../cuda/bert/relative_attn_bias_impl.h | 15 + .../contrib_ops/cuda/cuda_contrib_kernels.cc | 4 + .../quantization/attention_quantization.cc | 4 +- .../qordered_ops/qordered_attention.cc | 2 +- .../qordered_attention_input_enum.h | 2 +- .../contrib_ops/rocm/bert/attention.cc | 6 +- .../contrib_ops/rocm/bert/attention_impl.cu | 14 +- .../contrib_ops/rocm/bert/attention_impl.h | 2 +- .../core/graph/contrib_ops/bert_defs.cc | 44 ++- onnxruntime/core/graph/contrib_ops/ms_opset.h | 2 + .../graph/contrib_ops/quantization_defs.cc | 4 +- .../core/providers/cpu/cpu_provider_shared.cc | 4 +- .../core/providers/cpu/cpu_provider_shared.h | 2 +- .../src/Operators/DmlOperatorAttention.cpp | 6 +- .../provider_bridge_provider.cc | 4 +- .../tools/transformers/fusion_attention.py | 2 +- .../python/tools/transformers/onnx_model.py | 18 +- .../tools/transformers/onnx_model_tnlr.py | 6 +- .../test/contrib_ops/attention_op_test.cc | 30 +- .../contrib_ops/qordered_attention_test.cc | 2 +- .../relative_attention_bias_test.cc | 265 +++++++++++++++++- 38 files changed, 802 insertions(+), 132 deletions(-) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 6eb315c59bc80..b118161b1c45a 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -15,6 +15,10 @@ set(contrib_ops_excluded_files "bert/fast_gelu_impl.h" "bert/fast_gelu.cc" "bert/fast_gelu.h" + "bert/relative_attn_bias.cc" + "bert/relative_attn_bias.h" + "bert/relative_attn_bias_impl.cu" + "bert/relative_attn_bias_impl.h" "bert/skip_layer_norm.cc" "bert/skip_layer_norm.h" "bert/skip_layer_norm_impl.cu" diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 8cd6d4c9e26f1..f01a7ab14a61e 100644 --- a/docs/ContribOperators.md +++ 
b/docs/ContribOperators.md
@@ -30,6 +30,7 @@ Do not modify directly.*
   * com.microsoft.FusedConv
   * com.microsoft.FusedGemm
   * com.microsoft.FusedMatMul
+  * com.microsoft.GatedRelativePositionBias
   * com.microsoft.GatherND
   * com.microsoft.Gelu
   * com.microsoft.GemmFastGelu
@@ -152,7 +153,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length), or index with shape (batch_size) or (2 * batch_size)
past (optional) : T
past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size)When past_present_share_buffer is set, its shape is (2, batch_size, num_heads, max_sequence_length, head_size)
-extra_add (optional) : T
+relative_position_bias (optional) : T
additional add to QxK' with shape (batch_size, num_heads, sequence_length, total_sequence_length)
past_sequence_length (optional) : M
When past_present_share_buffer is used, it is required to specify past_sequence_length (could be 0).
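The shape above makes the semantics concrete: the bias is simply added to the scaled QxK' scores (together with any attention mask) before the softmax. A minimal NumPy sketch, illustrative only — the helper name and argument layout are not part of the operator spec:

```python
import numpy as np

def attention_with_relative_position_bias(q, k, v, rel_pos_bias, mask=None):
    """Illustrative only. q/k/v: (batch, num_heads, seq_len, head_size);
    rel_pos_bias: (batch, num_heads, seq_len, total_seq_len)."""
    scores = np.matmul(q, np.swapaxes(k, -1, -2)) / np.sqrt(q.shape[-1])
    if mask is not None:
        scores = scores + mask              # e.g. 0 to keep, a large negative value to mask out
    scores = scores + rel_pos_bias          # the relative_position_bias input of this operator
    scores = scores - scores.max(axis=-1, keepdims=True)
    probs = np.exp(scores)
    probs /= probs.sum(axis=-1, keepdims=True)
    return np.matmul(probs, v)              # (batch, num_heads, seq_len, head_size)
```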
@@ -1608,6 +1609,58 @@ This version of the operator has been available since version 1 of the 'com.micr
+### **com.microsoft.GatedRelativePositionBias** + + query_layer = (query_layer + query_bias).reshape(batch_size, seq_len, num_heads, head_size).transpose(1, 2) + gate_u, gate_r = torch.sigmoid( + self.gate_ur_linear(query_layer).view(batch_size, num_head, seq_len, 2, D/2).sum(-1, keepdim=False) + ).chunk(2, dim=-1) + gate_u_1 = gate_u * (gate_r * self.eco_a - 1.0) + 2.0 + rel_pos_bias = gate_u_1 * rel_pos + +#### Version + +This version of the operator has been available since version 1 of the 'com.microsoft' operator set. + +#### Attributes + +
+
num_heads : int (required)
+
Number of attention heads
+
+ +#### Inputs + +
+
query_layer : T
+
tensor with shape (batch_size, seq_len, num_heads x head_size)
+
query_bias : T
+
1-d tensor with shape (num_heads x head_size)
+
rel_pos : T
+
tensor with shape (1, num_head, seq_len, seq_len)
+
weight : T
+
gemm weight for the gated_ur_linear, shape (head_size, D), D is divisible by 2
+
bias : T
+
bias for the gated_ur_linear, shape (D)
+
eco_a : T
+
tensor of shape (1, num_heads, 1, 1)
+
+ +#### Outputs + +
+
output : T
+
output tensor with shape (batch_size, num_heads, seq_len, seq_len)
+
+ +#### Type Constraints + +
+
T : tensor(float), tensor(float16)
+
Constrain input and output types to float tensors.
+
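The pseudo-code at the top of this section maps to roughly the following self-contained PyTorch sketch. It is a rendering of the documented math, not the kernel itself; the helper name and the explicit weight/bias arguments are illustrative (the operator takes them as graph inputs):

```python
import torch

def gated_relative_position_bias(query_layer, query_bias, rel_pos, weight, bias, eco_a, num_heads):
    """query_layer: (B, S, N*H), query_bias: (N*H), rel_pos: (1, N, S, S),
    weight: (H, D), bias: (D) with D even, eco_a: (1, N, 1, 1)."""
    batch_size, seq_len, hidden = query_layer.shape
    head_size = hidden // num_heads
    q = (query_layer + query_bias).view(batch_size, seq_len, num_heads, head_size).transpose(1, 2)
    proj = torch.matmul(q, weight) + bias                      # gate_ur_linear: (B, N, S, D)
    d = proj.shape[-1]
    gates = torch.sigmoid(proj.view(batch_size, num_heads, seq_len, 2, d // 2).sum(-1))
    gate_u, gate_r = gates.chunk(2, dim=-1)                    # each (B, N, S, 1)
    gate_u_1 = gate_u * (gate_r * eco_a - 1.0) + 2.0
    return gate_u_1 * rel_pos                                  # (B, N, S, S)
```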
+ + ### **com.microsoft.GatherND** Given `data` tensor of rank r >= 1, and `indices` tensor of rank q >= 1, gather @@ -2222,7 +2275,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Number of attention heads
-#### Inputs (2 - 5)
+#### Inputs (2 - 6)
query : T
@@ -2235,6 +2288,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Bias tensor with shape (hidden_size + hidden_size + v_hidden_size) from input projection
key_padding_mask (optional) : M
Key padding mask with shape (batch_size) or (batch_size, kv_sequence_length)
+relative_position_bias (optional) : T
+relative position bias: addition to QxK' with shape (batch_size, num_heads, sequence_length, total_sequence_length) or (1, num_heads, sequence_length, total_sequence_length)
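The leading dimension of 1 lets a single bias tensor be shared by the whole batch: it broadcasts when added to the (batch_size, num_heads, sequence_length, total_sequence_length) scores. A tiny, hypothetical PyTorch check of that broadcast:

```python
import torch

batch_size, num_heads, seq_len, total_seq_len = 2, 12, 4, 4
scores = torch.randn(batch_size, num_heads, seq_len, total_seq_len)   # Q x K' scores
shared_bias = torch.randn(1, num_heads, seq_len, total_seq_len)       # one bias for all batch entries
assert (scores + shared_bias).shape == (batch_size, num_heads, seq_len, total_seq_len)
```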
#### Outputs @@ -3221,7 +3276,7 @@ This version of the operator has been available since version 1 of the 'com.micr left-side padding, mask_index has shape (2 * batch_size), where the values are the exclusive end positions followed by the inclusive start positions. When unidirectional is 1, and each token only attend to previous tokens. For GPT-2, both past and present state are optional. Present state could appear in output even when past state is not in input. - Current version does not support past/present, extra_add and qkv_hidden_sizes. + Current version does not support past/present, relative_position_bias and qkv_hidden_sizes. TODO: Support them if needed in the future. #### Version @@ -3286,7 +3341,7 @@ This version of the operator has been available since version 1 of the 'com.micr
Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, past_sequence_length + sequence_length)or (batch_size, sequence_length, past_sequence_length + sequence_length), or index with shape (batch_size) or (2 * batch_size).
past (optional) : Q
past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size).
-extra_add (optional) : S
+relative_position_bias (optional) : S
additional add to QxK' with shape (batch_size, num_heads, sequence_length, sequence_length).
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 618214acae75d..00b71d2946215 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -417,7 +417,7 @@ Do not modify directly.* | | | | |**Operator Domain:** *com.microsoft*|||| -|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* extra_add:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**T** = tensor(float)| +|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* relative_position_bias:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**T** = tensor(float)| |AttnLSTM|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*in* initial_c:**T**
*in* P:**T**
*in* QW:**T**
*in* MW:**T**
*in* V:**T**
*in* M:**T**
*in* memory_seq_lens:**T1**
*in* AW:**T**
*out* Y:**T**
*out* Y_h:**T**
*out* Y_c:**T**|1+|**T** = tensor(double), tensor(float)
**T1** = tensor(int32)| |BeamSearch|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**|1+|**T** = tensor(float)| |BiasGelu|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float)| @@ -785,7 +785,7 @@ Do not modify directly.* | | | | |**Operator Domain:** *com.microsoft*|||| -|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* extra_add:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**T** = tensor(float), tensor(float16)| +|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* relative_position_bias:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**T** = tensor(float), tensor(float16)| |BeamSearch|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* num_beams:**I**
*in* num_return_sequences:**I**
*in* length_penalty:**T**
*in* repetition_penalty:**T**
*in* vocab_mask:**M**
*in* prefix_vocab_mask:**M**
*in* attention_mask:**I**
*out* sequences:**I**
*out* sequences_scores:**T**
*out* scores:**T**|1+|**T** = tensor(float), tensor(float16)| |BiasDropout|*in* data:**T**
*in* bias:**T**
*in* residual:**T**
*in* ratio:**T1**
*in* training_mode:**T2**
*out* output:**T**
*out* mask:**T2**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)
**T2** = tensor(bool)| |BiasGelu|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| @@ -803,6 +803,7 @@ Do not modify directly.* |FastGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(float), tensor(float16)| |FusedConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*in* Z:**T**
*out* Y:**T**|1+|**T** = tensor(float)| |FusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)| +|GatedRelativePositionBias|*in* query_layer:**T**
*in* query_bias:**T**
*in* rel_pos:**T**
*in* weight:**T**
*in* bias:**T**
*in* eco_a:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |Gelu|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |GreedySearch|*in* input_ids:**I**
*in* max_length:**I**
*in* min_length:**I**
*in* repetition_penalty:**T**
*in* vocab_mask:**I**
*in* prefix_vocab_mask:**I**
*in* attention_mask:**I**
*out* sequences:**I**|1+|**T** = tensor(float), tensor(float16)| |GridSample|*in* X:**T1**
*in* Grid:**T1**
*out* Y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(float)| @@ -810,11 +811,11 @@ Do not modify directly.* |Inverse|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |Irfft|*in* X:**T**
*out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)| |LongformerAttention|*in* input:**T**
*in* weight:**T**
*in* bias:**T**
*in* mask:**T**
*in* global_weight:**T**
*in* global_bias:**T**
*in* global:**G**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| -|MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |NGramRepeatBlock|*in* input_ids:**Tid**
*in* scores:**T**
*out* scores_out:**T**|1+|**T** = tensor(float)
**Tid** = tensor(int64)| |NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |QAttention|*in* input:**T1**
*in* weight:**T2**
*in* bias:**T3**
*in* input_scale:**T3**
*in* weight_scale:**T3**
*in* mask_index:**T4**
*in* input_zero_point:**T1**
*in* weight_zero_point:**T2**
*in* past:**T3**
*out* output:**T3**
*out* present:**T3**|1+|**T1** = tensor(int8)
**T2** = tensor(int8)
**T3** = tensor(float), tensor(float16)
**T4** = tensor(int32)| -|QOrderedAttention|*in* input:**Q**
*in* scale_input:**S**
*in* scale_Q_gemm:**S**
*in* scale_K_gemm:**S**
*in* scale_V_gemm:**S**
*in* Q_weight:**Q**
*in* K_weight:**Q**
*in* V_weight:**Q**
*in* scale_Q_weight:**S**
*in* scale_K_weight:**S**
*in* scale_V_weight:**S**
*in* Q_bias:**S**
*in* K_bias:**S**
*in* V_bias:**S**
*in* scale_QKT_gemm:**S**
*in* scale_QKT_softmax:**S**
*in* scale_values_gemm:**S**
*in* mask_index:**G**
*in* past:**Q**
*in* extra_add:**S**
*out* output:**Q**|1+|**G** = tensor(int32)
**Q** = tensor(int8)
**S** = tensor(float)| +|QOrderedAttention|*in* input:**Q**
*in* scale_input:**S**
*in* scale_Q_gemm:**S**
*in* scale_K_gemm:**S**
*in* scale_V_gemm:**S**
*in* Q_weight:**Q**
*in* K_weight:**Q**
*in* V_weight:**Q**
*in* scale_Q_weight:**S**
*in* scale_K_weight:**S**
*in* scale_V_weight:**S**
*in* Q_bias:**S**
*in* K_bias:**S**
*in* V_bias:**S**
*in* scale_QKT_gemm:**S**
*in* scale_QKT_softmax:**S**
*in* scale_values_gemm:**S**
*in* mask_index:**G**
*in* past:**Q**
*in* relative_position_bias:**S**
*out* output:**Q**|1+|**G** = tensor(int32)
**Q** = tensor(int8)
**S** = tensor(float)| |QOrderedGelu|*in* X:**Q**
*in* scale_X:**S**
*in* scale_Y:**S**
*out* Y:**Q**|1+|**Q** = tensor(int8)
**S** = tensor(float)| |QOrderedLayerNormalization|*in* X:**Q**
*in* scale_X:**S**
*in* scale:**F**
*in* B:**F**
*in* scale_Y:**S**
*out* Y:**Q**|1+|**F** = tensor(float), tensor(float16)
**Q** = tensor(int8)
**S** = tensor(float)| |QOrderedLongformerAttention|*in* input:**Q**
*in* scale_input:**S**
*in* weight:**Q**
*in* scale_weight:**S**
*in* bias:**S**
*in* scale_bias:**S**
*in* scale_qkv_gemm:**S**
*in* mask:**F**
*in* global_weight:**Q**
*in* scale_global_weight:**S**
*in* global_bias:**S**
*in* scale_global_gemm:**S**
*in* global:**G**
*in* scale_output:**S**
*out* output:**Q**|1+|**F** = tensor(float16)
**G** = tensor(int32)
**Q** = tensor(int8)
**S** = tensor(float)| @@ -1159,7 +1160,7 @@ Do not modify directly.* | | | | |**Operator Domain:** *com.microsoft*|||| -|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* extra_add:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| +|Attention|*in* input:**T**
*in* weights:**T**
*in* bias:**T**
*in* mask_index:**M**
*in* past:**T**
*in* relative_position_bias:**T**
*in* past_sequence_length:**M**
*out* output:**T**
*out* present:**T**|1+|**M** = tensor(int32)
**T** = tensor(float), tensor(float16)| |BiasGelu|*in* A:**T**
*in* B:**T**
*out* C:**T**|1+|**T** = tensor(float), tensor(float16)| |ConvTransposeWithDynamicPads|*in* X:**T**
*in* W:**T**
*in* Pads:**tensor(int64)**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| |DequantizeLinear|*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|1+|**T1** = tensor(float)
**T2** = tensor(uint8)| diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc index 47db3fe558ce8..6aa0e726afe1b 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc @@ -198,7 +198,7 @@ Status Attention::Compute(OpKernelContext* context) const { const Tensor* mask_index = context->Input(3); const Tensor* past = context->Input(4); - const Tensor* extra_add_qk = context->Input(5); + const Tensor* relative_position_bias = context->Input(5); const TensorShape& weights_shape = (weights ? weights->Shape() : weight_shape_); @@ -208,7 +208,7 @@ Status Attention::Compute(OpKernelContext* context) const { bias->Shape(), mask_index, past, - extra_add_qk, + relative_position_bias, ¶meters)); const int batch_size = parameters.batch_size; @@ -331,7 +331,7 @@ Status Attention::Compute(OpKernelContext* context) const { return ApplyAttention(Q, K, V, mask_index, past, output, batch_size, sequence_length, parameters.head_size, parameters.v_head_size, parameters.v_hidden_size, - extra_add_qk, context); + relative_position_bias, context); } } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc index affe7cab1d858..e75f68ea53c7c 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc @@ -12,7 +12,7 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, const TensorShape& bias_shape, const Tensor*& mask_index, const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const Tensor* past_seq_len) const { // Abbreviation and Meanings: @@ -37,7 +37,7 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, // bias (Q/K/V) : (D + D + D_v) // mask_index : see below // past (K/V) : (2, B, N, P, H) or NULL - // extra_add_qk : (B, N, S, T) or NULL + // relative_position_bias : (B, N, S, T) or NULL // For mask_index, the following shapes are supported: // NULL, (B, 1), (1, 1) @@ -49,9 +49,9 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, // When a model is pruned (like some attention heads are removed in Q/K/V), input_hidden_size could be larger // than hidden dimension of Q, K and V. 
- if (past != nullptr && extra_add_qk != nullptr) { - // past is used on GPT-2 model with past state, we don't have a case for extra add qk yet - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Attention cannot have both past and extra_add_qk"); + if (past != nullptr && relative_position_bias != nullptr) { + // past is used on GPT-2 model with past state, we don't have a case for relative position bias yet + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Attention cannot have both past and relative_position_bias"); } const auto& dims = input_shape.GetDims(); @@ -191,34 +191,34 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, } } - if (extra_add_qk != nullptr) { - const auto& extra_add_qk_dims = extra_add_qk->Shape().GetDims(); + if (relative_position_bias != nullptr) { + const auto& relative_position_bias_dims = relative_position_bias->Shape().GetDims(); - if (extra_add_qk_dims.size() != 4) { + if (relative_position_bias_dims.size() != 4) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'extra_add_qk' is expected to have 4 dimensions, got ", - extra_add_qk_dims.size()); + "Input 'relative_position_bias' is expected to have 4 dimensions, got ", + relative_position_bias_dims.size()); } - if (extra_add_qk_dims[0] != batch_size) { + if (relative_position_bias_dims[0] != batch_size) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'extra_add_qk' dimension 0 should be same as batch_size, got ", - extra_add_qk_dims[0]); + "Input 'relative_position_bias' dimension 0 should be same as batch_size, got ", + relative_position_bias_dims[0]); } - if (extra_add_qk_dims[1] != num_heads_) { + if (relative_position_bias_dims[1] != num_heads_) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'extra_add_qk' dimension 1 should be same as number of heads, got ", - extra_add_qk_dims[1]); + "Input 'relative_position_bias' dimension 1 should be same as number of heads, got ", + relative_position_bias_dims[1]); } - if (extra_add_qk_dims[2] != sequence_length) { + if (relative_position_bias_dims[2] != sequence_length) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'extra_add_qk' dimension 2 should be same as sequence_length, got ", - extra_add_qk_dims[2]); + "Input 'relative_position_bias' dimension 2 should be same as sequence_length, got ", + relative_position_bias_dims[2]); } - if (extra_add_qk_dims[3] != total_sequence_length) { + if (relative_position_bias_dims[3] != total_sequence_length) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input 'extra_add_qk' dimension 3 should be same as total_sequence_length, got ", - extra_add_qk_dims[3]); + "Input 'relative_position_bias' dimension 3 should be same as total_sequence_length, got ", + relative_position_bias_dims[3]); } } @@ -320,7 +320,7 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, const TensorShape& bias_shape, const Tensor*& mask_index, const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const int max_threads_per_block, const Tensor* past_seq_len) const { @@ -328,7 +328,7 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_heads should be no larger than ", max_threads_per_block); } - return CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, extra_add_qk, parameters, past_seq_len); + return CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, 
relative_position_bias, parameters, past_seq_len); } Tensor* AttentionBase::GetPresent(OpKernelContext* context, diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_base.h index 2c49f196d52d8..2e077da2853d0 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.h @@ -18,7 +18,7 @@ class AttentionBase { const TensorShape& bias_shape, const Tensor*& mask_index, // Dummy mask of shape (1 or batch_size, 1) will be updated to nullptr. const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const int max_threads_per_block, // for CUDA const Tensor* past_seq_len = nullptr) const; @@ -61,7 +61,7 @@ class AttentionBase { const TensorShape& bias_shape, const Tensor*& mask_index, // Dummy mask of shape (1 or batch_size, 1) will be updated to nullptr. const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const Tensor* past_seq_len = nullptr) const; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index 0185fa9ea09a0..70d71ffb6ee40 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -19,18 +19,18 @@ class AttentionCPUBase : public AttentionBase { : AttentionBase(info, require_same_hidden_size) {} template - Status ApplyAttention(const T* Q, // Q data with shape BxNxSxH - const T* K, // K data with shape BxNxSxH - const T* V, // V value with size BxNxSxH_v - const Tensor* mask_index, // mask index. nullptr if no mask or its size is B - const Tensor* past, // past state - Tensor* output, // output tensor - int batch_size, // batch size (B) - int sequence_length, // sequence length (S) - int qk_head_size, // head size of Q or K (H) - int v_head_size, // head size of V (H_v) - int v_hidden_size, // hidden size of V (D_v) - const Tensor* extra_add_qk, // extra add in QK. Its size is BxNxSxT + Status ApplyAttention(const T* Q, // Q data with shape BxNxSxH + const T* K, // K data with shape BxNxSxH + const T* V, // V value with size BxNxSxH_v + const Tensor* mask_index, // mask index. nullptr if no mask or its size is B + const Tensor* past, // past state + Tensor* output, // output tensor + int batch_size, // batch size (B) + int sequence_length, // sequence length (S) + int qk_head_size, // head size of Q or K (H) + int v_head_size, // head size of V (H_v) + int v_hidden_size, // hidden size of V (D_v) + const Tensor* relative_position_bias, // bias addition in QK. Its size is BxNxSxT OpKernelContext* context) const { const int kv_sequence_length = sequence_length; @@ -67,16 +67,16 @@ class AttentionCPUBase : public AttentionBase { const T* past_data = past != nullptr ? past->Data() : nullptr; T* present_data = present != nullptr ? present->MutableData() : nullptr; - const T* extra_add_qk_data = nullptr; - if (extra_add_qk != nullptr) { - extra_add_qk_data = extra_add_qk->Data(); + const T* relative_position_bias_data = nullptr; + if (relative_position_bias != nullptr) { + relative_position_bias_data = relative_position_bias->Data(); } ComputeAttentionProbs(static_cast(attention_probs), Q, K, mask_index_data, mask_index_dims, static_cast(mask_data), has_unidirectional, batch_size, sequence_length, past_sequence_length, qk_head_size == 0 ? 
v_head_size : qk_head_size, - past_data, present_data, tp, extra_add_qk_data); + past_data, present_data, tp, relative_position_bias_data); // Compute the attentionScore * Value: out_tmp(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) auto out_tmp_data = @@ -112,7 +112,7 @@ class AttentionCPUBase : public AttentionBase { const T* past, // past state T* present, // present state ThreadPool* tp, // thread pool - const T* extra_add_qk_data // extra add matrix with shape BxNxSxT + const T* relative_position_bias_data // bias addition matrix with shape BxNxSxT ) const { const int total_sequence_length = past_sequence_length + sequence_length; // T = P + L const size_t past_chunk_length = static_cast(past_sequence_length) * head_size; // P x H @@ -175,9 +175,9 @@ class AttentionCPUBase : public AttentionBase { } } - if (extra_add_qk_data != nullptr) { + if (relative_position_bias_data != nullptr) { for (int j = 0; j < sequence_length * total_sequence_length; j++) { - output[j] += extra_add_qk_data[output_offset + j]; + output[j] += relative_position_bias_data[output_offset + j]; } } } diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h index 8c3af05972c95..ee1720b9f43bb 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h @@ -17,6 +17,7 @@ Status CheckInputs(const T* query, const T* value, const T* bias, const T* key_padding_mask, + const T* relative_position_bias, void* parameters, int num_heads, float mask_filter_value, @@ -26,6 +27,7 @@ Status CheckInputs(const T* query, // value (V) : (B, L, D_v) // bias (Q/K/V) : (D + D + D_v) // key_padding_mask (K/V) : (B) or (B, L) or None + // relative_position_bias : (B, 1, S, L) // When packed kv is used: // key (K) : (B, L, N, 2, H) // value (V) : None @@ -120,6 +122,36 @@ Status CheckInputs(const T* query, v_hidden_size = static_cast(value_dims[2]); } + if (relative_position_bias != nullptr) { + const auto& relative_position_bias_dims = relative_position_bias->Shape().GetDims(); + + if (relative_position_bias_dims.size() != 4) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'relative_position_bias' is expected to have 4 dimensions, got ", + relative_position_bias_dims.size()); + } + if (relative_position_bias_dims[0] != batch_size && relative_position_bias_dims[0] != 1) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'relative_position_bias' dimension 0 should be batch_size or 1, got ", + relative_position_bias_dims[0]); + } + if (relative_position_bias_dims[1] != num_heads) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'relative_position_bias' dimension 1 should be same as number of heads, got ", + relative_position_bias_dims[1]); + } + if (relative_position_bias_dims[2] != sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'relative_position_bias' dimension 2 should be same as sequence_length, got ", + relative_position_bias_dims[2]); + } + if (relative_position_bias_dims[3] != kv_sequence_length) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Input 'relative_position_bias' dimension 3 should be same as total_sequence_length, got ", + relative_position_bias_dims[3]); + } + } + if (parameters != nullptr) { AttentionParameters* output_parameters = reinterpret_cast(parameters); output_parameters->batch_size = batch_size; diff --git 
a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc index 64c17b7767e4f..e7df84c1b0066 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc @@ -160,7 +160,7 @@ Status QAttention::Compute(OpKernelContext* context) const { bias->Shape(), mask_index, past_tensor, - nullptr, // extra_add_qk + nullptr, // relative_position_bias nullptr // parameters )); diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index 4a6d2dc137139..1ab89b525eae5 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -59,7 +59,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { const Tensor* bias = context->Input(2); const Tensor* mask_index = context->Input(3); const Tensor* past = context->Input(kPastInputIndex); - const Tensor* extra_add_qk = context->Input(5); + const Tensor* relative_position_bias = context->Input(5); const Tensor* past_seq_len = context->Input(kPastSequenceLengthInputIndex); auto& device_prop = GetDeviceProp(); @@ -69,7 +69,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { bias->Shape(), mask_index, past, - extra_add_qk, + relative_position_bias, ¶meters, device_prop.maxThreadsPerBlock, past_seq_len)); @@ -105,7 +105,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { bool is_mask_2d_key_padding = parameters.mask_type == AttentionMaskType::MASK_2D_KEY_PADDING; bool use_causal_fused_runner = !disable_fused_runner_ && (nullptr == mask_index || is_mask_1d_seq_len || is_mask_2d_key_padding) && - nullptr == extra_add_qk && + nullptr == relative_position_bias && parameters.past_sequence_length == 0 && parameters.hidden_size == parameters.v_hidden_size && FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, @@ -125,7 +125,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { (nullptr == mask_index || is_mask_1d_seq_len) && nullptr == past && nullptr == present && - nullptr == extra_add_qk && + nullptr == relative_position_bias && parameters.hidden_size == parameters.v_hidden_size && FusedMHARunnerFP16v2::is_supported(sm, parameters.head_size, sequence_length, enable_trt_flash_attention_, false); @@ -151,7 +151,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == mask_index && // TODO: support 1D mask nullptr == past && nullptr == present && - nullptr == extra_add_qk && + nullptr == relative_position_bias && (sizeof(T) == 2 || // sequence length threshold is 0 in FP16 parameters.sequence_length >= attention::kMinSequenceLengthForMemoryEfficientAttentionFp32) && has_memory_efficient_attention(sm, sizeof(T) == 2); @@ -203,7 +203,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { data.mask_index = (nullptr == mask_index) ? nullptr : mask_index->Data(); data.mask_index_dims = (nullptr == mask_index) ? gsl::span() : mask_index->Shape().GetDims(); data.past = (nullptr == past) ? nullptr : reinterpret_cast(past->Data()); - data.extra_add_qk = (nullptr == extra_add_qk) ? nullptr : reinterpret_cast(extra_add_qk->Data()); + data.relative_position_bias = (nullptr == relative_position_bias) ? 
nullptr : reinterpret_cast(relative_position_bias->Data()); data.workspace = reinterpret_cast(work_space.get()); data.output = reinterpret_cast(output->MutableData()); data.present = (nullptr == present) ? nullptr : reinterpret_cast(present->MutableData()); diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 8c7ef9f919519..fcf86637350b6 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -665,7 +665,7 @@ Status QkvToContext( T* persistent_softmax_workspace = scratch1; // replace Q*K' in place with masked score for persistent softmax. ORT_RETURN_IF_ERROR( ComputeSoftmaxWithRawMask(stream, total_sequence_length, sequence_length, batch_size, num_heads, - mask_index, nullptr, data.extra_add_qk, scratch1, scratch2, + mask_index, nullptr, data.relative_position_bias, scratch1, scratch2, parameters.is_unidirectional, scale, mask_dimension, parameters.max_sequence_length, use_persistent_softmax, persistent_softmax_workspace, mask_filter_value)); @@ -675,10 +675,10 @@ Status QkvToContext( const int* mask_start = (mask_index_dims[0] > batch_size) ? mask_index + batch_size : nullptr; ORT_RETURN_IF_ERROR(ComputeSoftmaxWithMask1D( stream, total_sequence_length, sequence_length, batch_size, num_heads, - mask_index, mask_start, data.extra_add_qk, scratch1, scratch2, parameters.is_unidirectional)); + mask_index, mask_start, data.relative_position_bias, scratch1, scratch2, parameters.is_unidirectional)); } else { // no mask ORT_RETURN_IF_ERROR( - ComputeSoftmax(stream, total_sequence_length, sequence_length, batch_size, num_heads, data.extra_add_qk, + ComputeSoftmax(stream, total_sequence_length, sequence_length, batch_size, num_heads, data.relative_position_bias, scratch1, scratch2, parameters.is_unidirectional)); } diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h index d98a0380c479b..2ecda71479c52 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h @@ -41,7 +41,7 @@ struct AttentionData { const int* mask_index; gsl::span mask_index_dims; const T* past; - const T* extra_add_qk; + const T* relative_position_bias; T* workspace; T* output; diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h index 16b3cf053b586..92851c446d48f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h +++ b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h @@ -377,11 +377,7 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int all_sequence_length, float thread_data = -CUDART_INF_F; if (threadIdx.x < all_sequence_length) { - if (add_before_softmax == nullptr) { - thread_data = float(input[index]) * rsqrt_head_size; - } else { - thread_data = float(input[index] + add_before_softmax[index]) * rsqrt_head_size; - } + thread_data = float(input[index]) * rsqrt_head_size; const int sequence_index = blockIdx.x % sequence_length; if (is_unidirectional) { @@ -412,6 +408,10 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int all_sequence_length, thread_data = -CUDART_INF_F; } } + + if (add_before_softmax != nullptr) { + thread_data += float(add_before_softmax[index]); + } } if (skip_softmax) { diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index 93e5e59ed00ae..57a3a310a0dd6 100644 --- 
a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -62,6 +62,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* value = context->Input(2); const Tensor* bias = context->Input(3); const Tensor* key_padding_mask = context->Input(4); + const Tensor* relative_position_bias = context->Input(5); auto& device_prop = GetDeviceProp(); AttentionParameters parameters; @@ -70,6 +71,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { value, bias, key_padding_mask, + relative_position_bias, ¶meters, num_heads_, mask_filter_value_, @@ -94,6 +96,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { bool use_fused_cross_attention = !disable_fused_cross_attention_ && nullptr == key_padding_mask && + nullptr == relative_position_bias && (value != nullptr || bias == nullptr) && // TODO: new kernel for adding bias to packed KV parameters.hidden_size == parameters.v_hidden_size && has_fused_cross_attention_kernel(sm, parameters.head_size, @@ -112,6 +115,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { bool use_fused_runner = !disable_fused_runner_ && fused_cross_attention_kernel == nullptr && + nullptr == relative_position_bias && value != nullptr && // fused runner requires packed qkv instead of packed kv (nullptr == key_padding_mask || is_mask_1d_seq_len) && parameters.hidden_size == parameters.v_hidden_size && @@ -143,6 +147,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { !disable_memory_efficient_attention_ && is_long_sequence && nullptr == key_padding_mask && // TODO: support 1D mask + nullptr == relative_position_bias && has_memory_efficient_attention(sm, sizeof(T) == 2); #else constexpr bool use_memory_efficient_attention = false; @@ -171,7 +176,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { data.mask_index = (nullptr == key_padding_mask) ? nullptr : key_padding_mask->Data(); data.mask_index_dims = (nullptr == key_padding_mask) ? gsl::span() : key_padding_mask->Shape().GetDims(); data.past = nullptr; - data.extra_add_qk = nullptr; + data.relative_position_bias = (nullptr == relative_position_bias) ? 
nullptr : reinterpret_cast(relative_position_bias->Data()); data.workspace = reinterpret_cast(work_space.get()); data.output = reinterpret_cast(output->MutableData()); data.present = nullptr; diff --git a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc index af13efe0e2fbc..111fed04639e7 100644 --- a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc +++ b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.cc @@ -3,7 +3,15 @@ #include "core/providers/cuda/cuda_common.h" #include "relative_attn_bias.h" +#include "core/common/safeint.h" #include "relative_attn_bias_impl.h" +#include "core/providers/cuda/shared_inc/fpgeneric.h" +#include "contrib_ops/cuda/bert/add_bias_transpose.h" + +using namespace onnxruntime::cuda; +using namespace ::onnxruntime::common; +using namespace ONNX_NAMESPACE; + namespace onnxruntime { namespace contrib { @@ -20,7 +28,16 @@ namespace cuda { .InputMemoryType(OrtMemTypeCPUInput, 1) \ .InputMemoryType(OrtMemTypeCPUInput, 2) \ .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ - RelPosAttnBias); + RelPosAttnBias); \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + GatedRelativePositionBias, \ + kMSDomain, \ + 1, \ + T, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + GatedRelativePositionBias); REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(MLFloat16) @@ -69,6 +86,108 @@ Status RelPosAttnBias::ComputeInternal(OpKernelContext* context) const { device_prop.maxThreadsPerBlock); } +template +GatedRelativePositionBias::GatedRelativePositionBias(const OpKernelInfo& info) : CudaKernel(info) { + int64_t num_heads = 0; + ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0); + num_heads_ = SafeInt(num_heads); +} + +template +Status GatedRelativePositionBias::ComputeInternal(OpKernelContext* context) const { + const Tensor& query_tensor = *context->Input(0); + const Tensor& query_bias_tensor = *context->Input(1); + const Tensor& rel_pos_tensor = *context->Input(2); + const Tensor& weight_tensor = *context->Input(3); + const Tensor& bias_tensor = *context->Input(4); + const Tensor& eco_a_tensor = *context->Input(5); + + const auto& query_dims = query_tensor.Shape().GetDims(); + ORT_ENFORCE(query_dims.size() == 3); + ORT_ENFORCE(query_dims[2] > 0); + ORT_ENFORCE(query_dims[2] % num_heads_ == 0); + const auto batch_size = SafeInt(query_dims[0]); + const auto seq_len = SafeInt(query_dims[1]); + const auto head_size = SafeInt(query_dims[2] / num_heads_); + + ORT_ENFORCE(query_bias_tensor.Shape().NumDimensions() == 1); + ORT_ENFORCE(query_bias_tensor.Shape()[0] == query_dims[2]); + + const auto& rel_pos_dims = rel_pos_tensor.Shape().GetDims(); + ORT_ENFORCE(rel_pos_dims.size() == 4); + ORT_ENFORCE(rel_pos_dims[0] == 1); + ORT_ENFORCE(rel_pos_dims[1] == num_heads_); + ORT_ENFORCE(rel_pos_dims[2] == seq_len); + ORT_ENFORCE(rel_pos_dims[3] == seq_len); + + const auto& weight_dims = weight_tensor.Shape().GetDims(); + ORT_ENFORCE(weight_dims.size() == 2); + ORT_ENFORCE(weight_dims[0] == head_size); + ORT_ENFORCE((weight_dims[1] > 0) && (weight_dims[1] % 2 == 0)); + + ORT_ENFORCE(bias_tensor.Shape().NumDimensions() == 1); + ORT_ENFORCE(bias_tensor.Shape()[0] == weight_dims[1]); + + const auto D = SafeInt(weight_dims[1]); + + const auto& eco_a_dims = eco_a_tensor.Shape().GetDims(); + ORT_ENFORCE(eco_a_dims.size() == 4); + ORT_ENFORCE(eco_a_dims[0] == 1); + ORT_ENFORCE(eco_a_dims[1] == num_heads_); + ORT_ENFORCE(eco_a_dims[2] == 
1); + ORT_ENFORCE(eco_a_dims[3] == 1); + + Tensor* output = context->Output(0, {batch_size, num_heads_, seq_len, seq_len}); + + auto& device_prop = GetDeviceProp(); + cublasHandle_t cublas = GetCublasHandle(context); + + typedef typename ToCudaType::MappedType CudaT; + const auto BNS = batch_size * num_heads_ * seq_len; + const size_t elements_in_query = (size_t)BNS * (size_t)head_size; + const size_t elements_after_gemm = (size_t)BNS *(size_t)D; + size_t workspace_size = sizeof(T) * (elements_in_query + (seq_len < D) ? elements_after_gemm : (size_t)0); + auto workspace = GetScratchBuffer(workspace_size, context->GetComputeStream()); + + // format 1: BxSx(NH * total_matrix) => matrix_to_transpose * (BxNxSxH) + constexpr int format = 1; + constexpr int total_maxtrix = 1; + constexpr int num_matrix_to_transpose = 1; + LaunchAddBiasTranspose(Stream(context), num_matrix_to_transpose, format, device_prop.maxThreadsPerBlock, + batch_size, seq_len, num_heads_, head_size, + reinterpret_cast(query_tensor.template Data()), + reinterpret_cast(query_bias_tensor.template Data()), + reinterpret_cast(workspace.get()), + false, head_size, reinterpret_cast(static_cast(nullptr)), total_maxtrix); + + // reuse output if possible + CudaT* gemm_output = (seq_len < D) ? (reinterpret_cast(workspace.get()) + elements_in_query) + : reinterpret_cast(output->template MutableData()); + int ld_gemm_output = max(seq_len, D); + + const CudaT one = ToCudaType::FromFloat(1.0f); + const CudaT zero = ToCudaType::FromFloat(0.0f); + + // ([b*n*s, h] * [h, D]), CUDA assumes col-major + CUBLAS_RETURN_IF_ERROR(cublasGemmHelper( + cublas, CUBLAS_OP_N, CUBLAS_OP_N, + D, BNS, head_size, &one, + reinterpret_cast(weight_tensor.template Data()), (int)D, + reinterpret_cast(workspace.get()), (int)head_size, + &zero, gemm_output, ld_gemm_output, device_prop)); + + auto status = LaunchGatedRelativePositionBiasKernel( + device_prop, Stream(context), + reinterpret_cast(output->template MutableData()), + reinterpret_cast(rel_pos_tensor.template Data()), + reinterpret_cast(gemm_output), + reinterpret_cast(bias_tensor.template Data()), + reinterpret_cast(eco_a_tensor.template Data()), + batch_size, num_heads_, seq_len, D, ld_gemm_output); + + return status; +} + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.h b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.h index b9674f6f35091..3bf4e730e29f9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.h +++ b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias.h @@ -22,6 +22,18 @@ class RelPosAttnBias final : public CudaKernel { bool is_bidirectional_; }; +template +class GatedRelativePositionBias final : public CudaKernel { + public: + GatedRelativePositionBias(const OpKernelInfo& op_kernel_info); + + Status ComputeInternal(OpKernelContext* ctx) const override; + + private: + int num_heads_; +}; + + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.cu b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.cu index e333152cb5bcf..938496b058025 100644 --- a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.cu @@ -36,7 +36,7 @@ __global__ void buildRelativeAttentionBias(T* relative_attention_bias, const bool is_bidirectional, const int max_distance) { const int head_id = blockIdx.x; - for (int seq_id = threadIdx.x; seq_id < 
seq_len * seq_len; seq_id += blockDim.x * gridDim.y) { + for (int seq_id = blockDim.x * blockIdx.y + threadIdx.x; seq_id < seq_len * seq_len; seq_id += blockDim.x * gridDim.y) { int row_id = seq_id / seq_len; int col_id = seq_id % seq_len; @@ -149,6 +149,122 @@ template Status LaunchRelPosAttnBiasKernel(cudaStream_t stream, const bool is_bidirectional, const int max_threads_per_block); +template +__global__ void GatedRelativePositionBiasKernelSmallD( + T* output, // (batch_size, num_heads, seq_len, seq_len) + const T* rel_pos, // (1, num_heads, seq_len, seq_len) + const T* qw, // (batch_size, num_heads, seq_len, D) + const T* bias, // (D) + const T* eco_a, // (1, num_heads, 1, 1) + const int D, + const int ldqw) { + __shared__ float gate[1]; + + const int seq_len = gridDim.x; + const int num_heads = gridDim.y; + const int s = blockIdx.x; + const int n = blockIdx.y; + const int b = blockIdx.z; + + rel_pos += ((int64_t)n * seq_len + s) * seq_len; + output += ((int64_t)b * num_heads * seq_len + (int64_t)n * seq_len + s) * seq_len; + qw += ((int64_t)b * num_heads * seq_len + (int64_t)n * seq_len + s) * ldqw; + + float val = 0.0f; + if (threadIdx.x < D) { + val = (float)qw[threadIdx.x] + (bias ? (float)bias[threadIdx.x] : 0.0f); + } + + float u = (threadIdx.x < D / 2) ? val : 0.0f; +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + u += __shfl_down_sync(0xffffffff, u, offset); + } + + float r = (threadIdx.x >= D / 2) ? val : 0.0f; +#pragma unroll + for (int offset = 16; offset > 0; offset /= 2) { + r += __shfl_down_sync(0xffffffff, r, offset); + } + + if (threadIdx.x == 0) { + u = 1.0f / (1.0f + expf(-u)); + r = 1.0f / (1.0f + expf(-r)); + gate[0] = u * (r * (float)eco_a[n] - 1.0f) + 2.0f; + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < seq_len; idx += blockDim.x) { + output[idx] = (T)(gate[0] * (float)rel_pos[idx]); + } +} + +template +Status LaunchGatedRelativePositionBiasKernel( + const cudaDeviceProp& device_prop, + cudaStream_t stream, + T* output, + const T* rel_pos, + const T* qw, // query * weight + const T* bias, + const T* eco_a, + const int batch_size, + const int num_heads, + const int seq_len, + const int D, + const int ldqw) { + ORT_ENFORCE(D <= 32 && D > 0 && (D % 2 == 0)); + ORT_ENFORCE(ldqw == seq_len || ldqw == D); + + int tpb = std::max(32, std::max(D, seq_len)); + tpb = std::min(tpb, device_prop.maxThreadsPerBlock); + + // round up tpb to power of 2 + --tpb; + tpb |= (tpb >> 1); + tpb |= (tpb >> 2); + tpb |= (tpb >> 4); + tpb |= (tpb >> 8); + tpb |= (tpb >> 16); + tpb++; + + dim3 block(tpb); + dim3 grid(seq_len, num_heads, batch_size); + + GatedRelativePositionBiasKernelSmallD<<>>( + output, rel_pos, qw, bias, eco_a, D, ldqw); + + return CUDA_CALL(cudaGetLastError()); +} + +template Status LaunchGatedRelativePositionBiasKernel( + const cudaDeviceProp& device_prop, + cudaStream_t stream, + float* output, + const float* rel_pos, + const float* qw, + const float* bias, + const float* eco_a, + const int batch_size, + const int num_heads, + const int seq_len, + const int D, + const int ldqw); + +template Status LaunchGatedRelativePositionBiasKernel( + const cudaDeviceProp& device_prop, + cudaStream_t stream, + half* output, + const half* rel_pos, + const half* qw, + const half* bias, + const half* eco_a, + const int batch_size, + const int num_heads, + const int seq_len, + const int D, + const int ldqw); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.h 
b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.h index 5a1a229ab6077..5c7c98f55f3f5 100644 --- a/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.h +++ b/onnxruntime/contrib_ops/cuda/bert/relative_attn_bias_impl.h @@ -22,6 +22,21 @@ Status LaunchRelPosAttnBiasKernel( const int max_threads_per_block ); +template +Status LaunchGatedRelativePositionBiasKernel( + const cudaDeviceProp& device_prop, + cudaStream_t stream, + T* output, + const T* rel_pos, + const T* qw, // from query * weight + const T* bias, + const T* eco_a, + const int batch_size, + const int num_heads, + const int seq_len, + const int D, + const int ldqw); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index a239e528af148..1254ccd7e1e17 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -32,6 +32,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, FusedMatMul); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RelativePositionBias); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, RelativePositionBias); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, GatedRelativePositionBias); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, GatedRelativePositionBias); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RemovePadding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, RemovePadding); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, RestorePadding); @@ -162,6 +164,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc index e5ea47a6a2a5b..7cd717efc9fba 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/attention_quantization.cc @@ -52,7 +52,7 @@ Status QAttention::CheckInputs(const Tensor* input, auto& device_prop = GetDeviceProp(); ORT_RETURN_IF_ERROR(AttentionBase::CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), mask_index, past_tensor, - nullptr, // extra_add_qk + nullptr, // relative_position_bias parameters, device_prop.maxThreadsPerBlock)); @@ -198,7 +198,7 @@ Status QAttention::ComputeInternal(OpKernelContext* context) const { data.mask_index = (nullptr == mask_index) ? nullptr : mask_index->Data(); data.mask_index_dims = (nullptr == mask_index) ? gsl::span() : mask_index->Shape().GetDims(); data.past = (nullptr == past_tensor) ? 
nullptr : reinterpret_cast(past_tensor->Data()); - data.extra_add_qk = nullptr; // add_qk is not supported in quantized attention + data.relative_position_bias = nullptr; // add_qk is not supported in quantized attention data.workspace = reinterpret_cast(work_space.get()); data.output = reinterpret_cast(output->MutableData()); data.present = (nullptr == present) ? nullptr : reinterpret_cast(present->MutableData()); diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc index 204c786cc2c5d..8122b2de5916b 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention.cc @@ -212,7 +212,7 @@ Status QOrderedAttention::ComputeInternal(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), merged_weights_shape, merged_bias_shape, mask_index, nullptr, // past - nullptr, // extra_add_qk + nullptr, // relative_position_bias nullptr, // parameters device_prop.maxThreadsPerBlock)); diff --git a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_input_enum.h b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_input_enum.h index 5fe62ef127800..5fb31be5fe86f 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_input_enum.h +++ b/onnxruntime/contrib_ops/cuda/quantization/qordered_ops/qordered_attention_input_enum.h @@ -17,4 +17,4 @@ DefineQOrderedAttentionInput(Scale_QK_Softmax, scale_QKT_softmax, 15), DefineQOrderedAttentionInput(Scale_Values_Gemm, scale_values_gemm, 16), DefineQOrderedAttentionInput(Mask_Index, mask_index, 17), DefineQOrderedAttentionInput(Past, past, 18), -DefineQOrderedAttentionInput(Extra_Add, extra_add, 19) +DefineQOrderedAttentionInput(relative_position_bias, relative_position_bias, 19) diff --git a/onnxruntime/contrib_ops/rocm/bert/attention.cc b/onnxruntime/contrib_ops/rocm/bert/attention.cc index 756919834aef8..afc9fd9237ed7 100644 --- a/onnxruntime/contrib_ops/rocm/bert/attention.cc +++ b/onnxruntime/contrib_ops/rocm/bert/attention.cc @@ -39,7 +39,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { const Tensor* bias = context->Input(2); const Tensor* mask_index = context->Input(3); const Tensor* past = context->Input(4); - const Tensor* extra_add_qk = context->Input(5); + const Tensor* relative_position_bias = context->Input(5); auto& device_prop = GetDeviceProp(); ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), @@ -47,7 +47,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { bias->Shape(), mask_index, past, - extra_add_qk, + relative_position_bias, nullptr, device_prop.maxThreadsPerBlock)); @@ -129,7 +129,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { nullptr == mask_index ? gsl::span() : mask_index->Shape().GetDims(), mask_filter_value_, nullptr == past ? nullptr : past->Data(), - nullptr == extra_add_qk ? nullptr : extra_add_qk->Data(), + nullptr == relative_position_bias ? nullptr : relative_position_bias->Data(), work_space.get(), output->MutableData(), nullptr == present ? 
nullptr : present->MutableData()); diff --git a/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu b/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu index 954a129be1c65..fa6cce6a64132 100644 --- a/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/rocm/bert/attention_impl.cu @@ -89,7 +89,7 @@ Status QkvToContext( bool is_unidirectional, int past_sequence_length, const T* past, - const T* extra_add_qk, + const T* relative_position_bias, T* present, bool use_persistent_softmax) { const int all_sequence_length = past_sequence_length + sequence_length; @@ -158,7 +158,7 @@ Status QkvToContext( T* persistent_softmax_workspace = scratch1; // replace Q*K' in place if persistent softmax is selected. ORT_RETURN_IF_ERROR( ComputeSoftmaxWithRawMask(stream, all_sequence_length, sequence_length, batch_size, num_heads, - mask_index, nullptr, extra_add_qk, scratch1, scratch2, + mask_index, nullptr, relative_position_bias, scratch1, scratch2, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax, persistent_softmax_workspace, mask_filter_value)); } else if (nullptr != mask_index) { // 1d mask index @@ -166,10 +166,10 @@ Status QkvToContext( // mask_index has 1D shape: either (batch_size) or (2*batch_size). Only the later one has start postions. const int* mask_start = (mask_index_dims[0] > batch_size) ? mask_index + batch_size : nullptr; ORT_RETURN_IF_ERROR(ComputeSoftmaxWithMask1D(stream, all_sequence_length, sequence_length, batch_size, num_heads, - mask_index, mask_start, extra_add_qk, scratch1, scratch2, is_unidirectional)); + mask_index, mask_start, relative_position_bias, scratch1, scratch2, is_unidirectional)); } else { // no mask ORT_RETURN_IF_ERROR(ComputeSoftmax(stream, all_sequence_length, sequence_length, batch_size, num_heads, - extra_add_qk, scratch1, scratch2, is_unidirectional)); + relative_position_bias, scratch1, scratch2, is_unidirectional)); } // compute P*V (as V*P), and store in scratch3: BxNxSxH @@ -206,7 +206,7 @@ Status LaunchAttentionKernel( gsl::span mask_index_dims, const float mask_filter_value, const void* past, - const void* extra_add_qk, + const void* relative_position_bias, void* workspace, void* output, void* present) { @@ -225,7 +225,7 @@ Status LaunchAttentionKernel( is_unidirectional, past_sequence_length, reinterpret_cast(past), - reinterpret_cast(extra_add_qk), + reinterpret_cast(relative_position_bias), reinterpret_cast<__half*>(present), use_persistent_softmax); } else { @@ -240,7 +240,7 @@ Status LaunchAttentionKernel( is_unidirectional, past_sequence_length, reinterpret_cast(past), - reinterpret_cast(extra_add_qk), + reinterpret_cast(relative_position_bias), reinterpret_cast(present), use_persistent_softmax); } diff --git a/onnxruntime/contrib_ops/rocm/bert/attention_impl.h b/onnxruntime/contrib_ops/rocm/bert/attention_impl.h index 7db692083f5e5..fdc46ce2e7729 100644 --- a/onnxruntime/contrib_ops/rocm/bert/attention_impl.h +++ b/onnxruntime/contrib_ops/rocm/bert/attention_impl.h @@ -42,7 +42,7 @@ Status LaunchAttentionKernel( gsl::span mask_index_dims, // Mask index shape const float mask_filter_value, // Mask value for filtered out positions const void* past, // Past state input - const void* extra_add_qk, // Additional Add + const void* relative_position_bias, // Additional Add void* workspace, // Temporary buffer void* output, // Output tensor void* present // Present state output diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc 
b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index 68e3985651123..6b00ac94bc10f 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -243,7 +243,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "T", OpSchema::Optional) .Input(5, - "extra_add", + "relative_position_bias", "additional add to QxK' with shape (batch_size, num_heads, sequence_length, total_sequence_length)", "T", OpSchema::Optional) @@ -313,6 +313,12 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "Key padding mask with shape (batch_size) or (batch_size, kv_sequence_length)", "M", OpSchema::Optional) + .Input(5, + "relative_position_bias", + "relative position bias: addition to QxK' with shape (batch_size, num_heads, sequence_length, total_sequence_length)" + " or (1, num_heads, sequence_length, total_sequence_length)", + "T", + OpSchema::Optional) .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, v_hidden_size)", @@ -668,5 +674,41 @@ ONNX_MS_OPERATOR_SET_SCHEMA( RestorePaddingTypeAndShapeInference(ctx); })); +constexpr const char* GatedRelativePositionBias_ver1_doc = R"DOC( + query_layer = (query_layer + query_bias).reshape(batch_size, seq_len, num_heads, head_size).transpose(1, 2) + gate_u, gate_r = torch.sigmoid( + self.gate_ur_linear(query_layer).view(batch_size, num_head, seq_len, 2, D/2).sum(-1, keepdim=False) + ).chunk(2, dim=-1) + gate_u_1 = gate_u * (gate_r * self.eco_a - 1.0) + 2.0 + rel_pos_bias = gate_u_1 * rel_pos +)DOC"; + +ONNX_MS_OPERATOR_SET_SCHEMA( + GatedRelativePositionBias, 1, + OpSchema() + .SetDoc(GatedRelativePositionBias_ver1_doc) + .Attr("num_heads", "Number of attention heads", AttributeProto::INT) + .Input(0, "query_layer", "tensor with shape (batch_size, seq_len, num_heads x head_size)", "T") + .Input(1, "query_bias", "1-d tensor with shape (num_heads x head_size)", "T") + .Input(2, "rel_pos", "tensor with shape (1, num_head, seq_len, seq_len)", "T") + .Input(3, "weight", "gemm weight for the gated_ur_linear, shape (head_size, D), D is divisible by 2", "T") + .Input(4, "bias", "bias for the gated_ur_linear, shape (D)", "T") + .Input(5, "eco_a", "tensor of shape (1, num_heads, 1, 1)", "T") + .Output(0, "output", "output tensor with shape (batch_size, num_heads, seq_len, seq_len)", "T") + .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 0, 0); + int64_t num_heads = getAttribute(ctx, "num_heads", -1L); + if (hasInputShape(ctx, 0)) { + auto& query_layer_shape = getInputShape(ctx, 0); + TensorShapeProto output_shape; + *output_shape.add_dim() = query_layer_shape.dim(0); + output_shape.add_dim()->set_dim_value(num_heads); + *output_shape.add_dim() = query_layer_shape.dim(1); + *output_shape.add_dim() = query_layer_shape.dim(1); + updateOutputShape(ctx, 0, output_shape); + } + })); + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index a511d01fe1624..bd8469909fe7f 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -81,6 +81,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, MurmurHash3); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, NGramRepeatBlock); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Pad); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, 
RelativePositionBias); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GatedRelativePositionBias); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, RemovePadding); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, RestorePadding); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Rfft); @@ -171,6 +172,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); + fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc index 6111afbd5d817..91e4f5d8ff81a 100644 --- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc @@ -1140,7 +1140,7 @@ where value of each element is the end position, or valid length of actual seque left-side padding, mask_index has shape (2 * batch_size), where the values are the exclusive end positions followed by the inclusive start positions. When unidirectional is 1, and each token only attend to previous tokens. For GPT-2, both past and present state are optional. Present state could appear in output even when past state is not in input. -Current version does not support past/present, extra_add and qkv_hidden_sizes. +Current version does not support past/present, relative_position_bias and qkv_hidden_sizes. TODO: Support them if needed in the future. )DOC"; @@ -1202,7 +1202,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Input(18, "past", "past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size).", "Q", OpSchema::Optional) - .Input(19, "extra_add", + .Input(19, "relative_position_bias", "additional add to QxK' with shape (batch_size, num_heads, sequence_length, sequence_length).", "S", OpSchema::Optional) .Output(0, "output", "3D output tensor with shape (batch_size, sequence_length, hidden_size)", "Q") diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.cc b/onnxruntime/core/providers/cpu/cpu_provider_shared.cc index b4a92019992b5..c0a75fc50b07e 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.cc +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.cc @@ -198,12 +198,12 @@ struct ProviderHostCPUImpl : ProviderHostCPU { const TensorShape& bias_shape, const Tensor*& mask_index, const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const int max_threads_per_block, const Tensor* past_seq_len) override { return p->contrib::AttentionBase::CheckInputs(input_shape, weights_shape, bias_shape, mask_index, past, - extra_add_qk, + relative_position_bias, parameters, max_threads_per_block, past_seq_len); diff --git a/onnxruntime/core/providers/cpu/cpu_provider_shared.h b/onnxruntime/core/providers/cpu/cpu_provider_shared.h index 2490789dd31a2..f12e080adf30a 100644 --- a/onnxruntime/core/providers/cpu/cpu_provider_shared.h +++ b/onnxruntime/core/providers/cpu/cpu_provider_shared.h @@ -145,7 +145,7 @@ struct ProviderHostCPU { const TensorShape& bias_shape, const Tensor*& mask_index, const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const int max_threads_per_block, const Tensor* past_seq_len) = 0; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp index 63bae80c51a67..af93808248032 100644 --- 
a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorAttention.cpp @@ -401,15 +401,15 @@ class DmlOperatorAttention : public DmlOperator void CALLBACK QueryAttention(IMLOperatorSupportQueryContextPrivate* context, /*out*/ bool* isSupported) { *isSupported = false; - // Fall back to CPU if input 'past' and 'extra_add' is present because there is no current use case for this. + // Fall back to CPU if input 'past' and 'relative_position_bias' is present because there is no current use case for this. // and it will make the implementation more complex. // Also fall back to CPU if output 'present' is present for same reason as above. if (context->GetInputCount() > 4 || context->GetOutputCount() > 1) { return; } - // Checking input count alone is not sufficient to fallback to CPU if input 'past' and 'extra_add' is present - // because input 'mask_index', 'past', and 'extra_add' all are optional. + // Checking input count alone is not sufficient to fallback to CPU if input 'past' and 'relative_position_bias' is present + // because input 'mask_index', 'past', and 'relative_position_bias' all are optional. if (context->IsInputValid(4) || context->IsInputValid(5)) { return; diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index b772fe95d6ecc..30be10ea7e15f 100644 --- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -563,12 +563,12 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, const TensorShape& bias_shape, const Tensor*& mask_index, const Tensor* past, - const Tensor* extra_add_qk, + const Tensor* relative_position_bias, void* parameters, const int max_threads_per_block, const Tensor* past_seq_len) const { return g_host_cpu.AttentionBase__CheckInputs(this, input_shape, weights_shape, bias_shape, - mask_index, past, extra_add_qk, parameters, + mask_index, past, relative_position_bias, parameters, max_threads_per_block, past_seq_len); } Tensor* AttentionBase::GetPresent(OpKernelContext* context, const Tensor* past, int batch_size, int head_size, diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 245ea9322ad61..342d43306e699 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -337,7 +337,7 @@ def create_attention_node( # For MultiHeadAttention operator, use separated inputs for query, key and value, and no weights. 
if self.use_multi_head_attention: if add_qk_str is not None: - logger.debug("MultiHeadAttention does not support extra_add_qk: cannot fuse the attention.") + logger.debug("MultiHeadAttention does not support relative_position_bias: cannot fuse the attention.") return None attention_inputs = [ diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 42fd4d5909a30..276a9428ecf72 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -327,7 +327,7 @@ def match_parent_path( self, node, parent_op_types, - parent_input_index, + parent_input_index=None, output_name_to_node=None, return_indice=None, ): @@ -347,7 +347,8 @@ def match_parent_path( Returns: parents: a list of matched parent node. """ - assert len(parent_input_index) == len(parent_op_types) + if parent_input_index is not None: + assert len(parent_input_index) == len(parent_op_types) if output_name_to_node is None: output_name_to_node = self.output_name_to_node() @@ -358,16 +359,19 @@ def match_parent_path( matched_parent = self.match_parent( current_node, op_type, - parent_input_index[i], + parent_input_index[i] if parent_input_index is not None else None, output_name_to_node, exclude=[], return_indice=return_indice, ) if matched_parent is None: - logger.debug( - f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}", - stack_info=True, - ) + if parent_input_index is not None: + logger.debug( + f"Failed to match index={i} parent_input_index={parent_input_index[i]} op_type={op_type}", + stack_info=True, + ) + else: + logger.debug(f"Failed to match index={i} op_type={op_type}", stack_info=True) return None matched_parents.append(matched_parent) diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index dc8f6810914a7..85e510a828990 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -172,8 +172,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): add = k_nodes[-2] matmul = k_nodes[-1] - extra_add_qk_nodes = self.model.match_parent_path(add_qk, ["Reshape", "Where"], [1, 0]) - if extra_add_qk_nodes is None: + relative_position_bias_nodes = self.model.match_parent_path(add_qk, ["Reshape", "Where"], [1, 0]) + if relative_position_bias_nodes is None: return if matmul.input[0] == root_input: @@ -189,7 +189,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): self.hidden_size, root_input, attention_last_node.output[0], - extra_add_qk_nodes[0].input[0], + relative_position_bias_nodes[0].input[0], ) if new_node is None: return diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc index fb1d8fcfe451a..0ea85dfdaba4f 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test.cc @@ -59,7 +59,7 @@ static void RunAttentionTest( const bool disable_cuda = false, const bool disable_rocm = false, std::vector qkv_sizes = {}, - const std::vector& extra_add_data = {}, + const std::vector& relative_position_bias_data = {}, int kv_sequence_length = 0, bool past_present_share_buffer = false, bool use_scale = false) { @@ -199,12 +199,12 @@ static void RunAttentionTest( } } - std::vector extra_add_data_dims = {batch_size, number_of_heads, sequence_length, 
sequence_length}; - if (extra_add_data.size() > 0) { + std::vector relative_position_bias_data_dims = {batch_size, number_of_heads, sequence_length, sequence_length}; + if (relative_position_bias_data.size() > 0) { if (use_float16) { - tester.AddInput("extra_add_qk", extra_add_data_dims, ToFloat16(extra_add_data)); + tester.AddInput("relative_position_bias", relative_position_bias_data_dims, ToFloat16(relative_position_bias_data)); } else { - tester.AddInput("extra_add_qk", extra_add_data_dims, extra_add_data); + tester.AddInput("relative_position_bias", relative_position_bias_data_dims, relative_position_bias_data); } } else { if (use_float16) { @@ -264,7 +264,7 @@ static void RunAttentionTest( const bool disable_cuda = false, const bool disable_rocm = false, const std::vector qkv_sizes = {}, - const std::vector& extra_add_data = {}, + const std::vector& relative_position_bias_data = {}, int kv_sequence_length = 0, bool past_present_share_buffer = false, bool use_scale = false) { @@ -272,13 +272,13 @@ static void RunAttentionTest( batch_size, sequence_length, hidden_size, number_of_heads, use_float16, is_unidirectional, use_past_state, past_sequence_length, past_data, present_data, mask_type, input_hidden_size, max_sequence_length, - disable_cpu, disable_cuda, disable_rocm, qkv_sizes, extra_add_data, + disable_cpu, disable_cuda, disable_rocm, qkv_sizes, relative_position_bias_data, kv_sequence_length, past_present_share_buffer, use_scale); RunAttentionTest(input_data, weights_data, true, bias_data, mask_index_data, output_data, batch_size, sequence_length, hidden_size, number_of_heads, use_float16, is_unidirectional, use_past_state, past_sequence_length, past_data, present_data, mask_type, input_hidden_size, max_sequence_length, - disable_cpu, disable_cuda, disable_rocm, qkv_sizes, extra_add_data, + disable_cpu, disable_cuda, disable_rocm, qkv_sizes, relative_position_bias_data, kv_sequence_length, past_present_share_buffer, use_scale); } @@ -390,7 +390,7 @@ TEST(AttentionTest, AttentionBatch1WithQKVAttr2) { 0, false, false, disable_rocm, qkv_sizes); } -TEST(AttentionTest, AttentionBatch1ExtraAdd) { +TEST(AttentionTest, AttentionBatch1RelativePositionBias) { int batch_size = 1; int sequence_length = 2; int hidden_size = 4; @@ -414,7 +414,7 @@ TEST(AttentionTest, AttentionBatch1ExtraAdd) { std::vector mask_index_data = {2L}; - std::vector extra_add_qk = { + std::vector relative_position_bias = { 0.2f, -0.1f, 0.4f, 2.5f, 1.6f, -1.1f, 0.4f, -2.5f}; std::vector output_data = { @@ -427,10 +427,10 @@ TEST(AttentionTest, AttentionBatch1ExtraAdd) { RunAttentionTest(input_data, weight_data, bias_data, mask_index_data, output_data, batch_size, sequence_length, hidden_size, number_of_heads, false, false, false, 0, nullptr, nullptr, AttentionMaskType::MASK_1D_KEY_SEQ_LEN, 0, - 0, disable_cpu, disable_cuda, disable_rocm, qkv_sizes, extra_add_qk); + 0, disable_cpu, disable_cuda, disable_rocm, qkv_sizes, relative_position_bias); } -TEST(AttentionTest, AttentionBatch2ExtraAdd) { +TEST(AttentionTest, AttentionBatch2RelativePositionBias) { int batch_size = 2; int sequence_length = 2; int hidden_size = 4; @@ -456,7 +456,7 @@ TEST(AttentionTest, AttentionBatch2ExtraAdd) { std::vector mask_index_data = {2L, 2L}; - std::vector extra_add_qk = { + std::vector relative_position_bias = { 0.2f, -0.1f, 0.4f, 2.5f, 1.6f, -1.1f, 0.4f, -2.5f, 0.2f, -0.1f, 0.4f, 2.5f, 1.6f, -1.1f, 0.4f, -2.5f}; @@ -472,7 +472,7 @@ TEST(AttentionTest, AttentionBatch2ExtraAdd) { RunAttentionTest(input_data, weight_data, bias_data, 
mask_index_data, output_data, batch_size, sequence_length, hidden_size, number_of_heads, false, false, false, 0, nullptr, nullptr, AttentionMaskType::MASK_1D_KEY_SEQ_LEN, 0, - 0, disable_cpu, disable_cuda, disable_rocm, qkv_sizes, extra_add_qk); + 0, disable_cpu, disable_cuda, disable_rocm, qkv_sizes, relative_position_bias); } TEST(AttentionTest, AttentionBatch1_Float16) { @@ -1709,7 +1709,7 @@ TEST(AttentionTest, AttentionWithNormFactor) { use_float16, is_unidirectional, use_past_state, past_sequence_length, past_data, present_data, AttentionMaskType::MASK_2D_KEY_PADDING, 0 /*input_hidden_size*/, 0 /*max_sequence_length*/, false /*disable_cpu*/, false /*disable_cuda*/, true /*disable_rocm*/, {} /*qkv_sizes*/, - {} /*extra_add_data*/, 0 /*kv_sequence_length*/, false /*past_present_share_buffer*/, + {} /*relative_position_bias_data*/, 0 /*kv_sequence_length*/, false /*past_present_share_buffer*/, true /*use_scale*/); } diff --git a/onnxruntime/test/contrib_ops/qordered_attention_test.cc b/onnxruntime/test/contrib_ops/qordered_attention_test.cc index cf14a7f9918c6..5257fbdb08809 100644 --- a/onnxruntime/test/contrib_ops/qordered_attention_test.cc +++ b/onnxruntime/test/contrib_ops/qordered_attention_test.cc @@ -278,7 +278,7 @@ TEST(QOrderedTest, Attention_WithData_ROW_ORDER) { test_qorder.AddInput("scale_values_gemm", {}, {attn_out_scale}, true); test_qorder.AddInput("mask_index", {batch_size, sequence_len}, input_mask.data(), input_mask.size()); test_qorder.AddOptionalInputEdge(); // past - test_qorder.AddOptionalInputEdge(); // extra_add + test_qorder.AddOptionalInputEdge(); // relative_position_bias test_qorder.AddOutput("output", {batch_size, sequence_len, hidden_size}, attn_out_q8.data(), attn_out_q8.size()); diff --git a/onnxruntime/test/contrib_ops/relative_attention_bias_test.cc b/onnxruntime/test/contrib_ops/relative_attention_bias_test.cc index 7722291bee653..ba0299e4f3808 100644 --- a/onnxruntime/test/contrib_ops/relative_attention_bias_test.cc +++ b/onnxruntime/test/contrib_ops/relative_attention_bias_test.cc @@ -10,9 +10,9 @@ namespace onnxruntime { namespace test { static void RunRelativePositionBiasTest( - const std::vector& bias_table, // Shape = [num_buckets, num_heads] - const std::vector& sequence_length, // Shape = [1] - const std::vector& output_data, // Shape = [1, num_heads, sequence_length, sequence_length] + const std::vector& bias_table, // Shape = [num_buckets, num_heads] + const std::vector& sequence_length, // Shape = [1] + const std::vector& output_data, // Shape = [1, num_heads, sequence_length, sequence_length] int max_distance, int num_buckets, int num_heads, @@ -155,5 +155,264 @@ TEST(RelativePositionBiasTest, RelativePositionBiasTest_FP16_No_Bidirectional) { true); } +/***************Following scripts is used to generate test data, for your reference************* +import torch + +batch_size = 2 +num_heads = 2 +seq_len = 3 +head_size = 4 +D = 8 + +def dim_string_of(tensor): + return "{" + ", ".join([str(d) for d in tensor.shape]) + "}" + +def value_string_of(tensor): + arr = tensor.flatten().numpy() + lines = ["f, ".join([str(v) for v in arr[i : min(i+8, arr.size)]]) for i in range(0, arr.size, 8)] + return "{\n " + "f,\n ".join(lines) + "f}" + +def print_tensor(name, tensor): + print(f"const std::vector {name}_dim = {dim_string_of(tensor)};") + print(f"const std::vector {name} = {value_string_of(tensor)};") + +torch.manual_seed(0) +query_layer = torch.rand(batch_size, seq_len, num_heads * head_size) +query_bias = torch.rand(num_heads * head_size) +rel_pos 
= torch.rand(1, num_heads, seq_len, seq_len) +weight = torch.rand(head_size, D) +bias = torch.rand(D) +eco_a = torch.rand(1, num_heads, 1, 1) + +qw = (query_layer + query_bias).reshape(batch_size, seq_len, num_heads, head_size).transpose(1, 2) +gate_u,gate_r = torch.sigmoid( + (torch.matmul(qw, weight) + bias).view(batch_size, num_heads, seq_len,2, D//2).sum(-1, keepdim=False) + ).chunk(2, dim=-1) +gate_u_1 = gate_u * (gate_r * eco_a - 1.0) + 2.0 +output = gate_u_1 * rel_pos + +# output for test case +print(f"const int batch_size = {batch_size};") +print(f"const int num_heads = {num_heads};") +print(f"const int seq_len = {seq_len};") +print(f"const int head_size = {head_size};") +print(f"const int D = {D};") + +print_tensor("query_layer", query_layer) +print_tensor("query_bias", query_bias) +print_tensor("rel_pos", rel_pos) +print_tensor("weight", weight) +print_tensor("bias", bias) +print_tensor("eco_a", eco_a) +print_tensor("output", output) +****************/ + +// .Input(0, "query_layer", "tensor with shape (batch_size, seq_len, num_heads x head_size)", "T") +// .Input(1, "query_bias", "1-d tensor with shape (num_heads x head_size)", "T") +// .Input(2, "rel_pos", "tensor with shape (1, num_head, seq_len, seq_len)", "T") +// .Input(3, "weight", "gemm weight for the gated_ur_linear, shape (head_size, D), D is divisible by 2", "T") +// .Input(4, "bias", "bias for the gated_ur_linear, shape (D)", "T") +// .Input(5, "eco_a", "tensor of shape (1, num_heads, 1, 1)", "T") +// .Output(0, "output", "output tensor with shape (batch_size, num_heads, seq_len, seq_len)", "T") +static void RunGatedRelativePositionBiasTest( + const std::vector& query_layer, + const std::vector& query_bias, + const std::vector& rel_pos, + const std::vector& weight, + const std::vector& bias, + const std::vector& eco_a, + const std::vector& output, + int batch_size, + int seq_len, + int num_heads, + int head_size, + int D, + bool use_float16 = false) { + int min_cuda_architecture = use_float16 ? 
530 : 0; + + bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + if (enable_cuda) { + OpTester tester("GatedRelativePositionBias", 1, onnxruntime::kMSDomain); + tester.AddAttribute("num_heads", static_cast(num_heads)); + + std::vector query_layer_dims = {batch_size, seq_len, num_heads * head_size}; + std::vector query_bias_dims = {num_heads * head_size}; + std::vector rel_pos_dims = {1, num_heads, seq_len, seq_len}; + std::vector weight_dims = {head_size, D}; + std::vector bias_dims = {D}; + std::vector eco_a_dims = {1, num_heads, 1, 1}; + std::vector output_dims = {batch_size, num_heads, seq_len, seq_len}; + + if (use_float16) { + tester.AddInput("query_layer", query_layer_dims, ToFloat16(query_layer)); + tester.AddInput("query_bias", query_bias_dims, ToFloat16(query_bias)); + tester.AddInput("rel_pos", rel_pos_dims, ToFloat16(rel_pos)); + tester.AddInput("weight", weight_dims, ToFloat16(weight)); + tester.AddInput("bias", bias_dims, ToFloat16(bias)); + tester.AddInput("eco_a", eco_a_dims, ToFloat16(eco_a)); + tester.AddOutput("output", output_dims, ToFloat16(output)); + } else { + tester.AddInput("query_layer", query_layer_dims, query_layer); + tester.AddInput("query_bias", query_bias_dims, query_bias); + tester.AddInput("rel_pos", rel_pos_dims, rel_pos); + tester.AddInput("weight", weight_dims, weight); + tester.AddInput("bias", bias_dims, bias); + tester.AddInput("eco_a", eco_a_dims, eco_a); + tester.AddOutput("output", output_dims, output); + } + + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(GatedRelativePositionBiasTest, FP16_BSNHD_1x3x2x4x8) { + constexpr int batch_size = 1; + constexpr int num_heads = 2; + constexpr int seq_len = 3; + constexpr int head_size = 4; + constexpr int D = 8; + const std::vector query_layer_dim = {1, 3, 8}; + const std::vector query_layer = { + 0.4962566f, 0.7682218f, 0.08847743f, 0.13203049f, 0.30742282f, 0.6340787f, 0.4900934f, 0.89644474f, + 0.45562798f, 0.6323063f, 0.34889346f, 0.4017173f, 0.022325754f, 0.16885895f, 0.29388845f, 0.5185218f, + 0.6976676f, 0.8000114f, 0.16102946f, 0.28226858f, 0.68160856f, 0.915194f, 0.3970999f, 0.8741559f}; + const std::vector query_bias_dim = {8}; + const std::vector query_bias = { + 0.41940832f, 0.55290705f, 0.9527381f, 0.03616482f, 0.18523103f, 0.37341738f, 0.30510002f, 0.9320004f}; + const std::vector rel_pos_dim = {1, 2, 3, 3}; + const std::vector rel_pos = { + 0.17591017f, 0.26983356f, 0.15067977f, 0.031719506f, 0.20812976f, 0.929799f, 0.7231092f, 0.7423363f, + 0.5262958f, 0.24365824f, 0.58459234f, 0.03315264f, 0.13871688f, 0.242235f, 0.81546897f, 0.7931606f, + 0.27825248f, 0.4819588f}; + const std::vector weight_dim = {4, 8}; + const std::vector weight = { + 0.81978035f, 0.99706656f, 0.6984411f, 0.5675464f, 0.83524317f, 0.20559883f, 0.593172f, 0.112347245f, + 0.15345693f, 0.24170822f, 0.7262365f, 0.7010802f, 0.20382375f, 0.65105355f, 0.774486f, 0.43689132f, + 0.5190908f, 0.61585236f, 0.8101883f, 0.98009706f, 0.11468822f, 0.31676513f, 0.69650495f, 0.9142747f, + 0.93510365f, 0.9411784f, 0.5995073f, 0.06520867f, 0.54599625f, 0.18719733f, 0.034022927f, 0.94424623f}; + const std::vector bias_dim = {8}; + const std::vector bias = { + 0.8801799f, 0.0012360215f, 0.593586f, 0.41577f, 0.41771942f, 0.27112156f, 0.6922781f, 0.20384824f}; + const std::vector eco_a_dim = {1, 2, 1, 1}; + const std::vector eco_a = { + 0.68329567f, 0.75285405f}; + const 
std::vector output_dim = {1, 2, 3, 3}; + const std::vector output = { + 0.29608122f, 0.45416728f, 0.25361493f, 0.053390637f, 0.3503264f, 1.5650483f, 1.2171557f, 1.2495192f, + 0.88587445f, 0.42708054f, 1.0246648f, 0.05810945f, 0.2430356f, 0.4244021f, 1.428723f, 1.3902748f, + 0.48772895f, 0.84479123f}; + + RunGatedRelativePositionBiasTest(query_layer, query_bias, rel_pos, weight, bias, eco_a, output, + batch_size, seq_len, num_heads, head_size, D, true); +} + +TEST(GatedRelativePositionBiasTest, FP32_BSNHD_2x3x2x4x8) { + constexpr int batch_size = 2; + constexpr int num_heads = 2; + constexpr int seq_len = 3; + constexpr int head_size = 4; + constexpr int D = 8; + const std::vector query_layer_dim = {2, 3, 8}; + const std::vector query_layer = { + 0.4962566f, 0.7682218f, 0.08847743f, 0.13203049f, 0.30742282f, 0.6340787f, 0.4900934f, 0.89644474f, + 0.45562798f, 0.6323063f, 0.34889346f, 0.4017173f, 0.022325754f, 0.16885895f, 0.29388845f, 0.5185218f, + 0.6976676f, 0.8000114f, 0.16102946f, 0.28226858f, 0.68160856f, 0.915194f, 0.3970999f, 0.8741559f, + 0.41940832f, 0.55290705f, 0.9527381f, 0.03616482f, 0.18523103f, 0.37341738f, 0.30510002f, 0.9320004f, + 0.17591017f, 0.26983356f, 0.15067977f, 0.031719506f, 0.20812976f, 0.929799f, 0.7231092f, 0.7423363f, + 0.5262958f, 0.24365824f, 0.58459234f, 0.03315264f, 0.13871688f, 0.242235f, 0.81546897f, 0.7931606f}; + const std::vector query_bias_dim = {8}; + const std::vector query_bias = { + 0.27825248f, 0.4819588f, 0.81978035f, 0.99706656f, 0.6984411f, 0.5675464f, 0.83524317f, 0.20559883f}; + const std::vector rel_pos_dim = {1, 2, 3, 3}; + const std::vector rel_pos = { + 0.593172f, 0.112347245f, 0.15345693f, 0.24170822f, 0.7262365f, 0.7010802f, 0.20382375f, 0.65105355f, + 0.774486f, 0.43689132f, 0.5190908f, 0.61585236f, 0.8101883f, 0.98009706f, 0.11468822f, 0.31676513f, + 0.69650495f, 0.9142747f}; + const std::vector weight_dim = {4, 8}; + const std::vector weight = { + 0.93510365f, 0.9411784f, 0.5995073f, 0.06520867f, 0.54599625f, 0.18719733f, 0.034022927f, 0.94424623f, + 0.8801799f, 0.0012360215f, 0.593586f, 0.41577f, 0.41771942f, 0.27112156f, 0.6922781f, 0.20384824f, + 0.68329567f, 0.75285405f, 0.8579358f, 0.6869556f, 0.005132377f, 0.17565155f, 0.7496575f, 0.6046507f, + 0.10995799f, 0.21209025f, 0.97037464f, 0.83690894f, 0.28198743f, 0.3741576f, 0.023700953f, 0.49101293f}; + const std::vector bias_dim = {8}; + const std::vector bias = { + 0.123470545f, 0.11432165f, 0.4724502f, 0.5750725f, 0.29523486f, 0.7966888f, 0.19573045f, 0.95368505f}; + const std::vector eco_a_dim = {1, 2, 1, 1}; + const std::vector eco_a = { + 0.84264994f, 0.07835853f}; + const std::vector output_dim = {2, 2, 3, 3}; + const std::vector output = { + 1.0928818f, 0.20699267f, 0.28273466f, 0.44534987f, 1.3380982f, 1.2917475f, 0.3755537f, 1.1995932f, + 1.4270226f, 0.47112367f, 0.5597638f, 0.6641071f, 0.87368786f, 1.0569134f, 0.12367705f, 0.34158573f, + 0.75108063f, 0.98591405f, 1.0929474f, 0.2070051f, 0.28275162f, 0.4451845f, 1.3376014f, 1.2912678f, + 0.37552574f, 1.1995038f, 1.4269164f, 0.47112313f, 0.5597632f, 0.6641063f, 0.87367094f, 1.056893f, + 0.12367466f, 0.34158388f, 0.7510766f, 0.98590875f}; + + RunGatedRelativePositionBiasTest(query_layer, query_bias, rel_pos, weight, bias, eco_a, output, + batch_size, seq_len, num_heads, head_size, D, false); +} + +TEST(GatedRelativePositionBiasTest, FP32_LongSeq_BSNHD_2x5x2x4x4) { + constexpr int batch_size = 2; + constexpr int num_heads = 2; + constexpr int seq_len = 5; + constexpr int head_size = 4; + constexpr int D = 4; + const 
std::vector query_layer_dim = {2, 5, 8}; + const std::vector query_layer = { + 0.4962566f, 0.7682218f, 0.08847743f, 0.13203049f, 0.30742282f, 0.6340787f, 0.4900934f, 0.89644474f, + 0.45562798f, 0.6323063f, 0.34889346f, 0.4017173f, 0.022325754f, 0.16885895f, 0.29388845f, 0.5185218f, + 0.6976676f, 0.8000114f, 0.16102946f, 0.28226858f, 0.68160856f, 0.915194f, 0.3970999f, 0.8741559f, + 0.41940832f, 0.55290705f, 0.9527381f, 0.03616482f, 0.18523103f, 0.37341738f, 0.30510002f, 0.9320004f, + 0.17591017f, 0.26983356f, 0.15067977f, 0.031719506f, 0.20812976f, 0.929799f, 0.7231092f, 0.7423363f, + 0.5262958f, 0.24365824f, 0.58459234f, 0.03315264f, 0.13871688f, 0.242235f, 0.81546897f, 0.7931606f, + 0.27825248f, 0.4819588f, 0.81978035f, 0.99706656f, 0.6984411f, 0.5675464f, 0.83524317f, 0.20559883f, + 0.593172f, 0.112347245f, 0.15345693f, 0.24170822f, 0.7262365f, 0.7010802f, 0.20382375f, 0.65105355f, + 0.774486f, 0.43689132f, 0.5190908f, 0.61585236f, 0.8101883f, 0.98009706f, 0.11468822f, 0.31676513f, + 0.69650495f, 0.9142747f, 0.93510365f, 0.9411784f, 0.5995073f, 0.06520867f, 0.54599625f, 0.18719733f}; + const std::vector query_bias_dim = {8}; + const std::vector query_bias = { + 0.034022927f, 0.94424623f, 0.8801799f, 0.0012360215f, 0.593586f, 0.41577f, 0.41771942f, 0.27112156f}; + const std::vector rel_pos_dim = {1, 2, 5, 5}; + const std::vector rel_pos = { + 0.6922781f, 0.20384824f, 0.68329567f, 0.75285405f, 0.8579358f, 0.6869556f, 0.005132377f, 0.17565155f, + 0.7496575f, 0.6046507f, 0.10995799f, 0.21209025f, 0.97037464f, 0.83690894f, 0.28198743f, 0.3741576f, + 0.023700953f, 0.49101293f, 0.123470545f, 0.11432165f, 0.4724502f, 0.5750725f, 0.29523486f, 0.7966888f, + 0.19573045f, 0.95368505f, 0.84264994f, 0.07835853f, 0.37555784f, 0.5225613f, 0.57295054f, 0.61858714f, + 0.69621414f, 0.5299501f, 0.25603563f, 0.7365945f, 0.02037555f, 0.20364666f, 0.37483507f, 0.25644332f, + 0.32508332f, 0.09018916f, 0.39364243f, 0.6068782f, 0.17426711f, 0.47434032f, 0.8579254f, 0.44859987f, + 0.5138961f, 0.45686555f}; + const std::vector weight_dim = {4, 4}; + const std::vector weight = { + 0.6011907f, 0.81791973f, 0.9736231f, 0.81752795f, 0.97470677f, 0.46383917f, 0.050839245f, 0.2629614f, + 0.8404526f, 0.49675876f, 0.25147682f, 0.11684412f, 0.032073975f, 0.0779959f, 0.39858162f, 0.774203f}; + const std::vector bias_dim = {4}; + const std::vector bias = { + 0.77032053f, 0.017784059f, 0.811891f, 0.10874528f}; + const std::vector eco_a_dim = {1, 2, 1, 1}; + const std::vector eco_a = { + 0.39429486f, 0.29726368f}; + const std::vector output_dim = {2, 2, 5, 5}; + const std::vector output = { + 0.9534052f, 0.28073975f, 0.9410346f, 1.0368304f, 1.181549f, 0.94923383f, 0.0070919087f, 0.24271497f, + 1.0358753f, 0.8355051f, 0.15224966f, 0.29366368f, 1.3435968f, 1.158798f, 0.3904445f, 0.5147038f, + 0.03260383f, 0.67545396f, 0.16985025f, 0.15726471f, 0.64280313f, 0.7824283f, 0.40168867f, 1.0839535f, + 0.26630563f, 1.2391479f, 1.0948771f, 0.101813294f, 0.48797214f, 0.6789776f, 0.7492329f, 0.8089107f, + 0.91042155f, 0.6930023f, 0.3348113f, 0.95611423f, 0.026447866f, 0.2643374f, 0.48654333f, 0.3328685f, + 0.4239932f, 0.117630124f, 0.5134121f, 0.7915271f, 0.22728965f, 0.61497897f, 1.1122944f, 0.5816067f, + 0.6662628f, 0.59232306f, 0.95294285f, 0.2806036f, 0.9405782f, 1.0363276f, 1.1809759f, 0.95289487f, + 0.007119261f, 0.24365108f, 1.0398705f, 0.83872753f, 0.15201466f, 0.29321042f, 1.3415229f, 1.1570094f, + 0.38984182f, 0.51978874f, 0.032925934f, 0.682127f, 0.17152825f, 0.15881838f, 0.6571103f, 0.79984313f, + 0.4106292f, 1.1080796f, 
0.2722329f, 1.2398669f, 1.0955123f, 0.101872355f, 0.4882552f, 0.6793715f, + 0.7427765f, 0.8019401f, 0.9025762f, 0.6870305f, 0.33192614f, 0.9568577f, 0.026468432f, 0.26454294f, + 0.48692167f, 0.33312735f, 0.4217717f, 0.117013805f, 0.5107221f, 0.78737986f, 0.22609876f, 0.6166911f, + 1.1153911f, 0.5832259f, 0.6681177f, 0.59397215f}; + + RunGatedRelativePositionBiasTest(query_layer, query_bias, rel_pos, weight, bias, eco_a, output, + batch_size, seq_len, num_heads, head_size, D, false); +} + } // namespace test } // namespace onnxruntime From 8f34c8c8eda2890b502021283c463d0552e45b8d Mon Sep 17 00:00:00 2001 From: "Tang, Cheng" Date: Tue, 7 Feb 2023 13:47:48 -0800 Subject: [PATCH 25/68] Introduce collective ops to ort inference build (#14399) ### Description Introduce collective ops into onnxruntime inference build, including 1) AllReduce and AllGather schema in contrib op, controlled by USE_MPI flag 2) AllReduce and AllGather kernel in cuda EP, controlled by ORT_USE_NCCL flag ### Motivation and Context Enable the collective ops in onnxruntime inference build so we have the ability to run distributed inference with multiple GPUs. The original ncclAllReduce ops in training build require quite complex configurations, which is not suitable for inference case, and it already broken. so we introduce a new implementation. --------- Co-authored-by: Cheng Tang --- cmake/CMakeLists.txt | 157 ++++++------- cmake/onnxruntime_framework.cmake | 6 +- cmake/onnxruntime_providers.cmake | 20 +- cmake/onnxruntime_rocm_hipify.cmake | 3 + .../contrib_ops/cuda/collective/mpi_include.h | 28 +++ .../cuda/collective/nccl_kernels.cc | 151 ++++++++++++ .../cuda/collective/nccl_kernels.h | 74 ++++++ .../contrib_ops/cuda/cuda_contrib_kernels.cc | 51 ++-- .../contrib_ops/rocm/rocm_contrib_kernels.cc | 217 +++++++++--------- .../core/graph/contrib_ops/collective_defs.cc | 63 +++++ .../core/graph/contrib_ops/contrib_defs.cc | 4 + .../core/graph/contrib_ops/contrib_defs.h | 8 +- .../providers/cuda/cuda_provider_factory.cc | 4 +- .../providers/cuda/cuda_provider_factory.h | 4 +- .../core/session/provider_bridge_ort.cc | 8 +- .../python/onnxruntime_test_collective.py | 77 +++++++ .../cuda/communication/nccl_service.cc | 2 +- tools/ci_build/build.py | 4 +- ...ortmodule-distributed-test-ci-pipeline.yml | 160 ++++++++----- 19 files changed, 764 insertions(+), 277 deletions(-) create mode 100644 onnxruntime/contrib_ops/cuda/collective/mpi_include.h create mode 100644 onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc create mode 100644 onnxruntime/contrib_ops/cuda/collective/nccl_kernels.h create mode 100644 onnxruntime/core/graph/contrib_ops/collective_defs.cc create mode 100644 onnxruntime/test/python/onnxruntime_test_collective.py diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 9eab5a2a0fe3b..e24046fb2b8d5 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1339,102 +1339,105 @@ if (onnxruntime_ENABLE_TRAINING) add_compile_definitions(ENABLE_STRIDED_TENSORS) add_compile_definitions(ENABLE_TRAINING) - if (UNIX) - if (EXISTS "${onnxruntime_MPI_HOME}") - set(MPI_HOME "${onnxruntime_MPI_HOME}") - elseif (EXISTS "/bert_ort/openmpi") - set(MPI_HOME "/bert_ort/openmpi") - endif() + add_subdirectory(tensorboard EXCLUDE_FROM_ALL) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES tensorboard) +endif() + +if (UNIX AND onnxruntime_USE_MPI) + if (EXISTS "${onnxruntime_MPI_HOME}") + set(MPI_HOME "${onnxruntime_MPI_HOME}") + elseif (EXISTS "/bert_ort/openmpi") + set(MPI_HOME "/bert_ort/openmpi") + endif() - 
find_package(MPI) + find_package(MPI) - if (onnxruntime_USE_MPI OR onnxruntime_USE_NCCL) - if (MPI_CXX_FOUND) - message( STATUS "MPI Version: ${MPI_CXX_VERSION}") - message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" ) - mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) - else () - message( + if (MPI_CXX_FOUND) + message( STATUS "MPI Version: ${MPI_CXX_VERSION}") + message( STATUS "MPI (include: ${MPI_CXX_INCLUDE_DIRS}, library: ${MPI_CXX_LIBRARIES})" ) + mark_as_advanced(MPI_CXX_INCLUDE_DIRS MPI_CXX_LIBRARIES) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${MPI_CXX_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) + else () + message( FATAL_ERROR "MPI is not found. Please define onnxruntime_MPI_HOME to specify the path of MPI. Otherwise, NCCL will be disabled." ) - endif() - endif() + endif() - # Find NCCL and MPI - if (onnxruntime_USE_NCCL) - if (onnxruntime_USE_CUDA) - set(NCCL_LIBNAME "nccl") - elseif (onnxruntime_USE_ROCM) - set(NCCL_LIBNAME "rccl") + # Find NCCL and MPI + if (onnxruntime_USE_NCCL AND MPI_CXX_FOUND) + if (onnxruntime_USE_CUDA) + set(NCCL_LIBNAME "nccl") + elseif (onnxruntime_USE_ROCM) + set(NCCL_LIBNAME "rccl") + endif() + find_path(NCCL_INCLUDE_DIR + NAMES ${NCCL_LIBNAME}.h + HINTS + ${onnxruntime_NCCL_HOME}/include + $ENV{CUDA_ROOT}/include) + + find_library(NCCL_LIBRARY + NAMES ${NCCL_LIBNAME} + HINTS + ${onnxruntime_NCCL_HOME}/lib/x86_64-linux-gnu + ${onnxruntime_NCCL_HOME}/lib + $ENV{CUDA_ROOT}/lib64) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY) + + if (NCCL_FOUND) + set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/${NCCL_LIBNAME}.h") + message( STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}" ) + file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED + REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1) + if (NCCL_MAJOR_VERSION_DEFINED) + string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" "" + NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED}) + message( STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}" ) + endif() + file (STRINGS ${NCCL_HEADER_FILE} NCCL_MINOR_VERSION_DEFINED + REGEX "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1) + if (NCCL_MINOR_VERSION_DEFINED) + string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+" "" + NCCL_MINOR_VERSION ${NCCL_MINOR_VERSION_DEFINED}) + message(STATUS "NCCL_MINOR_VERSION: ${NCCL_MINOR_VERSION}") endif() - find_path(NCCL_INCLUDE_DIR - NAMES ${NCCL_LIBNAME}.h - HINTS - ${onnxruntime_NCCL_HOME}/include - $ENV{CUDA_ROOT}/include) - - find_library(NCCL_LIBRARY - NAMES ${NCCL_LIBNAME} - HINTS - ${onnxruntime_NCCL_HOME}/lib/x86_64-linux-gnu - ${onnxruntime_NCCL_HOME}/lib - $ENV{CUDA_ROOT}/lib64) - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARY) - - if (NCCL_FOUND) - set(NCCL_HEADER_FILE "${NCCL_INCLUDE_DIR}/${NCCL_LIBNAME}.h") - message( STATUS "Determining NCCL version from the header file: ${NCCL_HEADER_FILE}" ) - file (STRINGS ${NCCL_HEADER_FILE} NCCL_MAJOR_VERSION_DEFINED - REGEX "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1) - if (NCCL_MAJOR_VERSION_DEFINED) - string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MAJOR[ \t]+" "" - NCCL_MAJOR_VERSION ${NCCL_MAJOR_VERSION_DEFINED}) - message( STATUS "NCCL_MAJOR_VERSION: ${NCCL_MAJOR_VERSION}" ) - endif() - file (STRINGS 
${NCCL_HEADER_FILE} NCCL_MINOR_VERSION_DEFINED - REGEX "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+[0-9]+.*$" LIMIT_COUNT 1) - if (NCCL_MINOR_VERSION_DEFINED) - string (REGEX REPLACE "^[ \t]*#define[ \t]+NCCL_MINOR[ \t]+" "" - NCCL_MINOR_VERSION ${NCCL_MINOR_VERSION_DEFINED}) - message(STATUS "NCCL_MINOR_VERSION: ${NCCL_MINOR_VERSION}") + if (NCCL_MAJOR_VERSION_DEFINED AND NCCL_MINOR_VERSION_DEFINED) + if ("${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}" VERSION_GREATER_EQUAL "2.7") + add_definitions(-DUSE_NCCL_P2P=1) + message( STATUS "NCCL P2P is enabled for supporting ncclSend and ncclRecv." ) endif() + endif() - if (NCCL_MAJOR_VERSION_DEFINED AND NCCL_MINOR_VERSION_DEFINED) - if ("${NCCL_MAJOR_VERSION}.${NCCL_MINOR_VERSION}" VERSION_GREATER_EQUAL "2.7") - add_definitions(-DUSE_NCCL_P2P=1) - message( STATUS "NCCL P2P is enabled for supporting ncclSend and ncclRecv." ) - endif() - endif() + set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) + set(NCCL_LIBRARIES ${NCCL_LIBRARY}) + message( STATUS "NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})" ) + mark_as_advanced(NCCL_INCLUDE_DIRS NCCL_LIBRARIES) - set(NCCL_INCLUDE_DIRS ${NCCL_INCLUDE_DIR}) - set(NCCL_LIBRARIES ${NCCL_LIBRARY}) - message( STATUS "NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})" ) - mark_as_advanced(NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${NCCL_LIBRARIES}) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${NCCL_LIBRARIES}) - add_definitions(-DORT_USE_NCCL=1) - message( STATUS "NCCL is enabled in Linux GPU Build." ) - else () - message( + add_definitions(-DORT_USE_NCCL=1) + message( STATUS "NCCL is enabled in Linux GPU Build." ) + else () + set(onnxruntime_USE_NCCL OFF) + message( FATAL_ERROR "NCCL is not found. Please use --nccl_home to specify the path of NCCL. Otherwise, NCCL is disabled." ) - endif() endif() endif() +else() + set(onnxruntime_USE_NCCL OFF) + set(onnxruntime_USE_MPI OFF) +message( WARNING "MPI and NCCL disabled on Win build." 
) +endif() - if (onnxruntime_USE_MPI) - add_definitions(-DUSE_MPI=1) - endif() - - add_subdirectory(tensorboard EXCLUDE_FROM_ALL) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES tensorboard) +if (onnxruntime_USE_MPI) + add_definitions(-DUSE_MPI=1) endif() # Default version parts for Microsoft.AI.MachineLearning.dll, onnxruntime.dll, onnxruntime_providers_openvino.dll and onnxruntime_providers_shared.dll in non-ADO pipeline local builds diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index 6a33100d23bde..5c947a52b7838 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -75,9 +75,9 @@ if (onnxruntime_ENABLE_TRAINING_OPS) onnxruntime_add_include_to_target(onnxruntime_framework Python::Module) target_include_directories(onnxruntime_framework PRIVATE ${dlpack_SOURCE_DIR}/include) endif() - if (onnxruntime_USE_NCCL OR onnxruntime_USE_MPI) - target_include_directories(onnxruntime_framework PUBLIC ${MPI_CXX_INCLUDE_DIRS}) - endif() +endif() +if (onnxruntime_USE_MPI) + target_include_directories(onnxruntime_framework PUBLIC ${MPI_CXX_INCLUDE_DIRS}) endif() if (onnxruntime_ENABLE_ATEN) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 6697493fbb3c9..33d13f4476b51 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -383,6 +383,11 @@ if (onnxruntime_USE_CUDA) "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/aten_ops/aten_op.cc" ) endif() + if (NOT onnxruntime_USE_NCCL) + list(REMOVE_ITEM onnxruntime_cuda_contrib_ops_cc_srcs + "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/nccl_kernels.cc" + ) + endif() # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs}) list(APPEND onnxruntime_providers_cuda_src ${onnxruntime_cuda_contrib_ops_cc_srcs} ${onnxruntime_cuda_contrib_ops_cu_srcs}) @@ -507,14 +512,15 @@ if (onnxruntime_USE_CUDA) if (onnxruntime_ENABLE_TRAINING_OPS) target_include_directories(onnxruntime_providers_cuda PRIVATE ${ORTTRAINING_ROOT} ${MPI_CXX_INCLUDE_DIRS}) - if(onnxruntime_USE_MPI) - target_link_libraries(onnxruntime_providers_cuda PRIVATE ${MPI_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) - endif() + endif() - if (onnxruntime_USE_NCCL) - target_include_directories(onnxruntime_providers_cuda PRIVATE ${NCCL_INCLUDE_DIRS}) - target_link_libraries(onnxruntime_providers_cuda PRIVATE ${NCCL_LIBRARIES}) - endif() + if(onnxruntime_USE_MPI) + target_link_libraries(onnxruntime_providers_cuda PRIVATE ${MPI_LIBRARIES} ${MPI_CXX_LINK_FLAGS}) + endif() + + if (onnxruntime_USE_NCCL) + target_include_directories(onnxruntime_providers_cuda PRIVATE ${NCCL_INCLUDE_DIRS}) + target_link_libraries(onnxruntime_providers_cuda PRIVATE ${NCCL_LIBRARIES}) endif() if (WIN32) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index b118161b1c45a..ec3726048f09e 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -98,6 +98,9 @@ set(contrib_ops_excluded_files if (NOT onnxruntime_ENABLE_ATEN) list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") endif() +if (NOT onnxruntime_USE_NCCL) + list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc") +endif() set(provider_excluded_files "atomic/common.cuh" diff --git a/onnxruntime/contrib_ops/cuda/collective/mpi_include.h b/onnxruntime/contrib_ops/cuda/collective/mpi_include.h new file mode 100644 
index 0000000000000..ee560bdf4207a --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/collective/mpi_include.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if defined(USE_MPI) +#define OMPI_SKIP_MPICXX 1 // See https://github.com/open-mpi/ompi/issues/5157 +#include +#undef OMPI_SKIP_MPICXX + +namespace onnxruntime { + +#if defined(USE_MPI) +#define MPI_CHECK(condition) \ + do { \ + int error = (condition); \ + ORT_ENFORCE( \ + error == MPI_SUCCESS, \ + "MPI Error at: ", \ + __FILE__, \ + ":", \ + __LINE__, \ + ": ", \ + error); \ + } while (0) +#endif +} // namespace onnxruntime +#endif diff --git a/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc b/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc new file mode 100644 index 0000000000000..3122f070fd57f --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.cc @@ -0,0 +1,151 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +#include "nccl_kernels.h" +#include "mpi_include.h" + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +#define NCCL_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(NCCL_CALL(expr)) + +static ncclDataType_t GetNcclDataType(onnxruntime::MLDataType type) { + if (type == DataTypeImpl::GetType()) { + return ncclUint8; + } else if (type == DataTypeImpl::GetType()) { + return ncclInt8; + } else if (type == DataTypeImpl::GetType()) { + return ncclInt32; + } else if (type == DataTypeImpl::GetType()) { + return ncclInt64; + } else if (type == DataTypeImpl::GetType()) { + return ncclFloat16; + } else if (type == DataTypeImpl::GetType()) { + return ncclFloat32; + } else if (type == DataTypeImpl::GetType()) { + return ncclFloat64; + } else { + ORT_THROW("Tensor type not supported in NCCL."); + } +} + +#ifdef USE_MPI +static Status CreateNcclCommByMPI(int world_size, int rank, ncclComm_t* comm) { + // Create new NCCL communicator + ncclUniqueId nccl_id; + if (rank == 0) { + NCCL_RETURN_IF_ERROR(ncclGetUniqueId(&nccl_id)); + } + MPI_CHECK(MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD)); + NCCL_RETURN_IF_ERROR(ncclCommInitRank(comm, world_size, nccl_id, rank)); + + return Status::OK(); +} +#endif + +NcclContext::NcclContext() { +#ifdef USE_MPI + int is_mpi_initialized = 0; + MPI_Initialized(&is_mpi_initialized); + if (!is_mpi_initialized) { + int mpi_threads_provided = 0; + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &mpi_threads_provided); + } + + // get world_size and rank from MPI + MPI_Comm_size(MPI_COMM_WORLD, &world_size_); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank_); + + // Initialize global Parallel Group NCCL Communicator + auto ret = CreateNcclCommByMPI(world_size_, rank_, &comm_); + ORT_ENFORCE(ret.IsOK()); + +#else + ORT_THROW("ORT must be built with MPI to use NCCL."); +#endif +} + +NcclContext::~NcclContext() { + if (comm_ != nullptr) { + ncclCommDestroy(comm_); + } + +#ifdef USE_MPI + int is_mpi_finalized = 0; + MPI_Finalized(&is_mpi_finalized); + if (!is_mpi_finalized) { + MPI_Finalize(); + } +#endif +} + +NcclKernel::NcclKernel(const OpKernelInfo& info) : CudaKernel(info) { + static NcclContext context; + nccl_ = &context; +} + +AllReduce::AllReduce(const OpKernelInfo& info) : NcclKernel(info) { +} + +Status AllReduce::ComputeInternal(OpKernelContext* context) const { + ncclComm_t comm = nccl_->Comm(); + + auto input_tensor = context->Input(0); + const void* input_data = input_tensor->DataRaw(); + const 
auto in_shape = input_tensor->Shape(); + int64_t input_count = in_shape.Size(); + + void* output_data = context->Output(0, in_shape)->MutableDataRaw(); + + ncclDataType_t dtype = GetNcclDataType(input_tensor->DataType()); +#ifdef ORT_USE_NCCL + NCCL_RETURN_IF_ERROR(ncclAllReduce(input_data, output_data, input_count, dtype, ncclSum, comm, Stream(context))); +#endif + return Status::OK(); +} + +AllGather::AllGather(const OpKernelInfo& info) : NcclKernel(info) { + info.GetAttrOrDefault("group_size", &group_size_, static_cast(1)); +} + +Status AllGather::ComputeInternal(OpKernelContext* context) const { + ncclComm_t comm = nccl_->Comm(); + + auto input_tensor = context->Input(0); + const void* input_data = input_tensor->DataRaw(); + const auto in_shape = input_tensor->Shape(); + int64_t input_count = in_shape.Size(); + // construct output shape + TensorShape out_shape(in_shape); + out_shape[0] = group_size_ * out_shape[0]; + + void* output_data = context->Output(0, out_shape)->MutableDataRaw(); + + ncclDataType_t dtype = GetNcclDataType(input_tensor->DataType()); +#ifdef ORT_USE_NCCL + NCCL_RETURN_IF_ERROR(ncclAllGather(input_data, output_data, input_count, dtype, comm, Stream(context))); +#endif + return Status::OK(); +} + +ONNX_OPERATOR_KERNEL_EX(AllReduce, kMSDomain, 1, kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .VariadicAlias(0, 0) // outputs and inputs are mapped one to one + .AllocateInputsContiguously() + .TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()), + AllReduce); + +ONNX_OPERATOR_KERNEL_EX( + AllGather, + kMSDomain, + 1, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .AllocateInputsContiguously() + .TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()), + AllGather); + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.h b/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.h new file mode 100644 index 0000000000000..1576f674106e2 --- /dev/null +++ b/onnxruntime/contrib_ops/cuda/collective/nccl_kernels.h @@ -0,0 +1,74 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "core/providers/cuda/cuda_kernel.h" + +#if defined(ORT_USE_NCCL) +#include +#endif + +namespace onnxruntime { +namespace contrib { +namespace cuda { + +// ----------------------------------------------------------------------- +// Defines a new version of nccl classes +// that independent with training::DistributedRunContext, only rely on MPI +// ----------------------------------------------------------------------- +class NcclContext final { + public: + NcclContext(); + ~NcclContext(); + + ncclComm_t Comm() { + return comm_; + } + + int Rank() const { + return rank_; + } + + int Size() const { + return world_size_; + } + + private: + ncclComm_t comm_; + int rank_; + int world_size_; +}; + +class NcclKernel : public ::onnxruntime::cuda::CudaKernel { + public: + explicit NcclKernel(const OpKernelInfo& info); + + protected: + NcclContext* nccl_ = nullptr; +}; + +/* + * Defines new version of Nccl classes that independent with training::DistributedContext + * only rely on MPI + */ +class AllReduce final : public NcclKernel { + public: + explicit AllReduce(const OpKernelInfo& info); + + Status ComputeInternal(OpKernelContext* context) const override; +}; + +class AllGather final : public NcclKernel { + public: + explicit AllGather(const OpKernelInfo& info); + + Status ComputeInternal(OpKernelContext* context) const override; + + private: + int64_t group_size_; +}; + +} // namespace cuda +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index 1254ccd7e1e17..cabf890ab341d 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -132,6 +132,11 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, QOrd class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen); #endif +#if defined(USE_MPI) && defined(ORT_USE_NCCL) +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllReduce); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, AllGather); +#endif + template <> KernelCreateInfo BuildKernelCreateInfo() { KernelCreateInfo info; @@ -242,29 +247,35 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // TransposedMatMul is still here for backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // TransposedMatMul is still here for backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN - BuildKernelCreateInfo, + 
BuildKernelCreateInfo, #endif + +#if defined(USE_MPI) && defined(ORT_USE_NCCL) + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#endif + }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index e056c8cbfb64d..252f943c43df3 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -109,6 +109,11 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain, 1, ATen); #endif +#if defined(USE_MPI) && defined(ORT_USE_NCCL) +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllReduce); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllGather); +#endif + template <> KernelCreateInfo BuildKernelCreateInfo() { KernelCreateInfo info; @@ -117,116 +122,120 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, //default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, - // These ops were experimental ops in onnx domain which have been removed now. 
We add them here as - // contrib ops to maintain backward compatibility - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // These ops were experimental ops in onnx domain which have been removed now. We add them here as + // contrib ops to maintain backward compatibility + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, - // BuildKernelCreateInfo + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo, + // BuildKernelCreateInfo - BuildKernelCreateInfo, - // TransposedMatMul is still here for backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + // TransposedMatMul is still here for backward compatibility + BuildKernelCreateInfo, // backward compatibility + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN - BuildKernelCreateInfo, + BuildKernelCreateInfo, +#endif +#if defined(USE_MPI) && defined(ORT_USE_NCCL) + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - }; + }; for (auto& function_table_entry : function_table) { KernelCreateInfo info = function_table_entry(); diff --git a/onnxruntime/core/graph/contrib_ops/collective_defs.cc b/onnxruntime/core/graph/contrib_ops/collective_defs.cc new file mode 100644 index 0000000000000..167b80238a3d6 --- /dev/null +++ b/onnxruntime/core/graph/contrib_ops/collective_defs.cc @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/graph/contrib_ops/contrib_defs.h" +#include "core/graph/constants.h" + +namespace onnxruntime { +namespace contrib { + +using ONNX_NAMESPACE::AttributeProto; +using ONNX_NAMESPACE::InferenceContext; +using ONNX_NAMESPACE::OpSchema; +using ONNX_NAMESPACE::OPTIONAL_VALUE; +using ONNX_NAMESPACE::TypeProto; + +void RegisterCollectiveOps() { + ONNX_CONTRIB_OPERATOR_SCHEMA(AllReduce) + .SetDomain(kMSDomain) + .SinceVersion(1) + .Input(0, "input", "tensors to be reduced", "T", OpSchema::Variadic) + .Output(0, "output", "reduced tensors", "T", OpSchema::Variadic) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain to float, float16 and double tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateShapeAndTypeFromFirstInput(ctx); + }); + + ONNX_CONTRIB_OPERATOR_SCHEMA(AllGather) + .SetDomain(kMSDomain) + .SinceVersion(1) + .Attr("group_size", + "total size in the group that need to be gathered.", + AttributeProto::INT, + static_cast(1)) + .Input(0, "input", "tensors to be sent", "T", OpSchema::Variadic) + .Output(0, "output", "gathered tensors", "T", OpSchema::Variadic) + .TypeConstraint( + "T", + {"tensor(float16)", "tensor(float)", "tensor(double)"}, + "Constrain to float, float16 and double tensors.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + auto group_size = getAttribute(ctx, "group_size", 1); + assert(group_size >= static_cast(1)); + // propagate type for output + propagateElemTypeFromInputToOutput(ctx, 0, 0); + + // propagate shape for output. + // output shape is [group_size * input_shape[0], ...] 
+ auto output_type = ctx.getOutputType(0); + auto input_type = ctx.getInputType(0); + if (hasShape(*input_type)) { + auto shape = input_type->tensor_type().shape(); + auto dim = shape.dim(0) * group_size; + *shape.mutable_dim(0) = dim; + *output_type->mutable_tensor_type()->mutable_shape() = shape; + } + }); +} + +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index a8c870d1442cf..e80841ef63543 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -2708,6 +2708,10 @@ This op functions in much the same was as Dropout-11 and Dropout-13 do, execpt t RegisterNchwcSchemas(); } #endif + +#ifdef USE_MPI + RegisterCollectiveOps(); +#endif } } // namespace contrib diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.h b/onnxruntime/core/graph/contrib_ops/contrib_defs.h index 7d70c708a9c7b..4c24b284c6ddb 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.h +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.h @@ -13,7 +13,7 @@ #define ONNX_MS_OPERATOR_SET_SCHEMA(name, ver, impl) \ ONNX_OPERATOR_SET_SCHEMA_EX(name, Microsoft, ::onnxruntime::kMSDomain, ver, true, impl) -//They are in ONNX domain but they are in our source code +// They are in ONNX domain but they are in our source code #define ONNX_CONTRIB_OPERATOR_SET_SCHEMA(name, ver, impl) \ ONNX_OPERATOR_SET_SCHEMA_EX(name, Onnx, ::ONNX_NAMESPACE::ONNX_DOMAIN, ver, true, impl) @@ -29,7 +29,7 @@ inline bool HasRawData(const ONNX_NAMESPACE::TensorProto& ten_proto) { return ten_proto.data_type() != ONNX_NAMESPACE::TensorProto::UNDEFINED && ten_proto.has_raw_data(); // XXX: Figure out how to do in proto3 } -} +} // namespace utils #define ONNX_CONTRIB_OPERATOR_SCHEMA(name) \ ONNX_CONTRIB_OPERATOR_SCHEMA_UNIQ_HELPER(__COUNTER__, name) @@ -53,6 +53,10 @@ void RegisterContribSchemas(); void RegisterNchwcSchemas(); void RegisterQuantizationSchemas(); +#if defined(USE_MPI) +void RegisterCollectiveOps(); +#endif + constexpr const float kDefaultSkipLayerNormEpsilon = 1e-12f; constexpr const float kDefaultEmbedLayerNormEpsilon = 1e-12f; } // namespace contrib diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index d7858fd1c4e24..d059f93cebf1c 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -28,7 +28,7 @@ using namespace onnxruntime; namespace onnxruntime { -#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) namespace cuda { cuda::INcclService& GetINcclService(); } @@ -164,7 +164,7 @@ struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA { info = CUDAExecutionProviderInfo::FromProviderOptions(options); } -#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) cuda::INcclService& GetINcclService() override { return cuda::GetINcclService(); } diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.h b/onnxruntime/core/providers/cuda/cuda_provider_factory.h index 259fd199120e7..76b1693d62b9b 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.h +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.h @@ -20,7 +20,7 @@ class 
NvtxRangeCreator; } struct ProviderInfo_CUDA { - virtual ~ProviderInfo_CUDA() {} // This is declared due to a TSA warning, the only instantiation of this class is a global variable of automatic storage. + virtual ~ProviderInfo_CUDA() {} // This is declared due to a TSA warning, the only instantiation of this class is a global variable of automatic storage. virtual OrtStatus* SetCurrentGpuDeviceId(_In_ int device_id) = 0; virtual OrtStatus* GetCurrentGpuDeviceId(_In_ int* device_id) = 0; @@ -43,7 +43,7 @@ struct ProviderInfo_CUDA { virtual int cudaGetDeviceCount() = 0; virtual void CUDAExecutionProviderInfo__FromProviderOptions(const onnxruntime::ProviderOptions& options, onnxruntime::CUDAExecutionProviderInfo& info) = 0; -#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) virtual onnxruntime::cuda::INcclService& GetINcclService() = 0; #endif diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 8209abf5fd37a..cbc0273a1004e 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -42,12 +42,12 @@ #ifdef ENABLE_NVTX_PROFILE #include "core/providers/cuda/nvtx_profile.h" #endif -#if defined(ORT_USE_NCCL) +#if defined(ORT_USE_NCCL) && defined(ENABLE_TRAINING) #include "orttraining/training_ops/cuda/communication/nccl_service.h" #include "orttraining/core/framework/distributed_run_context.h" #endif -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(ENABLE_TRAINING) #include "orttraining/training_ops/rocm/communication/nccl_service.h" #include "orttraining/core/framework/distributed_run_context.h" #endif @@ -1391,7 +1391,7 @@ void NvtxRangeCreator::EndImpl() { } // namespace profile #endif -#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) namespace cuda { INcclService& INcclService::GetInstance() { return GetProviderInfo_CUDA().GetINcclService(); @@ -1399,7 +1399,7 @@ INcclService& INcclService::GetInstance() { } // namespace cuda #endif -#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_ROCM) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) namespace rocm { INcclService& INcclService::GetInstance() { return GetProviderInfo_ROCM().GetINcclService(); diff --git a/onnxruntime/test/python/onnxruntime_test_collective.py b/onnxruntime/test/python/onnxruntime_test_collective.py new file mode 100644 index 0000000000000..c2031498bd55a --- /dev/null +++ b/onnxruntime/test/python/onnxruntime_test_collective.py @@ -0,0 +1,77 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+import unittest + +import numpy as np +import onnx +from mpi4py import MPI +from onnx import AttributeProto, GraphProto, TensorProto, helper + +import onnxruntime as ort + + +class ORTBertPretrainTest(unittest.TestCase): + def _create_allreduce_ut_model(self, shape): + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape) + node_def = helper.make_node("AllReduce", ["X"], ["Y"], domain="com.microsoft") + graph_def = helper.make_graph( + [node_def], + "", + [X], + [Y], + ) + return helper.make_model(graph_def, producer_name="ort-distributed-inference-unittest") + + def _get_rank_size(self): + comm = MPI.COMM_WORLD + return comm.Get_rank(), comm.Get_size() + + def _create_allgather_ut_model(self, shape): + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) + rank, size = self._get_rank_size() + output_shape = [s * size if _ == 0 else s for _, s in enumerate(shape)] + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, output_shape) + node_def = helper.make_node("AllGather", ["X"], ["Y"], domain="com.microsoft", group_size=size) + graph_def = helper.make_graph( + [node_def], + "", + [X], + [Y], + ) + return helper.make_model(graph_def, producer_name="ort-distributed-inference-unittest") + + def test_all_reduce(self): + model = self._create_allreduce_ut_model((128, 128)) + rank, size = self._get_rank_size() + ort_sess = ort.InferenceSession( + model.SerializeToString(), + providers=["CUDAExecutionProvider", "CPUExecutionProvider"], + provider_options=[{"device_id": str(rank)}, {}], + ) + + input = np.ones((128, 128), dtype=np.float32) + outputs = ort_sess.run(None, {"X": input}) + assert np.allclose(outputs[0], size * input) + + def test_all_gather(self): + model = self._create_allgather_ut_model((128, 128)) + rank, size = self._get_rank_size() + ort_sess = ort.InferenceSession( + model.SerializeToString(), + providers=["CUDAExecutionProvider", "CPUExecutionProvider"], + provider_options=[{"device_id": str(rank)}, {}], + ) + + input = np.ones((128, 128), dtype=np.float32) * rank + outputs = ort_sess.run(None, {"X": input}) + + expected_output = np.zeros((128, 128), dtype=np.float32) + for _ in range(size - 1): + expected_output = np.concatenate((expected_output, np.ones((128, 128), dtype=np.float32) * (_ + 1))) + + assert np.allclose(outputs[0], expected_output) + + +if __name__ == "__main__": + unittest.main(module=__name__, buffer=True) diff --git a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc index efe4612678ada..f604e4c4aaf3e 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) +#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P) && defined(ENABLE_TRAINING) #include "orttraining/training_ops/cuda/communication/nccl_service.h" #include "core/common/common.h" diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index f421800523667..01ae36affd488 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -192,7 +192,7 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--enable_training_apis", action="store_true", help="Enable ort training apis.") parser.add_argument("--enable_training_ops", action="store_true", help="Enable training ops in inference graph.") - parser.add_argument("--disable_nccl", action="store_false", help="Disable NCCL, by default NCCL is disabled.") + parser.add_argument("--enable_nccl", action="store_true", help="Enable Nccl.") parser.add_argument("--mpi_home", help="Path to MPI installation dir") parser.add_argument("--nccl_home", help="Path to NCCL installation dir") parser.add_argument( @@ -940,7 +940,7 @@ def generate_build_tree( "-Donnxruntime_ENABLE_TRAINING_APIS=" + ("ON" if args.enable_training_apis else "OFF"), # Enable advanced computations such as AVX for some traininig related ops. "-Donnxruntime_ENABLE_CPU_FP16_OPS=" + ("ON" if args.enable_training else "OFF"), - "-Donnxruntime_USE_NCCL=" + ("ON" if args.enable_training and not args.disable_nccl else "OFF"), + "-Donnxruntime_USE_NCCL=" + ("ON" if args.enable_nccl else "OFF"), "-Donnxruntime_BUILD_BENCHMARKS=" + ("ON" if args.build_micro_benchmarks else "OFF"), "-Donnxruntime_USE_ROCM=" + ("ON" if args.use_rocm else "OFF"), "-DOnnxruntime_GCOV_COVERAGE=" + ("ON" if args.code_coverage else "OFF"), diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml index c0e30c13374db..76d14f12c1c55 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml @@ -1,55 +1,109 @@ trigger: none -jobs: -- job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test - - timeoutInMinutes: 120 - pool: 'Onnxruntime-Linux-GPU-NC24sv3' - - steps: - - checkout: self - clean: true - submodules: recursive - - - template: templates/run-docker-build-steps.yml - parameters: - RunDockerBuildArgs: | - -o ubuntu20.04 -d gpu \ - -t onnxruntime_ortmodule_distributed_tests_image \ - -x " \ - --config RelWithDebInfo \ - --use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \ - --enable_training \ - --update --build \ - --build_wheel \ - " \ - -m \ - -u \ - -e - DisplayName: 'Build' - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" - displayName: 'Mount MNIST' - condition: succeededOrFailed() - - # Entry point for all ORTModule distributed tests - # Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline. 
- - script: | - docker run \ - --gpus all \ - --shm-size=1024m \ - --rm \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /mnist:/mnist \ - onnxruntime_ortmodule_distributed_tests_image \ - bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \ - displayName: 'Run orttraining_ortmodule_distributed_tests.py' - condition: succeededOrFailed() - timeoutInMinutes: 30 - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - - template: templates/clean-agent-build-directory-step.yml +stages: +- stage: ORTModuleDistributedTest + dependsOn: [] + jobs: + - job: Onnxruntime_Linux_GPU_ORTModule_Distributed_Test + + timeoutInMinutes: 120 + pool: 'Onnxruntime-Linux-GPU-NC24sv3' + + steps: + - checkout: self + clean: true + submodules: recursive + + - template: templates/run-docker-build-steps.yml + parameters: + RunDockerBuildArgs: | + -o ubuntu20.04 -d gpu \ + -t onnxruntime_ortmodule_distributed_tests_image \ + -x " \ + --config RelWithDebInfo \ + --use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \ + --enable_training \ + --update --build \ + --build_wheel \ + " \ + -m \ + -u \ + -e + DisplayName: 'Build' + + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" + displayName: 'Mount MNIST' + condition: succeededOrFailed() + + # Entry point for all ORTModule distributed tests + # Refer to orttraining/orttraining/test/python/how_to_add_ortmodule_distributed_ci_pipeline_tests.md for guidelines on how to add new tests to this pipeline. 
+ - script: | + docker run \ + --gpus all \ + --shm-size=1024m \ + --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /mnist:/mnist \ + onnxruntime_ortmodule_distributed_tests_image \ + bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \ + displayName: 'Run orttraining_ortmodule_distributed_tests.py' + condition: succeededOrFailed() + timeoutInMinutes: 30 + + - template: templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + + - template: templates/clean-agent-build-directory-step.yml + +- stage: DistributedInferenceTest + dependsOn: [] + jobs: + - job: Onnxruntime_Linux_GPU_Inference_Distributed_Test + + timeoutInMinutes: 120 + pool: 'Onnxruntime-Linux-GPU-NC24sv3' + + steps: + - checkout: self + clean: true + submodules: recursive + + - template: templates/run-docker-build-steps.yml + parameters: + RunDockerBuildArgs: | + -o ubuntu20.04 -d gpu \ + -t onnxruntime_ortmodule_distributed_tests_image \ + -x " \ + --config RelWithDebInfo \ + --use_cuda --cuda_version=11.6 --cuda_home=/usr/local/cuda-11.6 --cudnn_home=/usr/local/cuda-11.6 \ + --update --build \ + --build_wheel \ + --use_mpi \ + --enable_nccl \ + " \ + -m \ + -u \ + -e + DisplayName: 'Build' + + - script: | + docker run \ + --gpus all \ + --shm-size=1024m \ + --rm \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /mnist:/mnist \ + onnxruntime_ortmodule_distributed_tests_image \ + bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install mpi4py && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && mpirun -n 4 -x NCCL_DEBUG=INFO python /onnxruntime_src/onnxruntime/test/python/onnxruntime_test_collective.py" \ + displayName: 'Run onnxruntime_test_collective.py' + condition: succeededOrFailed() + timeoutInMinutes: 30 + + - template: templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + + - template: templates/clean-agent-build-directory-step.yml From cd7098fdf47e804ff84ca0cc2522e82bce8a2e7f Mon Sep 17 00:00:00 2001 From: Hector Li Date: Tue, 7 Feb 2023 15:33:05 -0800 Subject: [PATCH 26/68] fix snpe build (#14616) ### Description Fix the SNPE build issue caused by the cmake dependency refactor. ### Motivation and Context For the release, the winai packaging pipeline's container image was reverted to the old image, so we should revert VS to 2019. --- .pipelines/windowsai-steps.yml | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.pipelines/windowsai-steps.yml b/.pipelines/windowsai-steps.yml index d246b326764a2..aa9ec339c7b2a 100644 --- a/.pipelines/windowsai-steps.yml +++ b/.pipelines/windowsai-steps.yml @@ -3,12 +3,12 @@ parameters: displayName: BuildArch type: string default: 'x64' - + - name: Runtime displayName: MSVC Runtime, should be 'dynamic' or 'static'. 
type: string default: 'dynamic' - + - name: PythonPackageName displayName: PythonPackageName on nuget.org to use type: string @@ -17,8 +17,8 @@ parameters: jobs: - job: Windows_Packaging_${{ parameters.BuildArch }}_${{ parameters.Runtime }} pool: - type: windows - + type: windows + variables: ob_outputDirectory: '$(Build.ArtifactStagingDirectory)' ob_sdl_binskim_break: true @@ -40,7 +40,7 @@ jobs: restoreSolution: $(Build.SourcesDirectory)\.pipelines\nuget_config\x64\packages.config ${{ if eq(parameters.BuildArch, 'arm64') }}: restoreSolution: $(Build.SourcesDirectory)\.pipelines\nuget_config\x64\packages.config - + - script: | @echo off set vswherepath="%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" @@ -50,10 +50,10 @@ jobs: set vsdevcmd="%%i\Common7\Tools\vsdevcmd.bat" ) ) - + @echo vslatest %vslatest% @echo vsdevcmd %vsdevcmd% - + @echo ##vso[task.setvariable variable=vslatest]%vslatest% @echo ##vso[task.setvariable variable=vsdevcmd]%vsdevcmd% -arch=${{ parameters.BuildArch }} displayName: 'locate vsdevcmd via vswhere' @@ -80,7 +80,7 @@ jobs: 7z x cmake-3.24.3-windows-x86_64.zip set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools - $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\ctest.exe + $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.7.9\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.24.3-windows-x86_64\bin\ctest.exe workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Generate cmake config' @@ -97,7 +97,7 @@ jobs: maximumCpuCount: true logProjectEvents: true workingFolder: '$(Build.BinariesDirectory)\RelWithDebInfo' - createLogFile: true + createLogFile: true - ${{ if eq(parameters.Runtime, 'dynamic') }}: - script: | @@ -114,8 +114,8 @@ jobs: copy $(Build.SourcesDirectory)\onnxruntime\test\testdata\sequence_length.onnx $(Build.ArtifactStagingDirectory)\test_artifact\ copy $(Build.SourcesDirectory)\onnxruntime\test\testdata\sequence_construct.onnx $(Build.ArtifactStagingDirectory)\test_artifact\ displayName: 'Copy WinML test collateral to artifact directory' - - + + - ${{ if eq(parameters.BuildArch, 'x64') }}: - script: | call $(vsdevcmd) @@ -129,8 +129,8 @@ jobs: signing_profile: 'external_distribution' files_to_sign: '**/*.exe;**/*.dll' search_root: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - displayName: 'Sign runtime DLLs' - + displayName: 'Sign runtime DLLs' + - ${{ if 
eq(parameters.BuildArch, 'x64') }}: - script: | call $(vsdevcmd) From 0b52a887b6a20d6a91e0d62128206eb3844d5077 Mon Sep 17 00:00:00 2001 From: Faith Xu Date: Wed, 8 Feb 2023 09:44:20 -0800 Subject: [PATCH 29/68] [Readme] Update table for build pipelines (#14618) ### Description Update the list of pipelines to remove obsolete pipelines and reformat the table. Optional pipelines are not included, except for Android and iOS. ![image](https://user-images.githubusercontent.com/20780999/217395702-f08f1252-e1aa-4fec-ac34-1c0b9859ec20.png) --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c0505428caeae..68850f4be8ec1 100644 --- a/README.md +++ b/README.md @@ -23,14 +23,15 @@ ## Build Pipeline Status -|System|CPU|GPU|EPs| -|---|---|---|---| -|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)| -|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20x64%20NoContribops%20CI%20Pipeline?label=Linux+CPU+x64+No+Contrib+Ops)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=110)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/centos7_cpu?label=Linux+CentOS7)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=78)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-distributed?label=Distributed+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=140)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)| -|Mac|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20NoContribops%20CI%20Pipeline?label=MacOS+NoContribops)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=65)||| -|Android|||[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Android%20CI%20Pipeline?label=Android)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)| -|iOS|||[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/iOS%20CI%20Pipeline?label=iOS)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)| -|WebAssembly|||[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20WebAssembly%20CI%20Pipeline?label=WASM)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=161)| +|System|Inference|Training| +|---|---|---| +|Windows|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20CPU%20CI%20Pipeline?label=Windows+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=9)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20CI%20Pipeline?label=Windows+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=10)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Windows%20GPU%20TensorRT%20CI%20Pipeline?label=Windows+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=47)|| +|Linux|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20CI%20Pipeline?label=Linux+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=11)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20CPU%20Minimal%20Build%20E2E%20CI%20Pipeline?label=Linux+CPU+Minimal+Build)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=64)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20CI%20Pipeline?label=Linux+GPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=12)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20GPU%20TensorRT%20CI%20Pipeline?label=Linux+GPU+TensorRT)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=45)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Linux%20OpenVINO%20CI%20Pipeline?label=Linux+OpenVINO)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=55)|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-ci-pipeline?label=Linux+CPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=86)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining-linux-gpu-ci-pipeline?label=Linux+GPU+Training)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=84)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/orttraining/orttraining-ortmodule-distributed?label=Training+Distributed)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=148)| +|Mac|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/MacOS%20CI%20Pipeline?label=MacOS+CPU)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=13)|| +|Android|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/Android%20CI%20Pipeline?label=Android)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=53)|| +|iOS|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/iOS%20CI%20Pipeline?label=iOS)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=134)|| +|Web|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/ONNX%20Runtime%20Web%20CI%20Pipeline?label=Web)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=161)|| +|Other|[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/onnxruntime-binary-size-checks-ci-pipeline?repoName=microsoft%2Fonnxruntime&label=Binary+Size+Check)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=187&repoName=microsoft%2Fonnxruntime)
[![Build Status](https://dev.azure.com/onnxruntime/onnxruntime/_apis/build/status/onnxruntime-python-checks-ci-pipeline?label=Python+Checks)](https://dev.azure.com/onnxruntime/onnxruntime/_build/latest?definitionId=164)|| ## Data/Telemetry From ba8a00f62f96ce93eeae2216353a33832b610742 Mon Sep 17 00:00:00 2001 From: Valery Chernov Date: Wed, 8 Feb 2023 22:02:20 +0400 Subject: [PATCH 30/68] [TVM EP] Support zero copying TVM EP output tensor to ONNX Runtime output tensor (#12593) **Description**: Support new feature of TVM Virtual Machine (method `set_outputs`) on TVM Execution Provider side. It allows to avoid excess copying from TVM EP output tensor to ONNX Runtime one **Motivation and Context** Tests with multiple output topologies and big output tensors shows that there is overheads spent on copying from TVM EP to ONNX Runtime. Returning output(s) on preallocated memory for VirtualMachine was implemented on TVM side. **Details** `set_output_zero_copy` provider option for TVM EP switches on/off this feature. It is true by default. The feature works for both GraphExecutor and VirtualMachine from TVM. --------- Co-authored-by: Valery Chernov --- cmake/external/tvm.cmake | 2 +- onnxruntime/core/providers/tvm/tvm_api.cc | 49 ++++++++++----- onnxruntime/core/providers/tvm/tvm_api.h | 2 + .../core/providers/tvm/tvm_ep_options.cc | 4 ++ .../core/providers/tvm/tvm_ep_options.h | 1 + onnxruntime/core/providers/tvm/tvm_runner.cc | 5 +- .../core/providers/tvm/tvm_runner_impl.cc | 47 +++++++++++---- .../core/providers/tvm/tvm_runner_impl.h | 60 ++++++++++++++----- 8 files changed, 123 insertions(+), 47 deletions(-) diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake index 1e224a2dad4af..93049c8b85853 100644 --- a/cmake/external/tvm.cmake +++ b/cmake/external/tvm.cmake @@ -21,4 +21,4 @@ if (onnxruntime_USE_TVM) set(tvm_INCLUDE_DIRS ${tvm_SOURCE_DIR}/include) -endif() \ No newline at end of file +endif() diff --git a/onnxruntime/core/providers/tvm/tvm_api.cc b/onnxruntime/core/providers/tvm/tvm_api.cc index 2e841f38ffa64..fc46d0a90e30c 100644 --- a/onnxruntime/core/providers/tvm/tvm_api.cc +++ b/onnxruntime/core/providers/tvm/tvm_api.cc @@ -33,8 +33,7 @@ TvmModule TVMCompile(const TvmEPOptions& options, const std::string& onnx_txt, const std::string& model_path, int opset, - const TVMTensorShapes& input_shapes) -{ + const TVMTensorShapes& input_shapes) { ::tvm::Array shapes; for (size_t i = 0; i < input_shapes.size(); ++i) { @@ -203,8 +202,7 @@ TvmModule TVMSoCompile(const TvmEPOptions& options) { void TVMSetInputs(TvmModule& mod, std::vector& inds, - std::vector& inputs) -{ + std::vector& inputs) { TvmPackedFunc set_input = mod.GetFunction("set_input", false); TvmPackedFunc set_input_zero_copy = mod.GetFunction("set_input_zero_copy", false); for (size_t i = 0; i < inds.size(); ++i) { @@ -218,8 +216,7 @@ void TVMSetInputs(TvmModule& mod, void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, - std::vector& inputs) -{ + std::vector& inputs) { size_t num_total_args = inputs.size() + 1; std::vector tvm_values(num_total_args); std::vector tvm_type_codes(num_total_args); @@ -235,9 +232,33 @@ void TVM_VM_SetInputs(TvmModule& mod, set_input.CallPacked(::tvm::runtime::TVMArgs(tvm_values.data(), tvm_type_codes.data(), int(num_total_args)), &rv); } +void TVMSetOutputsZeroCopy(TvmModule& mod, + std::vector& outputs) { + TvmPackedFunc set_output = mod.GetFunction("set_output_zero_copy", false); + for (size_t i = 0; i < outputs.size(); ++i) { + set_output(i, &outputs[i]); + } +} + +void 
TVM_VM_SetOutputsZeroCopy(TvmModule& mod, + std::vector& outputs) { + size_t num_total_args = outputs.size() + 1; + std::vector tvm_values(num_total_args); + std::vector tvm_type_codes(num_total_args); + tvm_rt::TVMArgsSetter setter(tvm_values.data(), tvm_type_codes.data()); + const std::string func_name = "main"; + setter(0, func_name.c_str()); + for (size_t k = 0; k < num_total_args - 1; ++k) { + setter(k+1, &outputs[k]); + } + + TvmPackedFunc set_output = mod.GetFunction("set_outputs", false); + tvm_rt::TVMRetValue rv; + set_output.CallPacked(tvm_rt::TVMArgs(tvm_values.data(), tvm_type_codes.data(), num_total_args), &rv); +} + void TVMGetOutputs(TvmModule& mod, - std::vector& outputs) -{ + std::vector& outputs) { TvmPackedFunc get_output = mod.GetFunction("get_output", false); for (size_t i = 0; i < outputs.size(); ++i) { get_output(i, &outputs[i]); @@ -245,8 +266,7 @@ void TVMGetOutputs(TvmModule& mod, } void TVM_VM_GetOutputs(TvmModule& mod, - std::vector& outputs) -{ + std::vector& outputs) { TvmPackedFunc get_output = mod.GetFunction("get_output", false); for (size_t i = 0; i < outputs.size(); ++i) { // TODO(vvchernov): think about improvement of memory management @@ -256,8 +276,7 @@ void TVM_VM_GetOutputs(TvmModule& mod, } void TVMGetOutputShapes(TvmModule& mod, - TVMTensorShapes& output_shapes) -{ + TVMTensorShapes& output_shapes) { size_t size = output_shapes.size(); TvmPackedFunc get_output = mod.GetFunction("get_output", false); for (size_t i = 0; i < size; ++i) { @@ -272,15 +291,13 @@ void TVMGetOutputShapes(TvmModule& mod, } } -void TVMRun(TvmModule& mod) -{ +void TVMRun(TvmModule& mod) { TvmPackedFunc run = mod.GetFunction("run", false); ORT_ENFORCE(run != nullptr, "Unable to retrieve graph executor run."); run(); } -void TVM_VM_Run(TvmModule& mod) -{ +void TVM_VM_Run(TvmModule& mod) { TvmPackedFunc run = mod.GetFunction("invoke", false); ORT_ENFORCE(run != nullptr, "Unable to retrieve virtual machine invoke."); run("main"); diff --git a/onnxruntime/core/providers/tvm/tvm_api.h b/onnxruntime/core/providers/tvm/tvm_api.h index 810334231abca..a245b6398d69d 100644 --- a/onnxruntime/core/providers/tvm/tvm_api.h +++ b/onnxruntime/core/providers/tvm/tvm_api.h @@ -24,6 +24,8 @@ namespace tvm { void TVMSetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); + void TVMSetOutputsZeroCopy(TvmModule& mod, std::vector& outputs); + void TVM_VM_SetOutputsZeroCopy(TvmModule& mod, std::vector& outputs); void TVMGetOutputs(TvmModule& mod, std::vector& outputs); void TVM_VM_GetOutputs(TvmModule& mod, std::vector& outputs); void TVMGetOutputShapes(TvmModule& mod, diff --git a/onnxruntime/core/providers/tvm/tvm_ep_options.cc b/onnxruntime/core/providers/tvm/tvm_ep_options.cc index 1d701dc11d08a..91ed836b60f0d 100644 --- a/onnxruntime/core/providers/tvm/tvm_ep_options.cc +++ b/onnxruntime/core/providers/tvm/tvm_ep_options.cc @@ -23,6 +23,7 @@ constexpr const char* kTarget = "target"; constexpr const char* kTargetHost = "target_host"; constexpr const char* kOptLevel = "opt_level"; constexpr const char* kFreezeWeights = "freeze_weights"; +constexpr const char* kSetOutputZeroCopy = "set_output_zero_copy"; constexpr const char* kToNHWC = "to_nhwc"; constexpr const char* kTuningFilePath = "tuning_file_path"; constexpr const char* kTuningType = "tuning_type"; @@ -38,6 +39,7 @@ static const std::unordered_set valid_keys { std::string{kTargetHost}, std::string{kOptLevel}, std::string{kFreezeWeights}, + 
std::string{kSetOutputZeroCopy}, std::string{kToNHWC}, std::string{kTuningFilePath}, std::string{kTuningType}, @@ -124,6 +126,7 @@ TvmEPOptions TvmEPOptionsHelper::FromProviderOptions(const ProviderOptions& pr_o .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, options.target_host) .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, options.opt_level) .AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, options.freeze_weights) + .AddAssignmentToReference(tvm::provider_option_names::kSetOutputZeroCopy, options.set_output_zero_copy) .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, options.to_nhwc) .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, options.tuning_file_path) .AddAssignmentToReference(tvm::provider_option_names::kTuningType, options.tuning_type) @@ -261,6 +264,7 @@ std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options) { "target_host: " << options.target_host << "\n" << "opt level: " << options.opt_level << "\n" << "freeze weights: " << options.freeze_weights << "\n" << + "set_output_zero_copy: " << options.set_output_zero_copy << "\n" << "tuning file path: " << options.tuning_file_path << "\n" << "tuning type: " << options.tuning_type << "\n" << "convert layout to NHWC: " << options.to_nhwc << "\n" << diff --git a/onnxruntime/core/providers/tvm/tvm_ep_options.h b/onnxruntime/core/providers/tvm/tvm_ep_options.h index b9810b2734b84..f8444a090b612 100644 --- a/onnxruntime/core/providers/tvm/tvm_ep_options.h +++ b/onnxruntime/core/providers/tvm/tvm_ep_options.h @@ -41,6 +41,7 @@ struct TvmEPOptions { unsigned int opt_level{tvm::default_opt_level}; bool freeze_weights = true; bool to_nhwc = false; + bool set_output_zero_copy = true; std::string tuning_file_path{""}; std::string tuning_type{tvm::default_tuning_type}; std::string input_names_str{""}; diff --git a/onnxruntime/core/providers/tvm/tvm_runner.cc b/onnxruntime/core/providers/tvm/tvm_runner.cc index 8d2797668d781..afeabba477d20 100644 --- a/onnxruntime/core/providers/tvm/tvm_runner.cc +++ b/onnxruntime/core/providers/tvm/tvm_runner.cc @@ -18,8 +18,9 @@ TVMRunner::TVMRunner(const TvmEPOptions& options, runner_ = getTVMRunnerImpl(mod, options, inputs_info, output_tensors); } -common::Status TVMRunner::operator()(FunctionState state, const OrtApi* api, OrtKernelContext* context) { - return runner_->run(api, context); +common::Status TVMRunner::operator()(FunctionState state, const OrtApi* /*api*/, OrtKernelContext* context) { + Ort::KernelContext ctx(context); + return runner_->run(ctx); } } // namespace tvm diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.cc b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc index 6b22f7eaf5711..c9a79ffbd094f 100644 --- a/onnxruntime/core/providers/tvm/tvm_runner_impl.cc +++ b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc @@ -18,9 +18,11 @@ std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& m const std::vector output_tensors) { const std::string& name = options.executor; if (name == "graph") { - return std::make_shared(mod, inputs_info, options.output_shapes, output_tensors); + return std::make_shared(mod, inputs_info, options.output_shapes, + output_tensors, options.set_output_zero_copy); } else if (name == "vm") { - return std::make_shared(mod, inputs_info, options.output_shapes, output_tensors); + return std::make_shared(mod, inputs_info, options.output_shapes, + output_tensors, options.set_output_zero_copy); } return nullptr; } @@ -30,10 +32,12 @@ std::shared_ptr 
getTVMRunnerImpl(const std::shared_ptr& m RunnerImpl::RunnerImpl(const std::shared_ptr& mod, const InputsInfoMap& inputs_info, const TVMTensorShapes output_shapes, - const std::vector output_tensors) : mod_(mod), - inputs_info_(inputs_info), - output_shapes_(output_shapes), - output_tensors_(output_tensors) { + const std::vector output_tensors, + bool set_output_zero_copy) : mod_(mod), + inputs_info_(inputs_info), + output_shapes_(output_shapes), + output_tensors_(output_tensors), + set_output_zero_copy_(set_output_zero_copy) { } void RunnerImpl::convert_input_tensors2dl_tensors(Ort::KernelContext& context, @@ -88,7 +92,9 @@ void RunnerImpl::add_device_type_data2output_tensors(Ort::KernelContext& context GERunnerImpl::GERunnerImpl(const std::shared_ptr& mod, const InputsInfoMap& inputs_info, const TVMTensorShapes output_shapes, - const std::vector output_tensors) : RunnerImpl(mod, inputs_info, output_shapes, output_tensors) { + const std::vector output_tensors, + bool set_output_zero_copy) : + RunnerImpl(mod, inputs_info, output_shapes, output_tensors, set_output_zero_copy) { } void GERunnerImpl::set_input(Ort::KernelContext& context) { @@ -103,8 +109,15 @@ void GERunnerImpl::connect_output_tensors2ort(Ort::KernelContext& context) { add_device_type_data2output_tensors(context); } -void GERunnerImpl::run_and_get_output() { +void GERunnerImpl::set_output_zero_copy() { + tvm::TVMSetOutputsZeroCopy(*mod_, output_tensors_); +} + +void GERunnerImpl::run() { tvm::TVMRun(*mod_); +} + +void GERunnerImpl::get_outputs() { tvm::TVMGetOutputs(*mod_, output_tensors_); } @@ -113,7 +126,9 @@ void GERunnerImpl::run_and_get_output() { VMRunnerImpl::VMRunnerImpl(const std::shared_ptr& mod, const InputsInfoMap& inputs_info, const TVMTensorShapes output_shapes, - const std::vector output_tensors) : RunnerImpl(mod, inputs_info, output_shapes, output_tensors) { + const std::vector output_tensors, + bool set_output_zero_copy) : + RunnerImpl(mod, inputs_info, output_shapes, output_tensors, set_output_zero_copy) { } void VMRunnerImpl::set_input(Ort::KernelContext& context) { @@ -125,20 +140,28 @@ void VMRunnerImpl::set_input(Ort::KernelContext& context) { } void VMRunnerImpl::connect_output_tensors2ort(Ort::KernelContext& context) { - if (!probe_infer_) { + // TODO(vvchernov): try to find more flexible solution + if(!probe_infer_) { infer_once_to_get_output_shapes(); } add_device_type_data2output_tensors(context); } -void VMRunnerImpl::run_and_get_output() { +void VMRunnerImpl::set_output_zero_copy() { + tvm::TVM_VM_SetOutputsZeroCopy(*mod_, output_tensors_); +} + +void VMRunnerImpl::run() { tvm::TVM_VM_Run(*mod_); +} + +void VMRunnerImpl::get_outputs() { tvm::TVM_VM_GetOutputs(*mod_, output_tensors_); } void VMRunnerImpl::infer_once_to_get_output_shapes() { - tvm::TVM_VM_Run(*mod_); + run(); size_t num_outputs = output_tensors_.size(); // TODO(vvchernov): check it output_shapes_.resize(num_outputs); diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.h b/onnxruntime/core/providers/tvm/tvm_runner_impl.h index 6fcddddd8a0e0..ff04561d44887 100644 --- a/onnxruntime/core/providers/tvm/tvm_runner_impl.h +++ b/onnxruntime/core/providers/tvm/tvm_runner_impl.h @@ -24,22 +24,43 @@ class RunnerImpl { RunnerImpl(const std::shared_ptr& mod, const InputsInfoMap& inputs_info, const TVMTensorShapes output_shapes, - const std::vector tensors_outputs); + const std::vector tensors_outputs, + bool set_output_zero_copy); virtual ~RunnerImpl() = default; - virtual common::Status run(const OrtApi* /* api */, OrtKernelContext* 
context) { + virtual common::Status run(Ort::KernelContext& context) { + common::Status res; + if (set_output_zero_copy_) { + res = run_without_output_copying(context); + } else { + res = run_with_output_copying(context); + } + return res; + } + + virtual common::Status run_without_output_copying(Ort::KernelContext& context) { + set_input(context); + connect_output_tensors2ort(context); + set_output_zero_copy(); + run(); + + return Status::OK(); + } - Ort::KernelContext ctx{context}; - set_input(ctx); - connect_output_tensors2ort(ctx); - run_and_get_output(); + virtual common::Status run_with_output_copying(Ort::KernelContext& context) { + set_input(context); + connect_output_tensors2ort(context); + run(); + get_outputs(); return Status::OK(); } - virtual void set_input(Ort::KernelContext& ctx) = 0; + virtual void set_input(Ort::KernelContext& context) = 0; virtual void connect_output_tensors2ort(Ort::KernelContext& context) = 0; - virtual void run_and_get_output() = 0; + virtual void set_output_zero_copy() = 0; + virtual void run() = 0; + virtual void get_outputs() = 0; protected: void convert_input_tensors2dl_tensors(Ort::KernelContext& context, @@ -52,6 +73,7 @@ class RunnerImpl { InputsInfoMap inputs_info_; TVMTensorShapes output_shapes_; std::vector output_tensors_; + bool set_output_zero_copy_; }; @@ -61,12 +83,15 @@ class GERunnerImpl : public RunnerImpl { GERunnerImpl(const std::shared_ptr& mod, const InputsInfoMap& inputs_info, const TVMTensorShapes output_shapes, - const std::vector tensors_outputs); + const std::vector tensors_outputs, + bool set_output_zero_copy); virtual ~GERunnerImpl() = default; - void set_input(Ort::KernelContext& context) override final; - void connect_output_tensors2ort(Ort::KernelContext& context) override final; - void run_and_get_output() override final; + void set_input(Ort::KernelContext& context) final; + void connect_output_tensors2ort(Ort::KernelContext& context) final; + void set_output_zero_copy() final; + void run() final; + void get_outputs() final; }; @@ -76,12 +101,15 @@ class VMRunnerImpl : public RunnerImpl { VMRunnerImpl(const std::shared_ptr& mod, const InputsInfoMap& inputs_info, const TVMTensorShapes output_shapes, - const std::vector tensors_outputs); + const std::vector tensors_outputs, + bool set_output_zero_copy); virtual ~VMRunnerImpl() = default; - void set_input(Ort::KernelContext& context) override final; - void connect_output_tensors2ort(Ort::KernelContext& context) override final; - void run_and_get_output() override final; + void set_input(Ort::KernelContext& context) final; + void connect_output_tensors2ort(Ort::KernelContext& context) final; + void set_output_zero_copy() final; + void run() final; + void get_outputs() final; private: void infer_once_to_get_output_shapes(); From 10ab2529820e0e5ee179180855d253769656b447 Mon Sep 17 00:00:00 2001 From: Alex Kogan <82225080+sakogan@users.noreply.github.com> Date: Wed, 8 Feb 2023 13:02:54 -0500 Subject: [PATCH 31/68] Enable parallel output reordering in MlasReorderOutputNchw() (#13643) ### Description This PR speeds-up the output reordering operation (as implemented in [MlasReorderOutputNchw](https://github.com/microsoft/onnxruntime/blob/9954454c65086c49b7c00f83b23ada76975f3546/onnxruntime/core/mlas/lib/reorder.cpp#L400)) by replacing the sequential implementation with a parallelized one. 
The parallelization is achieved through the use of the existing [TryBatchParallelFor](https://github.com/microsoft/onnxruntime/blob/9954454c65086c49b7c00f83b23ada76975f3546/include/onnxruntime/core/platform/threadpool.h#L284) construct. ### Motivation and Context The output reordering operation is frequently executed in image processing models. Its implementation can be easily parallelized and therefore sped up when executed on a multi-core machine. The amount of speedup achieved by this PR varies and depends on the actual input. The table below summarizes the results of some of the experiments I have conducted on a 16-core VM running on an AMD EPYC 7742 64-core processor. The experiment is based on the existing [unit test](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/test/mlas/unittest/test_reorder_output.cpp) for the output reordering operation. The first column represents the shape of the output as BatchCount:Channels:Height:Width, and the numbers in the other columns represent the latency (in microseconds, averaged over 100 runs) for the tested variants. Specifically, I compare the (sequential) baseline (in the second column) with the (parallelized) variants, each using a number of worker threads equal to 1, 2, 4, 8, or 16 (as specified in [the constructor to the threadpool object](https://github.com/microsoft/onnxruntime/blob/9954454c65086c49b7c00f83b23ada76975f3546/onnxruntime/test/mlas/unittest/test_main.cpp#L12)). The numbers in parentheses represent the speedup over the baseline.

| Input | baseline | 1 Thread | 2 Threads | 4 Threads | 8 Threads | 16 Threads |
| --- | --- | --- | --- | --- | --- | --- |
| 1:1:112:112 | 20.8 | 21.5 (x0.97) | 21.9 (x0.95) | 22.2 (x0.94) | 22.5 (x0.92) | 23.0 (x0.90) |
| 1:128:160:84 | 540.4 | 712.5 (x0.76) | 404.0 (x1.34) | 327.8 (x1.65) | 377.9 (x1.43) | 371.8 (x1.45) |
| 13:240:4:314 | 1484.0 | 1851.1 (x0.80) | 1080.9 (x1.37) | 570.2 (x2.60) | 531.8 (x2.79) | 511.2 (x2.90) |
| 13:96:4:314 | 471.0 | 679.9 (x0.69) | 427.2 (x1.10) | 372.1 (x1.27) | 445.5 (x1.06) | 428.5 (x1.10) |
| 1:64:320:168 | 1215.1 | 1497.8 (x0.81) | 863.8 (x1.41) | 456.7 (x2.66) | 435.7 (x2.79) | 462.5 (x2.63) |
| 30:240:4:140 | 1711.5 | 2181.4 (x0.78) | 1182.6 (x1.45) | 657.4 (x2.60) | 592.5 (x2.89) | 578.0 (x2.96) |
| 30:336:4:140 | 2432.5 | 3039.2 (x0.80) | 1695.6 (x1.43) | 920.7 (x2.64) | 817.1 (x2.98) | 819.2 (x2.97) |

The initial drop between the baseline and the variant using just one worker thread can be attributed to the overhead of invoking the reordering loop as a functor in TryBatchParallelFor. This overhead is compensated by the speedup of parallel processing when the number of worker threads is increased.
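To make the scheduling concrete: the threaded kernel splits the `BatchCount * TasksPerBatch` reordering tasks into contiguous, near-equal slices, one per worker, and each worker then rebases its source and destination pointers to its slice before transposing its NCHWc blocks. The snippet below is a minimal, self-contained sketch of that slicing arithmetic; the names are illustrative and this is not the actual `MlasPartitionWork` helper used in the change.

```cpp
#include <cstddef>
#include <cstdio>

// Even work partitioning: worker `index` out of `thread_count` receives a
// contiguous [start, start + count) slice of `total_tasks`; the first
// (total_tasks % thread_count) workers take one extra task each.
void PartitionWork(std::size_t index, std::size_t thread_count, std::size_t total_tasks,
                   std::size_t* start, std::size_t* count) {
  const std::size_t per_thread = total_tasks / thread_count;
  const std::size_t remainder = total_tasks % thread_count;
  if (index < remainder) {
    *start = index * (per_thread + 1);
    *count = per_thread + 1;
  } else {
    *start = remainder * (per_thread + 1) + (index - remainder) * per_thread;
    *count = per_thread;
  }
}

int main() {
  // Example: 13 batches x 15 channel blocks = 195 reordering tasks over 8 workers.
  const std::size_t total_tasks = 13 * 15;
  const std::size_t threads = 8;
  for (std::size_t i = 0; i < threads; ++i) {
    std::size_t start = 0, count = 0;
    PartitionWork(i, threads, total_tasks, &start, &count);
    std::printf("worker %zu: tasks [%zu, %zu)\n", i, start, start + count);
  }
  return 0;
}
```

Handing the remainder out one task at a time keeps the imbalance between workers to at most one task.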
--- onnxruntime/contrib_ops/cpu/nchwc_ops.cc | 2 +- onnxruntime/core/mlas/inc/mlas.h | 3 +- onnxruntime/core/mlas/lib/reorder.cpp | 213 +++++++++++++----- .../test/mlas/unittest/test_conv2d_nchwc.h | 2 +- .../test/mlas/unittest/test_pool2d_nchwc.h | 2 +- .../mlas/unittest/test_reorder_output.cpp | 2 +- 6 files changed, 164 insertions(+), 60 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc index 7c4ee548ccaca..c16aaca5e71ea 100644 --- a/onnxruntime/contrib_ops/cpu/nchwc_ops.cc +++ b/onnxruntime/contrib_ops/cpu/nchwc_ops.cc @@ -144,7 +144,7 @@ Status ReorderOutput::Compute(OpKernelContext* context) const { if (channels_last_) { MlasReorderOutputNhwc(Y_shape.data(), x_data, y_data); } else { - MlasReorderOutputNchw(Y_shape.data(), x_data, y_data); + MlasReorderOutputNchw(Y_shape.data(), x_data, y_data, context->GetOperatorThreadPool()); } return Status::OK(); diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 5b6756e4fb90b..c1a4d16fd44fb 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1085,7 +1085,8 @@ MLASCALL MlasReorderOutputNchw( const int64_t* OutputShape, const float* S, - float* D + float* D, + MLAS_THREADPOOL* ThreadPool ); void diff --git a/onnxruntime/core/mlas/lib/reorder.cpp b/onnxruntime/core/mlas/lib/reorder.cpp index 0d7fbd97a4a6f..99c1dbac3b692 100644 --- a/onnxruntime/core/mlas/lib/reorder.cpp +++ b/onnxruntime/core/mlas/lib/reorder.cpp @@ -1,6 +1,7 @@ /*++ Copyright (c) Microsoft Corporation. All rights reserved. +Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved. Licensed under the MIT License. @@ -17,6 +18,20 @@ Module Name: #include "mlasi.h" +// +// Define the parameters to execute segments of a NCHW output reordering +// operation on worker threads. +// + +struct MLAS_REORDER_OUTPUT_NCHW_BLOCK { + ptrdiff_t TargetThreadCount; + const float* S; + float* D; + size_t OutputChannels; + size_t OutputSize; + size_t TasksCount; +}; + MLAS_FORCEINLINE void MlasReorderGatherFloat32x4( @@ -396,25 +411,22 @@ Return Value: } void -MLASCALL -MlasReorderOutputNchw( - const int64_t* OutputShape, - const float* S, - float* D +MlasReorderOutputNchwThreaded( + void* Context, + ptrdiff_t Index ) /*++ Routine Description: - This routine reorders an output buffer from NCHWc to NCHW format. + This routine is invoked from a worker thread to execute a segment of a + NCHW output reordering operation. Arguments: - OutputShape - Supplies the shape of the output tensor. - - S - Supplies the address of the source tensor. + Context - Supplies the pointer to the context for the threaded operation. - D - Supplies the address of the destination tensor. + Index - Supplies the current index of the threaded operation. 
Return Value: @@ -422,77 +434,168 @@ Return Value: --*/ { + const auto* WorkBlock = (MLAS_REORDER_OUTPUT_NCHW_BLOCK*)Context; + + const size_t OutputChannels = WorkBlock->OutputChannels; + const size_t OutputSize = WorkBlock->OutputSize; + const float* S = WorkBlock->S; + float* D = WorkBlock->D; + const size_t BlockSize = MlasNchwcGetBlockSize(); + const size_t TasksPerBatch = size_t(ceil(((float)OutputChannels) / BlockSize)); + const size_t LastTaskInBatchIndex = TasksPerBatch - 1; - const size_t BatchCount = size_t(OutputShape[0]); - const size_t OutputChannels = size_t(OutputShape[1]); - const size_t OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]); + // + // Compute the range of task indices to use for this thread. + // + + size_t TaskStart; + size_t TasksRemaining; + MlasPartitionWork(Index, WorkBlock->TargetThreadCount, WorkBlock->TasksCount, + &TaskStart, &TasksRemaining); + + size_t TaskEnd = TaskStart + TasksRemaining; + // - // Transpose NCHWc blocks from the source buffer to the destination buffer. + // Rebase the pointers to the source and destination buffers for this thread. // - for (size_t batch = 0; batch < BatchCount; batch++) { + size_t FirstBatchIndex = TaskStart / TasksPerBatch; + size_t FirstTaskInBatchIndex = TaskStart % TasksPerBatch; + S += BlockSize * OutputSize * (FirstBatchIndex * TasksPerBatch + FirstTaskInBatchIndex); + D += OutputSize * (FirstBatchIndex * OutputChannels + BlockSize * FirstTaskInBatchIndex); - for (size_t o = OutputChannels; o > 0;) { + // + // Transpose NCHWc blocks associated with tasks in the range [TaskStart, TaskEnd) + // from the source buffer to the destination buffer. + // - const size_t OutputChannelsThisIteration = std::min(o, BlockSize); - const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); - o -= OutputChannelsThisIteration; + for (size_t t = TaskStart; t < TaskEnd; t++) { + size_t TaskInBatchIndex = t % TasksPerBatch; - const float* s = S; - float* d = D; - size_t OutputSizeRemaining = OutputSize; + const size_t OutputChannelsThisIteration = (TaskInBatchIndex < LastTaskInBatchIndex) ? 
+ BlockSize : OutputChannels - BlockSize * LastTaskInBatchIndex; + const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); - for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) { + const float* s = S; + float* d = D; + size_t OutputSizeRemaining = OutputSize; - const float* ss = s; - float* dd = d; - size_t bc = 0; + for (; OutputSizeRemaining >= 4; OutputSizeRemaining -= 4) { - for (; bc < AlignedOutputChannelsThisIteration; bc += 4) { - MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize); - ss += 4; - dd += 4 * OutputSize; - } + const float* ss = s; + float* dd = d; + size_t bc = 0; - for (; bc < OutputChannelsThisIteration; bc += 1) { - MlasReorderGatherFloat32x4(ss, dd, BlockSize); - ss += 1; - dd += OutputSize; - } + for (; bc < AlignedOutputChannelsThisIteration; bc += 4) { + MlasReorderTransposeFloat32x4x4(ss, dd, BlockSize, OutputSize); + ss += 4; + dd += 4 * OutputSize; + } - s += 4 * BlockSize; - d += 4; + for (; bc < OutputChannelsThisIteration; bc += 1) { + MlasReorderGatherFloat32x4(ss, dd, BlockSize); + ss += 1; + dd += OutputSize; } - for (; OutputSizeRemaining > 0; OutputSizeRemaining--) { + s += 4 * BlockSize; + d += 4; + } - const float* ss = s; - float* dd = d; - size_t bc = 0; + for (; OutputSizeRemaining > 0; OutputSizeRemaining--) { - for (; bc < AlignedOutputChannelsThisIteration; bc += 4) { - MlasReorderScatterFloat32x4(ss, dd, OutputSize); - ss += 4; - dd += 4 * OutputSize; - } + const float* ss = s; + float* dd = d; + size_t bc = 0; - for (; bc < OutputChannelsThisIteration; bc += 1) { - *dd = *ss++; - dd += OutputSize; - } + for (; bc < AlignedOutputChannelsThisIteration; bc += 4) { + MlasReorderScatterFloat32x4(ss, dd, OutputSize); + ss += 4; + dd += 4 * OutputSize; + } - s += BlockSize; - d += 1; + for (; bc < OutputChannelsThisIteration; bc += 1) { + *dd = *ss++; + dd += OutputSize; } - S += BlockSize * OutputSize; - D += OutputChannelsThisIteration * OutputSize; + s += BlockSize; + d += 1; } + + S += BlockSize * OutputSize; + D += OutputChannelsThisIteration * OutputSize; } } + +void +MLASCALL +MlasReorderOutputNchw( + const int64_t* OutputShape, + const float* S, + float* D, + MLAS_THREADPOOL* ThreadPool + ) +/*++ + +Routine Description: + + This routine reorders an output buffer from NCHWc to NCHW format. + +Arguments: + + OutputShape - Supplies the shape of the output tensor. + + S - Supplies the address of the source tensor. + + D - Supplies the address of the destination tensor. + +Return Value: + + None. + +--*/ +{ + MLAS_REORDER_OUTPUT_NCHW_BLOCK WorkBlock; + + // + // Capture the NCHW reorder output operation parameters to the work block. + // + + WorkBlock.S = S; + WorkBlock.D = D; + + WorkBlock.OutputChannels = size_t(OutputShape[1]); + WorkBlock.OutputSize = size_t(OutputShape[2]) * size_t(OutputShape[3]); + + const size_t BlockSize = MlasNchwcGetBlockSize(); + const size_t TasksPerBatch = size_t(ceil(((float)WorkBlock.OutputChannels) / BlockSize)); + const size_t BatchCount = size_t(OutputShape[0]); + const size_t TasksCount = BatchCount * TasksPerBatch; + WorkBlock.TasksCount = TasksCount; + + // + // Schedule the operation across a set of worker threads if the output + // tensor is sufficienly large. Limit the number of threads to at least + // the number of available tasks. 
+ // + + ptrdiff_t TargetThreadCount = 1; + const size_t BufferSize = BatchCount * WorkBlock.OutputChannels * WorkBlock.OutputSize; + if (BufferSize > 1024 && TasksCount > 1) { + TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); + if (size_t(TargetThreadCount) > TasksCount) { + TargetThreadCount = ptrdiff_t(TasksCount); + } + } + WorkBlock.TargetThreadCount = TargetThreadCount; + + MlasExecuteThreaded(MlasReorderOutputNchwThreaded, &WorkBlock, TargetThreadCount, ThreadPool); +} + void MLASCALL MlasReorderOutputNhwc( diff --git a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h index e516fa8a0b698..c125720668381 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h +++ b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.h @@ -137,7 +137,7 @@ class MlasNchwcConv2DTest : public MlasConv2DTest { // Reorder the output buffer. // - MlasReorderOutputNchw(OutputShape, NchwcOutput, Output); + MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, MlasConv2DTest::threadpool_); } const size_t BlockSize = MlasNchwcGetBlockSize(); diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h index 10e3f7f927aba..38ac63a68c843 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h +++ b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.h @@ -49,7 +49,7 @@ class MlasNchwcPool2DTest : public MlasPool2DTest { NchwcOutput, nullptr); - MlasReorderOutputNchw(OutputShape, NchwcOutput, Output); + MlasReorderOutputNchw(OutputShape, NchwcOutput, Output, nullptr); } MatrixGuardBuffer BufferNchwcInput; diff --git a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp index a87d2a0fa5721..704333fd27fa0 100644 --- a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp +++ b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp @@ -27,7 +27,7 @@ class MlasReorderOutputTest : public MlasTestBase { std::fill_n(Output, OutputBufferElements, -0.5f); std::fill_n(OutputReference, OutputBufferElements, -0.5f); - MlasReorderOutputNchw(NchwOutputShape, Input, Output); + MlasReorderOutputNchw(NchwOutputShape, Input, Output, GetMlasThreadPool()); ReferenceReorderOutput(BatchCount, Channels, Height, Width, Input, OutputReference, false); ASSERT_EQ(memcmp(Output, OutputReference, OutputBufferElements * sizeof(float)), 0) << " [Nchw] batch=" << BatchCount << ", channels=" << Channels From 767619cf3b823f7ae803cd5307efd4fb7cfa9546 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 8 Feb 2023 10:05:53 -0800 Subject: [PATCH 32/68] Rework C API to remove new/delete warnings (#14572) ### Description Re-work code so it does not require GSL_SUPPRESS ### Motivation and Context Do things right. 
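The idiom behind the rework is worth spelling out: construct the object into a `std::unique_ptr`, do everything that might throw while RAII still owns it, and only `release()` into the caller-visible out parameter once no further exception is possible; for arrays of outputs, build all the owning pointers first and hand the raw pointers out in a second, non-throwing loop. The sketch below illustrates the pattern with a made-up `Widget`/`ApiStatus` API rather than the actual ORT types.

```cpp
#include <cstddef>
#include <cstdio>
#include <memory>
#include <vector>

struct Widget { int value = 0; };      // stand-in for an API object, not an ORT type
enum class ApiStatus { kOk, kFail };

// Single object: if anything throws before release(), the unique_ptr cleans up,
// so the caller only ever receives a fully constructed object or an error.
ApiStatus CreateWidget(int value, Widget** out) {
  auto w = std::make_unique<Widget>();
  w->value = value;                    // any throwing work happens while RAII owns it
  *out = w.release();                  // ownership transfers only at the very end
  return ApiStatus::kOk;
}

// Multiple objects: allocate everything into owning pointers first, then hand the
// raw pointers out in a loop that cannot throw, so a late allocation failure
// cannot leak the objects created earlier.
ApiStatus CreateWidgets(const std::vector<int>& values, Widget** out) {
  std::vector<std::unique_ptr<Widget>> owned;
  owned.reserve(values.size());
  for (int v : values) {
    auto w = std::make_unique<Widget>();
    w->value = v;
    owned.push_back(std::move(w));
  }
  for (std::size_t i = 0; i < owned.size(); ++i) {
    out[i] = owned[i].release();       // noexcept: plain pointer assignments
  }
  return ApiStatus::kOk;
}

// Matching destruction entry point for the C-style API.
void ReleaseWidget(Widget* w) { delete w; }

int main() {
  Widget* single = nullptr;
  CreateWidget(7, &single);
  Widget* many[3] = {};
  CreateWidgets({1, 2, 3}, many);
  std::printf("%d %d %d %d\n", single->value, many[0]->value, many[1]->value, many[2]->value);
  ReleaseWidget(single);
  for (Widget* w : many) ReleaseWidget(w);
  return 0;
}
```

This is the same two-phase shape the patch applies to the `Run` and `GetBoundOutputValues` outputs, where the `OrtValue` copy constructors run before any raw pointer escapes.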
--- onnxruntime/core/session/onnxruntime_c_api.cc | 100 ++++++++++-------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index c8e1fad05a90b..4bcf6e9a5a072 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -824,8 +824,10 @@ ORT_API_STATUS_IMPL(OrtApis::Run, _Inout_ OrtSession* sess, _In_opt_ const OrtRu API_IMPL_BEGIN auto session = reinterpret_cast<::onnxruntime::InferenceSession*>(sess); - std::vector feed_names(input_len); - std::vector feeds(input_len); + InlinedVector feed_names; + feed_names.reserve(input_len); + InlinedVector feeds; + feeds.reserve(input_len); for (size_t i = 0; i != input_len; ++i) { if (input_names[i] == nullptr || input_names[i][0] == '\0') { @@ -833,31 +835,34 @@ ORT_API_STATUS_IMPL(OrtApis::Run, _Inout_ OrtSession* sess, _In_opt_ const OrtRu } if (!input[i]) { - std::ostringstream ostr; - ostr << "NULL input supplied for input " << input_names[i]; - return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, ostr.str().c_str()); + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, + MakeString("NULL input supplied for input ", input_names[i]).c_str()); } - feed_names[i] = input_names[i]; - feeds[i] = *reinterpret_cast(input[i]); + feed_names.emplace_back(input_names[i]); + feeds.emplace_back(*input[i]); } // Create output feed - std::vector output_names(output_names_len); + InlinedVector output_names; + output_names.reserve(output_names_len); for (size_t i = 0; i != output_names_len; ++i) { if (output_names1[i] == nullptr || output_names1[i][0] == '\0') { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "output name cannot be empty"); } - output_names[i] = output_names1[i]; + output_names.emplace_back(output_names1[i]); } - std::vector fetches(output_names_len); + std::vector fetches; + fetches.reserve(output_names_len); for (size_t i = 0; i != output_names_len; ++i) { if (output[i] != nullptr) { - ::OrtValue& value = *(output[i]); - fetches[i] = value; + fetches.emplace_back(*output[i]); + } else { + fetches.emplace_back(); } } + Status status; if (run_options == nullptr) { OrtRunOptions op; @@ -868,11 +873,24 @@ ORT_API_STATUS_IMPL(OrtApis::Run, _Inout_ OrtSession* sess, _In_opt_ const OrtRu if (!status.IsOK()) return ToOrtStatus(status); + + // We do it in two loops to make sure copy __ctors does not throw + InlinedVector> output_unique_ptrs; + output_unique_ptrs.reserve(output_names_len); for (size_t i = 0; i != output_names_len; ++i) { - ::OrtValue& value = fetches[i]; if (output[i] == nullptr) { - GSL_SUPPRESS(r .11) - output[i] = new OrtValue(value); + output_unique_ptrs.emplace_back(std::make_unique(fetches[i])); + } else { + output_unique_ptrs.emplace_back(); + } + } + + assert(output_unique_ptrs.size() == output_names_len); + + for (size_t i = 0; i != output_names_len; ++i) { + if (output[i] == nullptr) { + assert(output_unique_ptrs[i] != nullptr); + output[i] = output_unique_ptrs[i].release(); } } return nullptr; @@ -912,8 +930,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateIoBinding, _Inout_ OrtSession* sess, _Outptr_ if (!status.IsOK()) { return ToOrtStatus(status); } - GSL_SUPPRESS(r .11) - *out = new OrtIoBinding(std::move(binding)); + *out = std::make_unique(std::move(binding)).release(); return nullptr; API_IMPL_END } @@ -1010,34 +1027,27 @@ ORT_API_STATUS_IMPL(OrtApis::GetBoundOutputValues, _In_ const OrtIoBinding* bind } // Used to destroy and de-allocate on exception - size_t created 
= 0; IAllocatorUniquePtr ortvalues_alloc(reinterpret_cast(allocator->Alloc(allocator, outputs.size() * sizeof(OrtValue*))), - [&created, allocator](OrtValue** buffer) { - if (buffer) { - while (created > 0) { - auto p = buffer + --created; - delete (*p); - } - allocator->Free(allocator, buffer); - } - }); - + [allocator](OrtValue** p) { if (p) allocator->Free(allocator, p);}); if (!ortvalues_alloc) { return OrtApis::CreateStatus(ORT_FAIL, "Output buffer allocation failed"); } - OrtValue** out_ptr = ortvalues_alloc.get(); + InlinedVector> value_dups; + value_dups.reserve(outputs.size()); + for (const auto& out_value : outputs) { - GSL_SUPPRESS(r .11) - *out_ptr = new OrtValue(out_value); - ++out_ptr; - ++created; + value_dups.push_back(std::make_unique(out_value)); } - assert(created == outputs.size()); - + // The rest is noexcept + OrtValue** out_ptr = ortvalues_alloc.get(); + for (auto& v : value_dups) { + *out_ptr++ = v.release(); + } + *output = ortvalues_alloc.release(); - *output_count = created; + *output_count = outputs.size(); return nullptr; API_IMPL_END } @@ -1369,8 +1379,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionGetModelMetadata, _In_ const OrtSession* ses auto p = session->GetModelMetadata(); if (!p.first.IsOK()) return ToOrtStatus(p.first); - GSL_SUPPRESS(r .11) - *out = reinterpret_cast(new ModelMetadata(*p.second)); + *out = reinterpret_cast(std::make_unique(*p.second).release()); return nullptr; API_IMPL_END } @@ -2214,12 +2223,12 @@ ORT_API_STATUS_IMPL(OrtApis::SessionGetProfilingStartTimeNs, _In_ const OrtSessi ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfg, _In_ size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk, _Outptr_ OrtArenaCfg** out) { API_IMPL_BEGIN - GSL_SUPPRESS(r .11) - *out = new OrtArenaCfg(); - (*out)->max_mem = max_mem; - (*out)->arena_extend_strategy = arena_extend_strategy; - (*out)->initial_chunk_size_bytes = initial_chunk_size_bytes; - (*out)->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk; + auto cfg = std::make_unique(); + cfg->max_mem = max_mem; + cfg->arena_extend_strategy = arena_extend_strategy; + cfg->initial_chunk_size_bytes = initial_chunk_size_bytes; + cfg->max_dead_bytes_per_chunk = max_dead_bytes_per_chunk; + *out = cfg.release(); return nullptr; API_IMPL_END } @@ -2254,9 +2263,8 @@ ORT_API_STATUS_IMPL(OrtApis::CreateArenaCfgV2, _In_reads_(num_keys) const char* } // Allow using raw new/delete because this is for C. -GSL_SUPPRESS(r .11) ORT_API(void, OrtApis::ReleaseArenaCfg, _Frees_ptr_opt_ OrtArenaCfg* ptr) { - delete ptr; + std::unique_ptr g(ptr); } ORT_API_STATUS_IMPL(OrtApis::CreatePrepackedWeightsContainer, _Outptr_ OrtPrepackedWeightsContainer** out) { From 0a6b22018fd52d927c745cfa1d51d9c7db629f69 Mon Sep 17 00:00:00 2001 From: Kevin Chen <45886021+kevinch-nv@users.noreply.github.com> Date: Wed, 8 Feb 2023 10:19:55 -0800 Subject: [PATCH 33/68] Move TRT include_directories to outside scope (#14622) Signed-off-by: Kevin Chen ### Description Previously `include_directories(${TENSORRT_INCLUDE_DIR})` was only done if `onnxruntime_USE_TENSORRT_BUILTIN_PARSER` was false. This would cause a build failure when the switch was true as the include directory was not added. ### Motivation and Context Fixes TRT build when `onnxruntime_USE_TENSORRT_BUILTIN_PARSER` is true. 
--------- Signed-off-by: Kevin Chen --- cmake/onnxruntime_providers.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 33d13f4476b51..84e429db302c1 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -689,10 +689,11 @@ if (onnxruntime_USE_TENSORRT) target_compile_options(nvonnxparser_static PRIVATE /FIio.h /wd4100) target_compile_options(nvonnxparser PRIVATE /FIio.h /wd4100) endif() - include_directories(${TENSORRT_INCLUDE_DIR}) set(onnxparser_link_libs nvonnxparser_static) endif() + include_directories(${TENSORRT_INCLUDE_DIR}) + set(trt_link_libs cudnn cublas ${CMAKE_DL_LIBS} ${TENSORRT_LIBRARY}) file(GLOB_RECURSE onnxruntime_providers_tensorrt_cc_srcs CONFIGURE_DEPENDS @@ -710,11 +711,10 @@ if (onnxruntime_USE_TENSORRT) add_dependencies(onnxruntime_providers_tensorrt onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) if (onnxruntime_USE_TENSORRT_BUILTIN_PARSER) target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface ${ABSEIL_LIBS}) - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) else() target_link_libraries(onnxruntime_providers_tensorrt PRIVATE ${onnxparser_link_libs} ${trt_link_libs} cudart ${ONNXRUNTIME_PROVIDERS_SHARED} ${PROTOBUF_LIB} flatbuffers::flatbuffers ${ABSEIL_LIBS}) - target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) endif() + target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) if(onnxruntime_CUDNN_HOME) target_include_directories(onnxruntime_providers_tensorrt PRIVATE ${onnxruntime_CUDNN_HOME}/include) endif() From cfda876a3f0f79cc88e57996034acca19d81d30b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 8 Feb 2023 12:18:17 -0800 Subject: [PATCH 34/68] Remove torch package from requirements.txt of stable diffusion models (#14630) ### Description Remove torch package from requirements to unblock nuget windowsai pipeline which does not allow --extra-index-url ### Motivation and Context --- .../transformers/models/stable_diffusion/requirements.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt index 8b57df8852765..45190f2fb9912 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt @@ -10,5 +10,6 @@ packaging==23.0 protobuf==3.20.3 psutil==5.9.4 sympy==1.11.1 ---extra-index-url https://download.pytorch.org/whl/cu117 -torch==1.13.1+cu117 +#Tested with PyTorch 1.13.1+cu117 (see pytorch.org for more download options). 
+#--extra-index-url https://download.pytorch.org/whl/cu117 +#torch==1.13.1+cu117 From 30ec8b038f69cdafa59ec851258be31b8c4448c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Wed, 8 Feb 2023 23:11:31 +0100 Subject: [PATCH 35/68] Test and fix optimizers LayerNormFusion, BiasSoftmaxFusion, Transpose for opset 18 (#14542) ### Description Due to the changes introduced in opset 18 on Reduce operators (axes is an input and not an attribute), the following optimizers are not catching the pattern they are supposed to optimize. This PR addresses that. * layer_norm_fusion.cc: the optimizer was not detecting the pattern it was supposed to optimize * bias_softmax_fusion.cc: the optimizer was not detecting the pattern it was supposed to optimize * transpose_optimizer.cc: the optimizer did not optimize Reduce operators other than ReduceSum ### Motivation and Context Better performance. --------- Signed-off-by: xadupre --- .../core/optimizer/bias_softmax_fusion.cc | 1 + .../core/optimizer/layer_norm_fusion.cc | 35 +- .../transpose_optimizer.cc | 16 +- .../test/optimizer/graph_transform_test.cc | 31 +- .../optimizer/graph_transform_test_builder.cc | 67 ++- .../optimizer/graph_transform_test_builder.h | 33 ++ .../test/optimizer/nhwc_transformer_test.cc | 16 + onnxruntime/test/optimizer/qdq_test_utils.h | 6 + .../test/optimizer/qdq_transformer_test.cc | 186 +++++++- .../optimizer/transpose_optimizer_test.cc | 442 +++++++++++------- 10 files changed, 605 insertions(+), 228 deletions(-) diff --git a/onnxruntime/core/optimizer/bias_softmax_fusion.cc b/onnxruntime/core/optimizer/bias_softmax_fusion.cc index 80603cdbd3270..7c34449d583cc 100755 --- a/onnxruntime/core/optimizer/bias_softmax_fusion.cc +++ b/onnxruntime/core/optimizer/bias_softmax_fusion.cc @@ -135,6 +135,7 @@ bool TrySelectInputAndBiasWithAlignment(Node& add_node, Node& softmax_node, Node new_axis = (int)HandleNegativeAxis(axis, rank); // The axis attribute for Softmax in OpSet-11 and OpSet-13 are different. + // Details in function documentatin. 
if (is_since_opset_13 && new_axis != rank - 1) return false; int singlebatch_rank = rank - new_axis; diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index 9895918dd2653..25feb5b8d702c 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -4,6 +4,7 @@ #include "core/optimizer/layer_norm_fusion.h" #include "core/graph/graph_utils.h" #include "core/optimizer/utils.h" +#include "core/optimizer/transpose_optimizer/optimizer_api.h" #include "float.h" #include @@ -16,12 +17,17 @@ static constexpr std::array supported_data_types{"tensor(fl // Default epsilon static constexpr float DEFAULT_LAYERNORM_EPSILON = 1e-5f; -static bool IsSupportedDataType(const Node& node) { +static bool IsSupportedDataType(const Node& node, int first_n_inputs=-1) { + int input_index = 0; for (const auto& input_arg : node.InputDefs()) { + if (first_n_inputs != -1 && input_index >= first_n_inputs) { + return true; + } if (std::find(supported_data_types.begin(), supported_data_types.end(), *(input_arg->Type())) == supported_data_types.end()) { return false; } + ++input_index; } return true; } @@ -99,11 +105,11 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, Node& reduce_mean_node = *p_reduce_mean; ORT_RETURN_IF_ERROR(Recurse(reduce_mean_node, modified, graph_level, logger)); - if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13}) || + if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13, 18}) || !graph_utils::IsSupportedProvider(reduce_mean_node, GetCompatibleExecutionProviders()) || (reduce_mean_node.GetOutputEdgesCount() != 1 && reduce_mean_node.GetOutputEdgesCount() != 2) || graph.NodeProducesGraphOutput(reduce_mean_node) || - !IsSupportedDataType(reduce_mean_node)) { + !IsSupportedDataType(reduce_mean_node, 1)) { continue; } nodes_to_remove.push_back(reduce_mean_node); @@ -263,10 +269,10 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, continue; } Node& reduce_mean2_node = *graph.GetNode(p_reduce_mean2->Index()); - if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean2_node, "ReduceMean", {1, 11, 13}) || + if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean2_node, "ReduceMean", {1, 11, 13, 18}) || reduce_mean2_node.GetExecutionProviderType() != reduce_mean_node.GetExecutionProviderType() || !optimizer_utils::CheckOutputEdges(graph, reduce_mean2_node, 1) || - !IsSupportedDataType(reduce_mean2_node) || + !IsSupportedDataType(reduce_mean2_node, 1) || reduce_mean2_node.GetInputEdgesCount() == 0) { continue; } @@ -333,8 +339,16 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, // get axes attributes const onnxruntime::NodeAttributes& attributes = reduce_mean_node.GetAttributes(); std::vector axes_values; + // TODO: modify this codes when opset >= 18 (axes is an input). 
if (attributes.find("axes") != attributes.end()) { axes_values = RetrieveValues(attributes.at("axes")); + } else if (reduce_mean_node.InputDefs().size() == 2) { + auto axes = reduce_mean_node.InputDefs()[1]; + auto axes_const = graph.GetConstantInitializer(axes->Name(), true); + if (axes_const != nullptr) { + Initializer initializer{*axes_const, graph.ModelPath()}; + axes_values.insert(axes_values.end(), initializer.DataAsSpan().begin(), initializer.DataAsSpan().end()); + } } // Get the inputs for the new LayerNormalization node. @@ -485,9 +499,9 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr continue; } Node& reduce_mean_node = *graph.GetNode(p_reduce_mean->Index()); - if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13}) || + if (!graph_utils::IsSupportedOptypeVersionAndDomain(reduce_mean_node, "ReduceMean", {1, 11, 13, 18}) || reduce_mean_node.GetExecutionProviderType() != pow_node.GetExecutionProviderType() || - !optimizer_utils::CheckOutputEdges(graph, reduce_mean_node, 1) || !IsSupportedDataType(reduce_mean_node) || + !optimizer_utils::CheckOutputEdges(graph, reduce_mean_node, 1) || !IsSupportedDataType(reduce_mean_node, 1) || reduce_mean_node.GetInputEdgesCount() == 0) { continue; } @@ -585,6 +599,13 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr std::vector axes_values; if (attributes.find("axes") != attributes.end()) { axes_values = RetrieveValues(attributes.at("axes")); + } else if (reduce_mean_node.InputDefs().size() == 2) { + auto axes = reduce_mean_node.InputDefs()[1]; + auto axes_const = graph.GetConstantInitializer(axes->Name(), true); + if (axes_const != nullptr && axes_const->data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT64) { + Initializer initializer{*axes_const, graph.ModelPath()}; + axes_values.insert(axes_values.end(), initializer.DataAsSpan().begin(), initializer.DataAsSpan().end()); + } } // Get the inputs for the new LayerNormalization node. 
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc index 0ac7cbb8fa058..700c91ab85974 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc @@ -1040,7 +1040,7 @@ static bool HandlePad(HandlerArgs& args) { constexpr HandlerInfo pad_handler = {&FirstInput, &HandlePad}; -static bool HandleReduceOp(HandlerArgs& args) { +static bool HandleReduceOpWithArg(HandlerArgs& args) { int64_t keepdims = args.node.GetAttributeIntDefault("keepdims", 1); std::optional> axes = args.node.GetAttributeInts("axes"); @@ -1078,11 +1078,11 @@ static bool HandleReduceOp(HandlerArgs& args) { return true; } -constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOp}; - -static bool HandleReduceSum(HandlerArgs& args) { - if (args.ctx.opset < 13) { - return HandleReduceOp(args); +static bool HandleReduceOps(HandlerArgs& args) { + if ((args.node.OpType() == "ReduceSum" && args.ctx.opset < 13) || + // or all other reduce operators since opset 18 + (args.node.OpType() != "ReduceSum" && args.ctx.opset < 18)) { + return HandleReduceOpWithArg(args); } bool keepdims = args.node.GetAttributeIntDefault("keepdims", 1) != 0; @@ -1147,7 +1147,7 @@ static bool HandleReduceSum(HandlerArgs& args) { return true; } -constexpr HandlerInfo reduce_sum_handler = {&FirstInput, &HandleReduceSum}; +constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOps}; static bool HandleSqueeze(HandlerArgs& args) { std::vector new_axes; @@ -1709,7 +1709,7 @@ static const std::unordered_map handler_ma #if !defined(USE_CUDA) && !defined(USE_ROCM) {"Resize", resize_handler}, #endif - {"ReduceSum", reduce_sum_handler}, + {"ReduceSum", reduce_op_handler}, {"ReduceLogSum", reduce_op_handler}, {"ReduceLogSumExp", reduce_op_handler}, diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index fde8392d943cd..9df487726ed8b 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -95,7 +95,6 @@ namespace onnxruntime { namespace test { #define MODEL_FOLDER ORT_TSTR("testdata/transform/") - TEST_F(GraphTransformationTests, IdentityElimination) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "abs-id-max.onnx"; std::shared_ptr model; @@ -4390,11 +4389,12 @@ TEST_F(GraphTransformationTests, ReshapeFusionOpsetTest) { return Status::OK(); }; - const std::vector opsets{11, 12, 13, 14, 15, 15}; + const std::vector opsets{11, 12, 13, 14, 15, 18}; bool shape_test_for_opset15 = false; - for (auto& opset_version : opsets) { + for (auto& opset : opsets) { auto build_test_case = [&](ModelTestBuilder& builder) { + auto opset_version = builder.DomainToVersionMap().find(kOnnxDomain)->second; auto* input_arg0 = builder.MakeInput({{batch_size, seq_lenth, hidden_size}}); auto* input_arg1 = builder.MakeInput({{hidden_size}}); auto* scalar_int_0 = builder.MakeInitializer({}, {0}); @@ -4414,7 +4414,7 @@ TEST_F(GraphTransformationTests, ReshapeFusionOpsetTest) { auto* out = builder.MakeOutput(); builder.AddNode("Add", {input_arg0, input_arg1}, {add_out}); - if (opset_version == 15) { + if (opset_version >= 15) { if (shape_test_for_opset15) { auto& shape_1 = builder.AddNode("Shape", {add_out}, {shape_out}); shape_1.AddAttribute("start", (int64_t)1); @@ -4442,11 +4442,11 @@ TEST_F(GraphTransformationTests, 
ReshapeFusionOpsetTest) { }; std::unique_ptr transformer = std::make_unique(); - if (opset_version == 15 && shape_test_for_opset15) { - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset_version, *logger_, std::move(transformer), TransformerLevel::Level1, 1, + if (opset >= 15 && shape_test_for_opset15) { + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, pre_graph_checker)); } else { - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset_version, *logger_, std::move(transformer), TransformerLevel::Level1, 1, + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, opset, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } } @@ -4610,13 +4610,24 @@ TEST_F(GraphTransformationTests, LayerNormWithCastFusionTest_5) { auto* cast_out_2 = builder.MakeIntermediate(); auto* mul_out = builder.MakeIntermediate(); auto* add_out_2 = builder.MakeOutput(); + auto opset = builder.DomainToVersionMap().find(kOnnxDomain)->second; + onnxruntime::NodeArg* axes = nullptr; - builder.AddNode("ReduceMean", {data_arg}, {reduce_mean_out_1}).AddAttribute("axes", std::vector{-1}); + if (opset >= 18) { + axes = builder.MakeInitializer({1}, {-1}); + builder.AddNode("ReduceMean", {data_arg, axes}, {reduce_mean_out_1}); + } else { + builder.AddNode("ReduceMean", {data_arg}, {reduce_mean_out_1}).AddAttribute("axes", std::vector{-1}); + } builder.AddNode("Sub", {data_arg, reduce_mean_out_1}, {sub_out}); builder.AddNode("Cast", {sub_out}, {cast_out_1}) .AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT)); builder.AddNode("Pow", {cast_out_1, pow_initializer}, {pow_out}); - builder.AddNode("ReduceMean", {pow_out}, {reduce_mean_out_2}).AddAttribute("axes", std::vector{-1}); + if (opset >= 18) { + builder.AddNode("ReduceMean", {pow_out, axes}, {reduce_mean_out_2}); + } else { + builder.AddNode("ReduceMean", {pow_out}, {reduce_mean_out_2}).AddAttribute("axes", std::vector{-1}); + } builder.AddNode("Add", {reduce_mean_out_2, add_initializer}, {add_out_1}); builder.AddNode("Sqrt", {add_out_1}, {sqrt_out}); builder.AddNode("Div", {cast_out_1, sqrt_out}, {div_out}); @@ -4652,7 +4663,7 @@ TEST_F(GraphTransformationTests, LayerNormWithCastFusionTest_5) { }; std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger_, std::move(transformer), TransformerLevel::Level1, + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, {14, 18}, *logger_, std::move(transformer), TransformerLevel::Level1, 1, pre_graph_checker, post_graph_checker)); } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 274b9184e037a..80f17fdda3936 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -17,6 +17,31 @@ namespace onnxruntime { namespace test { +void TransformerTester(const std::function& build_test_case, + const std::function& check_transformed_graph, + TransformerLevel baseline_level, + TransformerLevel target_level, + const std::vector& opset_versions, + double per_sample_tolerance, + double relative_per_sample_tolerance, + std::unique_ptr transformer, + const std::function& add_session_options, + const InlinedHashSet& disabled_optimizers) { + ASSERT_TRUE(transformer == nullptr); + for (auto opset_version : opset_versions) { + 
TransformerTester(build_test_case, + check_transformed_graph, + baseline_level, + target_level, + opset_version, + per_sample_tolerance, + relative_per_sample_tolerance, + nullptr, + add_session_options, + disabled_optimizers); + } +} + void TransformerTester(const std::function& build_test_case, const std::function& check_transformed_graph, TransformerLevel baseline_level, @@ -101,22 +126,36 @@ Status TestGraphTransformer(const std::function& const logging::Logger& logger, std::unique_ptr transformer, TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, const std::function& post_graph_checker) { - // Build the model for this test. - std::unordered_map domain_to_version; - domain_to_version[kOnnxDomain] = opset_version; - domain_to_version[kMSDomain] = 1; - Model model("TransformerTester", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), - domain_to_version, {}, logger); - Graph& graph = model.MainGraph(); - ModelTestBuilder helper(graph); - build_test_case(helper); - helper.SetGraphOutputs(); - ORT_RETURN_IF_ERROR(graph.Resolve()); - ORT_RETURN_IF_ERROR(pre_graph_checker(graph)); + const std::vector opset_versions{opset_version}; + return TestGraphTransformer(build_test_case, opset_versions, logger, std::move(transformer), + level, steps, pre_graph_checker, post_graph_checker); +} + +Status TestGraphTransformer(const std::function& build_test_case, + const std::vector& opset_versions, + const logging::Logger& logger, std::unique_ptr transformer, + TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, + const std::function& post_graph_checker) { onnxruntime::GraphTransformerManager graph_transformation_mgr{steps}; ORT_RETURN_IF_ERROR(graph_transformation_mgr.Register(std::move(transformer), level)); - ORT_RETURN_IF_ERROR(graph_transformation_mgr.ApplyTransformers(graph, level, logger)); - ORT_RETURN_IF_ERROR(post_graph_checker(graph)); + + for (auto opset : opset_versions) { + // Build the model for this test. 
+ std::unordered_map domain_to_version; + domain_to_version[kOnnxDomain] = opset; + domain_to_version[kMSDomain] = 1; + Model model("TransformerTester", false, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(), + domain_to_version, {}, logger); + Graph& graph = model.MainGraph(); + ModelTestBuilder helper(graph); + build_test_case(helper); + helper.SetGraphOutputs(); + ORT_RETURN_IF_ERROR(graph.Resolve()); + ORT_RETURN_IF_ERROR(pre_graph_checker(graph)); + ORT_RETURN_IF_ERROR(graph_transformation_mgr.ApplyTransformers(graph, level, logger)); + ORT_RETURN_IF_ERROR(post_graph_checker(graph)); + } + return Status::OK(); } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 199f86e056bcb..14c73b2b558af 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -50,6 +50,10 @@ class ModelTestBuilder { ModelTestBuilder(Graph& graph) : graph_(graph) { } + const std::unordered_map& DomainToVersionMap() const noexcept { + return graph_.DomainToVersionMap(); + } + template NodeArg* MakeInput(const std::vector& shape, const std::vector& data) { ONNX_NAMESPACE::TypeProto type_proto; @@ -356,6 +360,17 @@ void TransformerTester(const std::function& buil const std::function& add_session_options = {}, const InlinedHashSet& disabled_optimizers = {}); +void TransformerTester(const std::function& build_test_case, + const std::function& check_transformed_graph, + TransformerLevel baseline_level, + TransformerLevel target_level, + const std::vector& opset_versions, + double per_sample_tolerance = 0.0, + double relative_per_sample_tolerance = 0.0, + std::unique_ptr transformer = nullptr, // must be null in this case. + const std::function& add_session_options = {}, + const InlinedHashSet& disabled_optimizers = {}); + /** * @brief Apply a GraphTransformer to a graph, and run graph checkers before and after applying the transformer. * @@ -372,5 +387,23 @@ Status TestGraphTransformer(const std::function& const logging::Logger& logger, std::unique_ptr transformer, TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, const std::function& post_graph_checker); + +/** + * @brief Apply a GraphTransformer to a graph, and run graph checkers before and after applying the transformer. 
+ * + * @param build_test_case The function to build a graph for testing + * @param opset_versions A graph is created and tested for every opset in this set + * @param logger The logger + * @param transformer The GraphTransformer to be applied + * @param level The transformer level on which the transformer will be applied + * @param steps The step count of the GraphTransformerManager + * @param pre_graph_checker The graph checker function before applying the transformer + * @param post_graph_checker The graph checker function after applying the transformer + */ +Status TestGraphTransformer(const std::function& build_test_case, + const std::vector& opset_versions, + const logging::Logger& logger, std::unique_ptr transformer, + TransformerLevel level, unsigned steps, const std::function& pre_graph_checker, + const std::function& post_graph_checker); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index cbb4de74bfa15..99e94cff6275d 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -278,6 +278,9 @@ TEST(NhwcTransformerTests, ConvSplit) { conv_output_arg, .37f, 131); conv_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); Node& split_node = builder.AddNode("Split", {conv_output_arg}, {split_output1_arg, split_output2_arg}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(2)); + } split_node.AddAttribute("axis", static_cast(axis)); builder.AddQLinearBinaryNode("QLinearAdd", split_output1_arg, .37f, 131, @@ -302,6 +305,11 @@ TEST(NhwcTransformerTests, ConvSplit) { check_nhwc_graph, TransformerLevel::Level2, TransformerLevel::Level3); + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3, + 18); } } @@ -323,6 +331,9 @@ TEST(NhwcTransformerTests, ConvSplitQLinearConcat) { conv_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); Node& split_node = builder.AddNode("Split", {conv_output_arg}, {split_output1_arg, split_output2_arg}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(2)); + } split_node.AddAttribute("axis", static_cast(axis)); Node& qlconcat_node = builder.AddQLinearConcatLike( @@ -346,6 +357,11 @@ TEST(NhwcTransformerTests, ConvSplitQLinearConcat) { check_nhwc_graph, TransformerLevel::Level2, TransformerLevel::Level3); + TransformerTester(build_test_case, + check_nhwc_graph, + TransformerLevel::Level2, + TransformerLevel::Level3, + 18); } } diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index cb19a1e69e8f8..0ba991a4d22e1 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -378,6 +378,9 @@ GetQDQTestCaseFn BuildConsolidationTestCase( auto* split_output_3 = builder.MakeIntermediate(); Node& split_node = builder.AddNode("Split", {upper_dq_output}, {split_output_1, split_output_2, split_output_3}); split_node.AddAttribute("axis", axis); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(3)); + } // add Q auto* lower_q_output_1 = builder.MakeIntermediate(); @@ -456,6 +459,9 @@ GetQDQTestCaseFn BuildQDQSplitTestCase( auto* split_output_3 = builder.MakeIntermediate(); Node& split_node = builder.AddNode("Split", 
{dq_output}, {split_output_1, split_output_2, split_output_3}); split_node.AddAttribute("axis", axis); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_node.AddAttribute("num_outputs", static_cast(3)); + } // add Q auto* q_split_output_1 = builder.MakeOutput(); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index b253273c5bbc2..e2dcc7fac29ca 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -67,6 +67,14 @@ void QDQTransformerConvTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildQDQConvTestCase(input_shape, weights_shape), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}, {32, 12, 5}); @@ -157,10 +165,13 @@ TEST(QDQTransformerTests, ConvMaxPoolReshape_UInt8) { test_case({1, 12, 37}, {32, 12, 5}, 11); test_case({1, 12, 37}, {32, 12, 5}, 12); + test_case({1, 12, 37}, {32, 12, 5}, 18); test_case({1, 23, 13, 13}, {30, 23, 3, 3}, 11); test_case({1, 23, 13, 13}, {30, 23, 3, 3}, 12); + test_case({1, 23, 13, 13}, {30, 23, 3, 3}, 18); test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3}, 11); test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3}, 12); + test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3}, 18); } TEST(QDQTransformerTests, ConvMaxPoolReshape_Int8) { @@ -292,6 +303,14 @@ void QDQTransformerAveragePoolTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildQDQAveragePoolTestCase(input_shape), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -341,6 +360,14 @@ void QDQTransformerGlobalAveragePoolTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildQDQGlobalAveragePoolTestCase(input_shape), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -391,6 +418,14 @@ void QDQTransformerBinaryOpTests(const std::string& op_type) { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(BuildBinaryOpTestCase(input_shape, op_type), + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -522,6 +557,14 @@ void QDQTransformerMatMulTests(bool has_output_q) { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 2, 2}, {1, 2, 4}); @@ -677,6 +720,14 @@ 
void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_binary_op_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({2, 2}, {2, 4}); @@ -813,6 +864,14 @@ TEST(QDQTransformerTests, DoubleQDQ) { 12, (scale_1 + scale_3) / 2, 0.01); + TransformerTester( + BuildDoubleQDQTestCases(zp_1, zp_2, zp_3, zp_4, scale_1, scale_2, scale_3, scale_4), + succeed ? expect_succeed : expect_fail, + TransformerLevel::Default, + TransformerLevel::Level1, + 18, + (scale_1 + scale_3) / 2, + 0.01); }; auto test_case_2u8_2s8_failed = [&](uint8_t zp_1, uint8_t zp_2, int8_t zp_3, int8_t zp_4, @@ -870,7 +929,8 @@ TEST(QDQTransformerTests, Split) { TransformerTester(BuildQDQSplitTestCase(input_shape, axis), check_graph, TransformerLevel::Level1, - TransformerLevel::Level2); + TransformerLevel::Level2, + {12, 18}); }; test_case({6, 18, 54}, 0); } @@ -887,7 +947,7 @@ TEST(QDQTransformerTests, Split_without_IdenticalChildrenConsolidation) { TransformerTester(BuildConsolidationTestCase(input_shape, axis), check_graph, TransformerLevel::Level1, - TransformerLevel::Level2, 12, {}, {}, nullptr, {}, + TransformerLevel::Level2, {12, 18}, {}, {}, nullptr, {}, {"IdenticalChildrenConsolidation"}); }; test_case({6, 18, 54}, 0); @@ -904,7 +964,8 @@ TEST(QDQTransformerTests, Split_with_IdenticalChildrenConsolidation) { TransformerTester(BuildConsolidationTestCase(input_shape, axis), check_graph, TransformerLevel::Level1, - TransformerLevel::Level2); + TransformerLevel::Level2, + {12, 18}); }; test_case({6, 18, 54}, 0); } @@ -1509,7 +1570,7 @@ TEST(QDQTransformerTests, ConvAveragePoolReshape_Int8_Fail) { check_graph, TransformerLevel::Level1, TransformerLevel::Level2, - 12 /*opset_version*/, + {12, 18} /*opset_version*/, 0.01f /*per_sample_tolerance*/, 0.01f /*relative_per_sample_tolerance*/); }; @@ -1566,6 +1627,14 @@ void QDQTransformerLeakyReluTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -1635,6 +1704,14 @@ void QDQTransformerSigmoidTests() { 0.01 /*per_sample_tolerance*/, 0.01 /*relative_per_sample_tolerance*/, std::make_unique(QDQIsInt8Allowed())); + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Level1, + TransformerLevel::Level2, + 18 /*opset_version*/, + 0.01 /*per_sample_tolerance*/, + 0.01 /*relative_per_sample_tolerance*/, + std::make_unique(QDQIsInt8Allowed())); }; test_case({1, 12, 37}); @@ -1907,7 +1984,7 @@ TEST(QDQTransformerTests, DQForward_MutilpleSteps) { TEST(QDQTransformerTests, Clip) { constexpr float epsilon = std::numeric_limits::epsilon(); - auto test_case = [&](float scale, auto zero_point, int clip_count, int opset_version = 12) { + auto test_case = [&](float scale, auto zero_point, int clip_count, int opset_version) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput({1, 32, 112, 112}, std::numeric_limits::min(), @@ -1922,7 +1999,9 @@ 
TEST(QDQTransformerTests, Clip) { auto* clip_output = builder.MakeIntermediate(); constexpr float min = .0f; constexpr float max = 6.0f; - if (opset_version >= 11) { + auto opset = builder.DomainToVersionMap().find(kOnnxDomain)->second; + EXPECT_EQ(opset_version, opset); + if (opset >= 11) { auto* min_initializer = builder.MakeScalarInitializer(min); auto* max_initializer = builder.MakeScalarInitializer(max); builder.AddNode("Clip", {dq_output, min_initializer, max_initializer}, {clip_output}); @@ -1953,18 +2032,21 @@ TEST(QDQTransformerTests, Clip) { epsilon); }; - test_case(.0235294122248888f, static_cast(-128), 0); // [0, 6] - test_case(.02f, static_cast(-128), 0); // [0, 5.1] - test_case(.03f, static_cast(-128), 1); // [0, 7.65] - test_case(.02f, static_cast(127), 1); // [-5.1 , 0] - test_case(.02f, static_cast(0), 1); // [-2.56, 2.54] - test_case(.04f, static_cast(-97), 1); // [-1.24, 8.96] - test_case(.02352941176f, static_cast(0), 0); // [0, 6] - test_case(.02f, static_cast(0), 0); // [0, 5.1] - test_case(.03f, static_cast(0), 1); // [0, 7.65] - test_case(.02f, static_cast(255), 1); // [-5.1, 0] - test_case(.02f, static_cast(128), 1); // [-2.56, 2.54] - test_case(.04f, static_cast(31), 1); // [-1.24, 8.96] + std::vector opsets{12, 18}; + for (auto opset : opsets) { + test_case(.0235294122248888f, static_cast(-128), 0, opset); // [0, 6] + test_case(.02f, static_cast(-128), 0, opset); // [0, 5.1] + test_case(.03f, static_cast(-128), 1, opset); // [0, 7.65] + test_case(.02f, static_cast(127), 1, opset); // [-5.1 , 0] + test_case(.02f, static_cast(0), 1, opset); // [-2.56, 2.54] + test_case(.04f, static_cast(-97), 1, opset); // [-1.24, 8.96] + test_case(.02352941176f, static_cast(0), 0, opset); // [0, 6] + test_case(.02f, static_cast(0), 0, opset); // [0, 5.1] + test_case(.03f, static_cast(0), 1, opset); // [0, 7.65] + test_case(.02f, static_cast(255), 1, opset); // [-5.1, 0] + test_case(.02f, static_cast(128), 1, opset); // [-2.56, 2.54] + test_case(.04f, static_cast(31), 1, opset); // [-1.24, 8.96] + } // opset_version = 10 test_case(.02f, static_cast(-128), 0, 10); // [0, 5.1] @@ -1973,10 +2055,12 @@ TEST(QDQTransformerTests, Clip) { test_case(.03f, static_cast(0), 1, 10); // [0, 7.65] // difference between lower/upper and min/max are within epsilon - test_case(epsilon, static_cast(-127), 0); // [-epsilon, x] (x <= 6 + epsilon) - test_case((6 + epsilon) / 255, static_cast(-128), 0); // [0, 6 + epsilon] - test_case(epsilon, static_cast(1), 0); // [-epsilon, x] (x <= 6 + epsilon) - test_case((6 + epsilon) / 255, static_cast(0), 0); // [0, 6 + epsilon] + for (auto opset : opsets) { + test_case(epsilon, static_cast(-127), 0, opset); // [-epsilon, x] (x <= 6 + epsilon) + test_case((6 + epsilon) / 255, static_cast(-128), 0, opset); // [0, 6 + epsilon] + test_case(epsilon, static_cast(1), 0, opset); // [-epsilon, x] (x <= 6 + epsilon) + test_case((6 + epsilon) / 255, static_cast(0), 0, opset); // [0, 6 + epsilon] + } } TEST(QDQTransformerTests, Concat) { @@ -2536,7 +2620,7 @@ TEST(QDQTransformerTests, QDQ_Selector_Test) { // regression test to validate TransposeOptimizer and QDQ Propagation don't loop // see https://github.com/microsoft/onnxruntime/issues/11605 -TEST(QDQTransformerTests, QDQPropagation_GH11605) { +TEST(QDQTransformerTests, QDQPropagation_GH11605_Opset12) { auto test_case = [&]() { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput({1, 4, 4}, @@ -2585,7 +2669,61 @@ TEST(QDQTransformerTests, QDQPropagation_GH11605) { 
TransformerTester(build_test_case, check_graph, TransformerLevel::Default, - TransformerLevel::Level2); + TransformerLevel::Level2, + 12); + }; + + test_case(); +} + +TEST(QDQTransformerTests, QDQPropagation_GH11605_Opset13) { + auto test_case = [&]() { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 4, 4}, + std::numeric_limits::min(), + std::numeric_limits::max()); + // add DQ + auto* dq_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_arg, 0.123f, uint8_t(0), dq_output); + + // add Transpose 0, 2, 1 + const std::vector& perms{0, 2, 1}; + auto* transpose_output = builder.MakeIntermediate(); + Node& transpose_node = builder.AddNode("Transpose", {dq_output}, {transpose_output}); + transpose_node.AddAttribute("perm", perms); + + // add Softmax with axis=2 (to block the Transpose moving past it due to the transpose perms) + auto* softmax_output = builder.MakeIntermediate(); + Node& softmax_node = builder.AddNode("Softmax", {transpose_output}, {softmax_output}); + softmax_node.AddAttribute("axis", int64_t(2)); + + // add second Transpose. this is so the check in TransposeOptimizer::ProcessTranspose for outputs leading to + // a Transpose is satisfied, allowing the first Transpose to move past the Q/DQ inserted by QDQ Propagation + Node& transpose_node2 = builder.AddNode("Transpose", {softmax_output}, {builder.MakeOutput()}); + transpose_node2.AddAttribute("perm", perms); + }; + + // check that an edge case where transpose optimization gets blocked is handled gracefully. + // Original: DQ -> Tr -> SoftM -> Tr + // QDQ Prop inserts a Q/DQ pair to create a QDQ node group for the Transpose: DQ -> Tr -> Q -> DQ -> SoftM -> Tr + // Transpose opt phase 1 moves the Tr down until it blocks on the SoftMax: DQ -> Q -> DQ -> Tr -> SoftM -> Tr + // Transpose opt phase 2 flips the Tr to prior to the DQ as it's not part of a QDQ node group at that point, as + // running the transpose on 8-bit data should be cheaper: DQ -> Q -> Tr -> DQ -> SoftM -> Tr + // QDQ cleanup in Level2 removes the unnecessary DQ/Q pair at the start: Tr -> DQ -> SoftM -> Tr + // this is the optimal result as the Transpose is using 8-bit data and we have no surplus Q/DQ pairs + auto check_graph = [&](InferenceSessionWrapper& session) { + std::vector expected_op_types_in_order{ + "DequantizeLinear", + "Softmax"}; + const auto op_types_in_order = GetNodeOpTypesInTopologicalOrder(session.GetGraph()); + EXPECT_EQ(op_types_in_order, expected_op_types_in_order); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level2, + 13); }; test_case(); diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index 980ac01b9d1f2..1fab4e3502bad 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -94,6 +94,9 @@ TEST(TransposeOptimizerTests, TestSplit) { transpose_1.AddAttribute("perm", std::vector{1, 2, 0}); auto& split_1 = builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); split_1.AddAttribute("axis", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_1.AddAttribute("num_outputs", static_cast(2)); + } auto& transpose_2 = builder.AddNode("Transpose", {split_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{2, 0, 1}); auto& transpose_3 = builder.AddNode("Transpose", 
{split_1_out_1}, {transpose_3_out_0}); @@ -109,7 +112,7 @@ TEST(TransposeOptimizerTests, TestSplit) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSplitDefaultAxis) { @@ -123,7 +126,10 @@ TEST(TransposeOptimizerTests, TestSplitDefaultAxis) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{1, 2, 0}); - builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); + auto& split_1 = builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_1.AddAttribute("num_outputs", static_cast(2)); + } auto& transpose_2 = builder.AddNode("Transpose", {split_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{2, 0, 1}); auto& transpose_3 = builder.AddNode("Transpose", {split_1_out_1}, {transpose_3_out_0}); @@ -139,7 +145,7 @@ TEST(TransposeOptimizerTests, TestSplitDefaultAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSplitNegativeAxis) { @@ -155,6 +161,9 @@ TEST(TransposeOptimizerTests, TestSplitNegativeAxis) { transpose_1.AddAttribute("perm", std::vector{1, 2, 0}); auto& split_1 = builder.AddNode("Split", {transpose_1_out_0}, {split_1_out_0, split_1_out_1}); split_1.AddAttribute("axis", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + split_1.AddAttribute("num_outputs", static_cast(2)); + } auto& transpose_2 = builder.AddNode("Transpose", {split_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{2, 0, 1}); auto& transpose_3 = builder.AddNode("Transpose", {split_1_out_1}, {transpose_3_out_0}); @@ -170,7 +179,7 @@ TEST(TransposeOptimizerTests, TestSplitNegativeAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestConcat) { @@ -201,7 +210,7 @@ TEST(TransposeOptimizerTests, TestConcat) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestPad) { @@ -213,10 +222,17 @@ TEST(TransposeOptimizerTests, TestPad) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& pad_1 = builder.AddNode("Pad", {transpose_1_out_0}, {pad_1_out_0}); - pad_1.AddAttribute("mode", "constant"); - pad_1.AddAttribute("value", (float)2.3); - pad_1.AddAttribute("pads", std::vector{1, -2, 3, 4, 5, 6, 7, 8}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* value = builder.MakeInitializer({1}, {(float)2.3}); + auto* pads = builder.MakeInitializer({8}, {1, -2, 3, 4, 5, 6, 7, 8}); + auto& pad_1 = builder.AddNode("Pad", {transpose_1_out_0, pads, value}, {pad_1_out_0}); + pad_1.AddAttribute("mode", "constant"); + } else { + auto& pad_1 = builder.AddNode("Pad", {transpose_1_out_0}, {pad_1_out_0}); + pad_1.AddAttribute("mode", "constant"); + pad_1.AddAttribute("value", (float)2.3); + pad_1.AddAttribute("pads", std::vector{1, -2, 3, 4, 5, 6, 7, 8}); + } auto& transpose_2 = builder.AddNode("Transpose", {pad_1_out_0}, {transpose_2_out_0}); 
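// Note: the attribute form of Pad built in the else-branch above ("pads"/"value" as attributes)
// is only valid up to opset 10; Pad-11 moved the pads and the constant value to inputs, so the
// input-based branch covers the opset-18 run of this test (it is run at opsets {10, 18} below).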
transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -230,7 +246,7 @@ TEST(TransposeOptimizerTests, TestPad) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 10); + /*opset_version*/ {10, 18}); } TEST(TransposeOptimizerTests, TestPadOpset15) { @@ -259,7 +275,7 @@ TEST(TransposeOptimizerTests, TestPadOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestPadNonconst) { @@ -291,7 +307,7 @@ TEST(TransposeOptimizerTests, TestPadNonconst) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 11); + /*opset_version*/ {11, 18}); } // The CUDA Resize kernel assumes that the input is NCHW and @@ -312,10 +328,15 @@ TEST(TransposeOptimizerTests, TestResize) { auto* transpose_1_out_0 = builder.MakeIntermediate(); auto* resize_1_out_0 = builder.MakeIntermediate(); auto* transpose_2_out_0 = builder.MakeOutput(); + auto empty_arg = NodeArg("", nullptr); auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - builder.AddNode("Resize", {transpose_1_out_0, const_1}, {resize_1_out_0}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 11) { + builder.AddNode("Resize", {transpose_1_out_0, &empty_arg, const_1}, {resize_1_out_0}); + } else { + builder.AddNode("Resize", {transpose_1_out_0, const_1}, {resize_1_out_0}); + } auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -329,7 +350,7 @@ TEST(TransposeOptimizerTests, TestResize) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 10); + /*opset_version*/ {10, 18}); } TEST(TransposeOptimizerTests, TestResizeOpset11) { @@ -357,7 +378,7 @@ TEST(TransposeOptimizerTests, TestResizeOpset11) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 11); + /*opset_version*/ {11, 18}); } TEST(TransposeOptimizerTests, TestResizeOpset15) { @@ -385,7 +406,7 @@ TEST(TransposeOptimizerTests, TestResizeOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestResizeSizeRoi) { @@ -415,7 +436,7 @@ TEST(TransposeOptimizerTests, TestResizeSizeRoi) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestResizeRoiScalesZeroRank0) { @@ -448,7 +469,8 @@ TEST(TransposeOptimizerTests, TestResizeRoiScalesZeroRank0) { TransformerTester(build_test_case_1, check_optimized_graph_1, TransformerLevel::Default, - TransformerLevel::Level1); + TransformerLevel::Level1, + {12, 18}); } TEST(TransposeOptimizerTests, TestResizeNonconst) { @@ -477,7 +499,7 @@ TEST(TransposeOptimizerTests, TestResizeNonconst) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 11); + /*opset_version*/ {11, 18}); } TEST(TransposeOptimizerTests, TestResizeNonconstOpset13) { @@ -506,7 +528,7 @@ TEST(TransposeOptimizerTests, TestResizeNonconstOpset13) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 13); + /*opset_version*/ {13, 18}); } #endif @@ -534,7 +556,7 @@ 
TEST(TransposeOptimizerTests, TestAdd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShape) { @@ -557,7 +579,7 @@ TEST(TransposeOptimizerTests, TestShape) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7); + /*opset_version*/ {7, 18}); } TEST(TransposeOptimizerTests, TestShapeOpset15) { @@ -580,7 +602,7 @@ TEST(TransposeOptimizerTests, TestShapeOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceNoStart) { @@ -604,7 +626,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceNoStart) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceNegativeEnd) { @@ -628,7 +650,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceNegativeEnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceNegativeStartNoEnd) { @@ -652,7 +674,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceNegativeStartNoEnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceStartAndEnd) { @@ -677,7 +699,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceStartAndEnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestShapeSliceEmptyResult) { @@ -702,7 +724,7 @@ TEST(TransposeOptimizerTests, TestShapeSliceEmptyResult) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceSumKeepdimsTrue) { @@ -714,9 +736,15 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsTrue) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); - reducesum_1.AddAttribute("axes", std::vector{0, -2}); - reducesum_1.AddAttribute("keepdims", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({2}, {0, -2}); + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0, init}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("keepdims", (int64_t)1); + } else { + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("axes", std::vector{0, -2}); + reducesum_1.AddAttribute("keepdims", (int64_t)1); + } auto& transpose_2 = builder.AddNode("Transpose", {reducesum_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -730,7 +758,7 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -756,7 +784,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrue) { 
check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -770,9 +798,15 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsFalse) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); - reducesum_1.AddAttribute("axes", std::vector{0, -2}); - reducesum_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({2}, {0, -2}); + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0, init}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducesum_1 = builder.AddNode("ReduceSum", {transpose_1_out_0}, {reducesum_1_out_0}); + reducesum_1.AddAttribute("axes", std::vector{0, -2}); + reducesum_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducesum_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -786,7 +820,7 @@ TEST(TransposeOptimizerTests, TestReduceSumKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -812,7 +846,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7, + /*opset_version*/ {7, 18}, /*per_sample_tolerance*/ 1e-07, /*relative_per_sample_tolerance*/ 1e-06); } @@ -874,7 +908,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueOpset15) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyTrue) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyTrueOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -905,7 +939,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyTrue) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyFalse) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyFalseOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -933,7 +967,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsTrueNoopEmptyFalse) /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput) { +TEST(TransposeOptimizerTests, TestReduceSumNoAxesInputOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1017,7 +1051,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseOpset15) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyTrue) { 
+TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyTrueOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -1048,7 +1082,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyTrue) /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyFalse) { +TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyFalseOpset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* const_1 = builder.MakeInitializer({0}, {}); @@ -1076,7 +1110,7 @@ TEST(TransposeOptimizerTests, TestReduceSumEmptyAxesKeepdimsFalseNoopEmptyFalse) /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput_2) { +TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput_2Opset15) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1103,7 +1137,7 @@ TEST(TransposeOptimizerTests, TestReduceSumNoAxesInput_2) { /*relative_per_sample_tolerance*/ 1e-06); } -TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsTrueNoOpt) { +TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsTrueNoOptOpset13) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* input1_arg = MakeInput(builder, {std::vector{}}, std::vector{}, {-1}); @@ -1130,7 +1164,7 @@ TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsTrueNoOpt) { /*opset_version*/ 13); } -TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsFalseNoOpt) { +TEST(TransposeOptimizerTests, TestReduceSumNonconstKeepdimsFalseNoOptOpset13) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, 4, -1, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* input1_arg = MakeInput(builder, {std::vector{}}, std::vector{}, {-1}); @@ -1166,9 +1200,15 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrue) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); - reducemax_1.AddAttribute("keepdims", (int64_t)1); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)1); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + reducemax_1.AddAttribute("keepdims", (int64_t)1); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -1182,7 +1222,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } 
TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrueDefaultAxes) { @@ -1206,7 +1246,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsTrueDefaultAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalse) { @@ -1218,13 +1258,19 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalse) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); - reducemax_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; - + auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) { int transpose_cost = EstimateTransposeCost(session.GetGraph()); EXPECT_EQ(transpose_cost, 0); @@ -1234,7 +1280,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalseDefaultAxes) { @@ -1258,7 +1304,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxKeepdimsFalseDefaultAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMax) { @@ -1270,8 +1316,13 @@ TEST(TransposeOptimizerTests, TestReduceMax) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); }; @@ -1285,7 +1336,7 @@ TEST(TransposeOptimizerTests, TestReduceMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceMaxDefaultAxes) { @@ -1308,7 +1359,7 @@ TEST(TransposeOptimizerTests, TestReduceMaxDefaultAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } 
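The ReduceMax tests above and the ReduceOps* tests that follow all repeat the same branch: look up the ONNX opset via builder.DomainToVersionMap() and emit axes either as an "axes" attribute (pre-18) or as an initializer input (opset 18+), since opset 18 moved axes to an optional input for ReduceMax/Min/Mean/Prod/LogSum/LogSumExp/SumSquare/L1/L2 (ReduceSum made the same change in opset 13). A condensed sketch of that pattern is below; the helper name and the restored template arguments are assumptions, while the builder calls mirror the ones used throughout this diff.

```cpp
// Sketch only (not part of the patch): the opset-dependent reduce-node construction
// that the tests in this file currently repeat inline.
static Node& AddReduceNode(ModelTestBuilder& builder, const std::string& op_type,
                           NodeArg* data, NodeArg* output,
                           const std::vector<int64_t>& axes, int64_t keepdims) {
  const int opset = builder.DomainToVersionMap().find(kOnnxDomain)->second;
  if (opset >= 18) {
    // From opset 18 these reduce ops take axes as an optional second input.
    auto* axes_arg = builder.MakeInitializer<int64_t>({static_cast<int64_t>(axes.size())}, axes);
    Node& node = builder.AddNode(op_type, {data, axes_arg}, {output});
    node.AddAttribute("keepdims", keepdims);
    return node;
  }
  // Before opset 18 axes is an attribute.
  Node& node = builder.AddNode(op_type, {data}, {output});
  node.AddAttribute("axes", axes);
  node.AddAttribute("keepdims", keepdims);
  return node;
}
```

Factoring the branch out like this would also keep the reduce tests in sync if further opset versions are added to the {15, 18} lists later.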
TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSum) { @@ -1320,9 +1371,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSum) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducelogsum_1 = builder.AddNode("ReduceLogSum", {transpose_1_out_0}, {reducelogsum_1_out_0}); - reducelogsum_1.AddAttribute("axes", std::vector{0, -2}); - reducelogsum_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducelogsum_1 = builder.AddNode("ReduceLogSum", {transpose_1_out_0, axes}, {reducelogsum_1_out_0}); + reducelogsum_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducelogsum_1 = builder.AddNode("ReduceLogSum", {transpose_1_out_0}, {reducelogsum_1_out_0}); + reducelogsum_1.AddAttribute("axes", std::vector{0, -2}); + reducelogsum_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducelogsum_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1336,7 +1393,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSum) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSumExp) { @@ -1348,9 +1405,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSumExp) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducelogsumexp_1 = builder.AddNode("ReduceLogSumExp", {transpose_1_out_0}, {reducelogsumexp_1_out_0}); - reducelogsumexp_1.AddAttribute("axes", std::vector{0, -2}); - reducelogsumexp_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducelogsumexp_1 = builder.AddNode("ReduceLogSumExp", {transpose_1_out_0, axes}, {reducelogsumexp_1_out_0}); + reducelogsumexp_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducelogsumexp_1 = builder.AddNode("ReduceLogSumExp", {transpose_1_out_0}, {reducelogsumexp_1_out_0}); + reducelogsumexp_1.AddAttribute("axes", std::vector{0, -2}); + reducelogsumexp_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducelogsumexp_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1364,7 +1427,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceLogSumExp) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceMax) { @@ -1376,9 +1439,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMax) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{0, -2}); - reducemax_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, axes}, {reducemax_1_out_0}); + 
reducemax_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{0, -2}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemax_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1392,7 +1461,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceMean) { @@ -1404,9 +1473,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMean) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemean_1 = builder.AddNode("ReduceMean", {transpose_1_out_0}, {reducemean_1_out_0}); - reducemean_1.AddAttribute("axes", std::vector{0, -2}); - reducemean_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemean_1 = builder.AddNode("ReduceMean", {transpose_1_out_0, axes}, {reducemean_1_out_0}); + reducemean_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemean_1 = builder.AddNode("ReduceMean", {transpose_1_out_0}, {reducemean_1_out_0}); + reducemean_1.AddAttribute("axes", std::vector{0, -2}); + reducemean_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemean_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1420,7 +1495,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMean) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceMin) { @@ -1432,9 +1507,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMin) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemin_1 = builder.AddNode("ReduceMin", {transpose_1_out_0}, {reducemin_1_out_0}); - reducemin_1.AddAttribute("axes", std::vector{0, -2}); - reducemin_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducemin_1 = builder.AddNode("ReduceMin", {transpose_1_out_0, axes}, {reducemin_1_out_0}); + reducemin_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducemin_1 = builder.AddNode("ReduceMin", {transpose_1_out_0}, {reducemin_1_out_0}); + reducemin_1.AddAttribute("axes", std::vector{0, -2}); + reducemin_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducemin_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1448,7 +1529,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceMin) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceProd) { @@ -1460,9 +1541,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceProd) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); 
transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reduceprod_1 = builder.AddNode("ReduceProd", {transpose_1_out_0}, {reduceprod_1_out_0}); - reduceprod_1.AddAttribute("axes", std::vector{0, -2}); - reduceprod_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reduceprod_1 = builder.AddNode("ReduceProd", {transpose_1_out_0, axes}, {reduceprod_1_out_0}); + reduceprod_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reduceprod_1 = builder.AddNode("ReduceProd", {transpose_1_out_0}, {reduceprod_1_out_0}); + reduceprod_1.AddAttribute("axes", std::vector{0, -2}); + reduceprod_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reduceprod_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1476,7 +1563,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceProd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceSumSquare) { @@ -1488,9 +1575,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceSumSquare) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducesumsquare_1 = builder.AddNode("ReduceSumSquare", {transpose_1_out_0}, {reducesumsquare_1_out_0}); - reducesumsquare_1.AddAttribute("axes", std::vector{0, -2}); - reducesumsquare_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({2}, {0, -2}); + auto& reducesumsquare_1 = builder.AddNode("ReduceSumSquare", {transpose_1_out_0, init}, {reducesumsquare_1_out_0}); + reducesumsquare_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducesumsquare_1 = builder.AddNode("ReduceSumSquare", {transpose_1_out_0}, {reducesumsquare_1_out_0}); + reducesumsquare_1.AddAttribute("axes", std::vector{0, -2}); + reducesumsquare_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducesumsquare_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1504,7 +1597,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceSumSquare) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceL1) { @@ -1516,9 +1609,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL1) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducel1_1 = builder.AddNode("ReduceL1", {transpose_1_out_0}, {reducel1_1_out_0}); - reducel1_1.AddAttribute("axes", std::vector{0, -2}); - reducel1_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducel1_1 = builder.AddNode("ReduceL1", {transpose_1_out_0, axes}, {reducel1_1_out_0}); + reducel1_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducel1_1 = builder.AddNode("ReduceL1", {transpose_1_out_0}, {reducel1_1_out_0}); + reducel1_1.AddAttribute("axes", std::vector{0, -2}); + reducel1_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = 
builder.AddNode("Transpose", {reducel1_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1532,7 +1631,7 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL1) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReduceOpsReduceL2) { @@ -1544,9 +1643,15 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL2) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducel2_1 = builder.AddNode("ReduceL2", {transpose_1_out_0}, {reducel2_1_out_0}); - reducel2_1.AddAttribute("axes", std::vector{0, -2}); - reducel2_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* axes = builder.MakeInitializer({2}, {0, -2}); + auto& reducel2_1 = builder.AddNode("ReduceL2", {transpose_1_out_0, axes}, {reducel2_1_out_0}); + reducel2_1.AddAttribute("keepdims", (int64_t)0); + } else { + auto& reducel2_1 = builder.AddNode("ReduceL2", {transpose_1_out_0}, {reducel2_1_out_0}); + reducel2_1.AddAttribute("axes", std::vector{0, -2}); + reducel2_1.AddAttribute("keepdims", (int64_t)0); + } auto& transpose_2 = builder.AddNode("Transpose", {reducel2_1_out_0}, {transpose_2_out_0}); transpose_2.AddAttribute("perm", std::vector{1, 0}); }; @@ -1560,10 +1665,10 @@ TEST(TransposeOptimizerTests, TestReduceOpsReduceL2) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } -TEST(TransposeOptimizerTests, TestSqueeze) { +TEST(TransposeOptimizerTests, TestSqueezeOpset7) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{1, -1, 1, 2}}, {1, 4, 1, 2}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1663,7 +1768,7 @@ TEST(TransposeOptimizerTests, TestSqueezeEmptyNoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 7); + /*opset_version*/ {7, 18}); } TEST(TransposeOptimizerTests, TestSqueezeEmptyNoOptOpset15) { @@ -1708,10 +1813,10 @@ TEST(TransposeOptimizerTests, TestSqueezeNonconstNoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } -TEST(TransposeOptimizerTests, TestUnsqueeze) { +TEST(TransposeOptimizerTests, TestUnsqueezeOpset7) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{2, -1, 6, 5}}, {2, 4, 6, 5}, 0.0, 1.0); auto* transpose_1_out_0 = builder.MakeIntermediate(); @@ -1901,7 +2006,7 @@ TEST(TransposeOptimizerTests, TestSliceOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNoAxesOpset15) { @@ -1929,7 +2034,7 @@ TEST(TransposeOptimizerTests, TestSliceNoAxesOpset15) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNegativeAxesInt32) { @@ -1958,7 +2063,7 @@ TEST(TransposeOptimizerTests, TestSliceNegativeAxesInt32) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceStepsInt32) { @@ -1988,7 
+2093,7 @@ TEST(TransposeOptimizerTests, TestSliceStepsInt32) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNegativeAxes) { @@ -2017,7 +2122,7 @@ TEST(TransposeOptimizerTests, TestSliceNegativeAxes) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceSteps) { @@ -2047,7 +2152,7 @@ TEST(TransposeOptimizerTests, TestSliceSteps) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNonconstNoOpt) { @@ -2075,7 +2180,7 @@ TEST(TransposeOptimizerTests, TestSliceNonconstNoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceNonconstInt32NoOpt) { @@ -2103,7 +2208,7 @@ TEST(TransposeOptimizerTests, TestSliceNonconstInt32NoOpt) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStarts) { @@ -2131,7 +2236,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStarts) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthNoOpt) { @@ -2158,7 +2263,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthNoO check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsInt32) { @@ -2186,7 +2291,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsInt32) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthInt32NoOpt) { @@ -2213,7 +2318,7 @@ TEST(TransposeOptimizerTests, TestSliceDefaultAxesNonconstStartsUnknownLengthInt check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestTile) { @@ -2240,7 +2345,7 @@ TEST(TransposeOptimizerTests, TestTile) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestTileNonconstReps) { @@ -2267,7 +2372,7 @@ TEST(TransposeOptimizerTests, TestTileNonconstReps) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsTrue) { @@ -2294,7 +2399,7 @@ TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsFalse) { @@ -2321,7 +2426,7 @@ TEST(TransposeOptimizerTests, TestArgMinNoAxisKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - 
/*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinNoAxis) { @@ -2347,7 +2452,7 @@ TEST(TransposeOptimizerTests, TestArgMinNoAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinKeepdimsTrue) { @@ -2375,7 +2480,7 @@ TEST(TransposeOptimizerTests, TestArgMinKeepdimsTrue) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMinKeepdimsFalse) { @@ -2403,7 +2508,7 @@ TEST(TransposeOptimizerTests, TestArgMinKeepdimsFalse) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMin) { @@ -2430,7 +2535,7 @@ TEST(TransposeOptimizerTests, TestArgMin) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestArgMax) { @@ -2458,7 +2563,7 @@ TEST(TransposeOptimizerTests, TestArgMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSoftmax) { @@ -2771,7 +2876,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsAdd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMul) { @@ -2801,7 +2906,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMul) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsSub) { @@ -2831,7 +2936,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsSub) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsDiv) { @@ -2861,7 +2966,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsDiv) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsPRelu) { @@ -2891,7 +2996,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsPRelu) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsGreater) { @@ -2921,7 +3026,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsGreater) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsLess) { @@ -2951,7 +3056,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsLess) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsPow) { @@ -2981,7 +3086,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsPow) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMax) { @@ -3011,7 +3116,7 @@ 
TEST(TransposeOptimizerTests, TestBroadcastOpsMax) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMin) { @@ -3041,7 +3146,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMin) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMean) { @@ -3071,7 +3176,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMean) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsSum) { @@ -3101,7 +3206,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsSum) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsGreaterOrEqual) { @@ -3131,7 +3236,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsGreaterOrEqual) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsLessOrEqual) { @@ -3161,7 +3266,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsLessOrEqual) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsEqual) { @@ -3191,7 +3296,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsEqual) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsAnd) { @@ -3221,7 +3326,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsAnd) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsOr) { @@ -3251,7 +3356,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsOr) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsXor) { @@ -3281,7 +3386,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsXor) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsMod) { @@ -3312,7 +3417,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsMod) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastOpsBitShift) { @@ -3343,7 +3448,7 @@ TEST(TransposeOptimizerTests, TestBroadcastOpsBitShift) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestWhere) { @@ -3374,7 +3479,7 @@ TEST(TransposeOptimizerTests, TestWhere) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearScalar) { @@ -3402,7 +3507,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearScalar) { check_optimized_graph_1, 
TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearScalarIgnoreAxis) { @@ -3431,7 +3536,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearScalarIgnoreAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearVector) { @@ -3460,7 +3565,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearVector) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearVectorUnknownRank) { @@ -3489,7 +3594,7 @@ TEST(TransposeOptimizerTests, TestQuantizeLinearVectorUnknownRank) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestQuantizeLinearScalarOpset10) { @@ -3546,7 +3651,7 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearScalarIgnoreAxis) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestDequantizeLinearVector) { @@ -3575,7 +3680,7 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearVector) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestDequantizeLinearNoAxis) { @@ -3665,7 +3770,7 @@ TEST(TransposeOptimizerTests, TestCast) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestBroadcastReusedInputs) { @@ -3696,7 +3801,7 @@ TEST(TransposeOptimizerTests, TestBroadcastReusedInputs) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestTransposeGraphOutput) { @@ -3724,7 +3829,7 @@ TEST(TransposeOptimizerTests, TestTransposeGraphOutput) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestSimpleReshapeAsTranspose) { @@ -3757,7 +3862,7 @@ TEST(TransposeOptimizerTests, TestSimpleReshapeAsTranspose) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestReshapeAsTransposeGraphOutput) { @@ -3788,7 +3893,7 @@ TEST(TransposeOptimizerTests, TestReshapeAsTransposeGraphOutput) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestCancelingNodesGraphOutputs) { @@ -3819,7 +3924,7 @@ TEST(TransposeOptimizerTests, TestCancelingNodesGraphOutputs) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestNonCancelingReshape) { @@ -3855,7 +3960,7 @@ TEST(TransposeOptimizerTests, TestNonCancelingReshape) { check_optimized_graph, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestPushBroadcastUnsqueezeTranspose) { @@ 
-3890,7 +3995,7 @@ TEST(TransposeOptimizerTests, TestPushBroadcastUnsqueezeTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOptimizeTowardsTranspose) { @@ -3920,7 +4025,7 @@ TEST(TransposeOptimizerTests, TestOptimizeTowardsTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOnlyOptimizeTowardsTranspose) { @@ -3947,7 +4052,7 @@ TEST(TransposeOptimizerTests, TestOnlyOptimizeTowardsTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestDontOptimizeWrongInput) { @@ -3973,7 +4078,7 @@ TEST(TransposeOptimizerTests, TestDontOptimizeWrongInput) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOptimizeBothInputs) { @@ -4001,7 +4106,7 @@ TEST(TransposeOptimizerTests, TestOptimizeBothInputs) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { @@ -4012,9 +4117,16 @@ TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); - auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); - reducemax_1.AddAttribute("axes", std::vector{1}); - reducemax_1.AddAttribute("keepdims", (int64_t)0); + if (builder.DomainToVersionMap().find(kOnnxDomain)->second >= 18) { + auto* init = builder.MakeInitializer({1}, {1}); + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0, init}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } + else { + auto& reducemax_1 = builder.AddNode("ReduceMax", {transpose_1_out_0}, {reducemax_1_out_0}); + reducemax_1.AddAttribute("axes", std::vector{1}); + reducemax_1.AddAttribute("keepdims", (int64_t)0); + } }; auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) { @@ -4027,7 +4139,7 @@ TEST(TransposeOptimizerTests, TestOmitIdentityTranspose) { check_optimized_graph_1, TransformerLevel::Default, TransformerLevel::Level1, - /*opset_version*/ 15); + /*opset_version*/ {15, 18}); } // regression test for a model where the transpose optimizations were not completed in a single pass in level 1. From 96b95a24eeec2c760dfd08b7ac98868860a890c0 Mon Sep 17 00:00:00 2001 From: Boyd Johnson Date: Wed, 8 Feb 2023 16:57:15 -0600 Subject: [PATCH 36/68] Add rust bindings (#12606) This adds updated Rust bindings that have been located at [nbigaouette/onnxruntime-rs](https://github.com/nbigaouette/onnxruntime-rs). check out the build instructions included in this PR at /rust/BUILD.md. Changes to the bindings included in this PR: - The bindings are generated with the build script on each build - The onnxruntime shared library is built with ORT_RUST_STRATEGY=compile which is now the default. - A memory leak was fixed where a call to free wasn't called - Several small memory errors were fixed - Session is Send but not Sync, Environment is Send + Sync - Inputs and Outputs can be ndarray::Arrays of many different types. 
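To give a concrete feel for the high-level API, a rough usage sketch follows. It is illustrative only: the names used below (`Environment::builder`, `new_session_builder`, `with_model_from_file`, `OrtOwnedTensor`) are carried over from the previous upstream bindings and may not match this PR exactly; `rust/onnxruntime/examples/sample.rs` and the integration tests are the authoritative examples.

```rust
// Hypothetical sketch only; see rust/onnxruntime/examples/sample.rs for the real API.
use onnxruntime::{
    environment::Environment, tensor::OrtOwnedTensor, GraphOptimizationLevel, LoggingLevel,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One Environment per process; it is Send + Sync and can be shared across threads.
    let environment = Environment::builder()
        .with_name("example")
        .with_log_level(LoggingLevel::Warning)
        .build()?;

    // A Session is Send but not Sync: move it into a thread, or guard it with a Mutex.
    let mut session = environment
        .new_session_builder()?
        .with_optimization_level(GraphOptimizationLevel::Basic)?
        .with_number_threads(1)?
        .with_model_from_file("squeezenet1.0-8.onnx")?;

    // Inputs are plain ndarray arrays; outputs are tensors that borrow ORT-owned memory.
    let input = ndarray::Array::<f32, _>::zeros((1, 3, 224, 224));
    let outputs: Vec<OrtOwnedTensor<f32, _>> = session.run(vec![input])?;
    println!("output 0 has {} values", outputs[0].len());
    Ok(())
}
```

One practical consequence of `Session` being `Send` but not `Sync` is that concurrent inference needs either one session per thread or an external `Mutex`; the `Environment`, being `Send + Sync`, can be shared freely.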
Some commits can be squashed, if wanted, but were left unsquashed to show differences between old bindings and new bindings. This PR does not cover packaging nor does it include the Rust bindings withing the build system. For those of you who have previous Rust code based on the bindings, these new bindings can be used as a `path` dependency or a `git` dependency (though I have not tested this out). The work addressed in this PR was discussed in #11992 --- .gitignore | 4 + rust/BUILD.md | 48 ++ rust/Cargo.toml | 5 + rust/LICENSE-APACHE | 201 +++++ rust/LICENSE-MIT | 21 + rust/README.md | 196 +++++ rust/onnxruntime-sys/Cargo.toml | 35 + rust/onnxruntime-sys/build.rs | 429 ++++++++++ rust/onnxruntime-sys/examples/c_api_sample.rs | 395 +++++++++ rust/onnxruntime-sys/src/lib.rs | 15 + rust/onnxruntime/Cargo.toml | 43 + rust/onnxruntime/examples/issue22.rs | 55 ++ rust/onnxruntime/examples/print_structure.rs | 47 + rust/onnxruntime/examples/sample.rs | 83 ++ rust/onnxruntime/src/download.rs | 113 +++ rust/onnxruntime/src/download/language.rs | 25 + .../language/machine_comprehension.rs | 127 +++ rust/onnxruntime/src/download/vision.rs | 45 + .../vision/body_face_gesture_analysis.rs | 43 + .../domain_based_image_classification.rs | 30 + .../download/vision/image_classification.rs | 350 ++++++++ .../src/download/vision/image_manipulation.rs | 86 ++ .../object_detection_image_segmentation.rs | 107 +++ rust/onnxruntime/src/environment.rs | 373 ++++++++ rust/onnxruntime/src/error.rs | 249 ++++++ rust/onnxruntime/src/lib.rs | 560 ++++++++++++ rust/onnxruntime/src/memory.rs | 81 ++ rust/onnxruntime/src/session.rs | 806 ++++++++++++++++++ rust/onnxruntime/src/tensor.rs | 31 + rust/onnxruntime/src/tensor/construct.rs | 34 + rust/onnxruntime/src/tensor/ndarray_tensor.rs | 210 +++++ .../src/tensor/ort_input_tensor.rs | 325 +++++++ .../src/tensor/ort_output_tensor.rs | 347 ++++++++ rust/onnxruntime/tests/data/mnist_5.jpg | Bin 0 -> 555 bytes rust/onnxruntime/tests/data/mushroom.png | Bin 0 -> 106499 bytes rust/onnxruntime/tests/data/upsample.onnx | Bin 0 -> 1861 bytes rust/onnxruntime/tests/integration_tests.rs | 555 ++++++++++++ rust/rustfmt.toml | 2 + 38 files changed, 6076 insertions(+) create mode 100644 rust/BUILD.md create mode 100644 rust/Cargo.toml create mode 100644 rust/LICENSE-APACHE create mode 100644 rust/LICENSE-MIT create mode 100644 rust/README.md create mode 100644 rust/onnxruntime-sys/Cargo.toml create mode 100644 rust/onnxruntime-sys/build.rs create mode 100644 rust/onnxruntime-sys/examples/c_api_sample.rs create mode 100644 rust/onnxruntime-sys/src/lib.rs create mode 100644 rust/onnxruntime/Cargo.toml create mode 100644 rust/onnxruntime/examples/issue22.rs create mode 100644 rust/onnxruntime/examples/print_structure.rs create mode 100644 rust/onnxruntime/examples/sample.rs create mode 100644 rust/onnxruntime/src/download.rs create mode 100644 rust/onnxruntime/src/download/language.rs create mode 100644 rust/onnxruntime/src/download/language/machine_comprehension.rs create mode 100644 rust/onnxruntime/src/download/vision.rs create mode 100644 rust/onnxruntime/src/download/vision/body_face_gesture_analysis.rs create mode 100644 rust/onnxruntime/src/download/vision/domain_based_image_classification.rs create mode 100644 rust/onnxruntime/src/download/vision/image_classification.rs create mode 100644 rust/onnxruntime/src/download/vision/image_manipulation.rs create mode 100644 rust/onnxruntime/src/download/vision/object_detection_image_segmentation.rs create mode 100644 
rust/onnxruntime/src/environment.rs create mode 100644 rust/onnxruntime/src/error.rs create mode 100644 rust/onnxruntime/src/lib.rs create mode 100644 rust/onnxruntime/src/memory.rs create mode 100644 rust/onnxruntime/src/session.rs create mode 100644 rust/onnxruntime/src/tensor.rs create mode 100644 rust/onnxruntime/src/tensor/construct.rs create mode 100644 rust/onnxruntime/src/tensor/ndarray_tensor.rs create mode 100644 rust/onnxruntime/src/tensor/ort_input_tensor.rs create mode 100644 rust/onnxruntime/src/tensor/ort_output_tensor.rs create mode 100644 rust/onnxruntime/tests/data/mnist_5.jpg create mode 100644 rust/onnxruntime/tests/data/mushroom.png create mode 100644 rust/onnxruntime/tests/data/upsample.onnx create mode 100644 rust/onnxruntime/tests/integration_tests.rs create mode 100644 rust/rustfmt.toml diff --git a/.gitignore b/.gitignore index 26620d1bd5214..739ec17ca2fce 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,7 @@ onnxruntime/python/version_info.py # clangd .cache/ compile_commands.json +# Rust specific +rust/**/target +rust/**/Cargo.lock +rust/onnxruntime/synset.txt diff --git a/rust/BUILD.md b/rust/BUILD.md new file mode 100644 index 0000000000000..68500c7fc624a --- /dev/null +++ b/rust/BUILD.md @@ -0,0 +1,48 @@ +# Building and testing the Rust bindings + +These instructions require cargo and rustc. +To get these follow the instructions at [https://rustup.rs](https://rustup.rs) +The instructions compile the onnxruntime along with the bindings, +so require `cmake`, a python 3 interpreter, clang (needed to parse the C headers to generate the Rust bindings), +and the platform compiler to compile onnxruntime. + +## Local setup of onnxruntime repo + +```sh + git clone https://github.com/microsoft/onnxruntime + cd onnxruntime + git submodule update --init --recursive +``` + +## cargo build both crates + +from the root of onnxruntime repo + +```sh + CARGO_TARGET_DIR=build/rust cargo build --manifest-path rust/Cargo.toml +``` + +The CARGO_TARGET_DIR environment variable puts the build artifacts in `onnxruntime/build/rust` +instead of `onnxruntime/rust/target`. + +## cargo test both crates + +```sh + CARGO_TARGET_DIR=build/rust cargo test --manifest-path rust/Cargo.toml --features model-fetching +``` + +### cargo test both crates while specifying the absolute path to the OnnxRuntime shared library. + +```sh + RUST_ONNXRUNTIME_LIBRARY_PATH= CARGO_TARGET_DIR=build/rust cargo test --manifest-path rust/Cargo.toml --features model-fetching +``` + +## cargo test with sanitizer support + +**If you are using a nightly Rust compiler and are on one the platforms listed in [Rust sanitizer support](https://doc.rust-lang.org/beta/unstable-book/compiler-flags/sanitizer.html).** + +where `$SAN` is one of `address`, `thread`, `memory` or `leak` + +```sh + RUSTFLAGS="-Zsanitizer=$SAN" CARGO_TARGET_DIR=build/rust cargo test --manifest-path rust/Cargo.toml --features model-fetching --target -Z build-std -- --test-threads=1 +``` diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000000000..7c33647c5d3da --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,5 @@ +[workspace] +members = [ + "onnxruntime-sys", + "onnxruntime", +] diff --git a/rust/LICENSE-APACHE b/rust/LICENSE-APACHE new file mode 100644 index 0000000000000..e0284d8a8d512 --- /dev/null +++ b/rust/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright 2020 Nicolas Bigaouette + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/rust/LICENSE-MIT b/rust/LICENSE-MIT new file mode 100644 index 0000000000000..2b6d07c1daf81 --- /dev/null +++ b/rust/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Nicolas Bigaouette + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000000000..14b9e8cd632b4 --- /dev/null +++ b/rust/README.md @@ -0,0 +1,196 @@ +# ONNX Runtime + +These are Rust bindings to +[Microsoft's ONNX Runtime](https://github.com/microsoft/onnxruntime). + +This project consists of two crates: + +* [`onnxruntime-sys`](onnxruntime-sys): Low-level binding to the C API; +* [`onnxruntime`](onnxruntime): High-level and safe API. + +The `build.rs` script supports downloading pre-built versions of the Microsoft ONNX Runtime, +which provides the following targets: + +CPU: + +* Linux x86_64 +* macOS x86_64 +* macOS aarch64 +* Windows i686 +* Windows x86_64 + +GPU: + +* Linux x86_64 +* Windows x86_64 + +--- + +**WARNING**: + +* This is an experiment and work in progress; it is _not_ complete/working/safe. Help welcome! +* Basic inference works, see [`onnxruntime/examples/sample.rs`](onnxruntime/examples/sample.rs) or [`onnxruntime/tests/integration_tests.rs`](onnxruntime/tests/integration_tests.rs) +* ONNX Runtime has many options to control the inference process but those options are not yet exposed. + +--- + +## Setup + +Three different strategy to obtain the ONNX Runtime are supported by the `build.rs` script: + +1. Download a pre-built binary from upstream; +2. Point to a local version already installed; +3. Compile from source. + +To select which strategy to use, set the `ORT_RUST_STRATEGY` environment variable to: + +1. `download`: Download prebuilt onnxruntime; +2. `system`: To use a locally installed version (use `ORT_RUST_LIB_LOCATION` environment variable to point to the install path) +3. `compile`: To compile the library. This is the default. + +The `download` strategy supports downloading a version of ONNXRuntime that supports CUDA. To use this, set the +environment variable `ORT_RUST_USE_CUDA=1` (only supports Linux or Windows). + +### Note on 'ORT_RUST_STRATEGY=system' + +When using `ORT_RUST_STRATEGY=system`, executing a built crate binary (for example the tests) might fail, at least on macOS, +if the library is not installed in a system path. 
An error similar to the following happens: + +```text +dyld: Library not loaded: @rpath/libonnxruntime.1.7.1.dylib + Referenced from: onnxruntime-rs.git/target/debug/deps/onnxruntime_sys-22eb0e3e89a0278c + Reason: image not found +``` + +To fix, one can either: + +* Set the `LD_LIBRARY_PATH` environment variable to point to the path where the library can be found. +* Adapt the `.cargo/config` file to contain a linker flag to provide the **full** path: + + ```toml + [target.aarch64-apple-darwin] + rustflags = ["-C", "link-args=-Wl,-rpath,/full/path/to/onnxruntime/lib"] + ``` + +See [rust-lang/cargo #5077](https://github.com/rust-lang/cargo/issues/5077) for more information. + +## Example + +The C++ example that uses the C API +([`C_Api_Sample.cpp`](https://github.com/microsoft/onnxruntime/blob/v1.3.1/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp)) +was ported to both the low level crate (`onnxruntime-sys`) and the high level on (`onnxruntime`). + +### onnxruntime-sys + +To run this example ([`onnxruntime-sys/examples/c_api_sample.rs`](onnxruntime-sys/examples/c_api_sample.rs)): + +```sh +# Download the model (SqueezeNet 1.0, ONNX version: 1.3, Opset version: 8) +❯ curl -LO "https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-8.onnx" +❯ cargo run --example c_api_sample +[...] + Finished dev [unoptimized + debuginfo] target(s) in 1.88s + Running `target/debug/examples/c_api_sample` +Using Onnxruntime C API +2020-08-09 09:37:41.554922 [I:onnxruntime:, inference_session.cc:174 ConstructorCommon] Creating and using per session threadpools since use_per_session_threads_ is true +2020-08-09 09:37:41.556650 [I:onnxruntime:, inference_session.cc:830 Initialize] Initializing session. +2020-08-09 09:37:41.556665 [I:onnxruntime:, inference_session.cc:848 Initialize] Adding default CPU execution provider. +2020-08-09 09:37:41.556678 [I:onnxruntime:test, bfc_arena.cc:15 BFCArena] Creating BFCArena for Cpu +2020-08-09 09:37:41.556687 [V:onnxruntime:test, bfc_arena.cc:32 BFCArena] Creating 21 bins of max chunk size 256 to 268435456 +2020-08-09 09:37:41.558313 [I:onnxruntime:, reshape_fusion.cc:37 ApplyImpl] Total fused reshape node count: 0 +2020-08-09 09:37:41.559327 [I:onnxruntime:, reshape_fusion.cc:37 ApplyImpl] Total fused reshape node count: 0 +2020-08-09 09:37:41.559476 [I:onnxruntime:, reshape_fusion.cc:37 ApplyImpl] Total fused reshape node count: 0 +2020-08-09 09:37:41.559607 [V:onnxruntime:, inference_session.cc:671 TransformGraph] Node placements +2020-08-09 09:37:41.559615 [V:onnxruntime:, inference_session.cc:673 TransformGraph] All nodes have been placed on [CPUExecutionProvider]. +2020-08-09 09:37:41.559639 [I:onnxruntime:, session_state.cc:25 SetGraph] SaveMLValueNameIndexMapping +2020-08-09 09:37:41.559787 [I:onnxruntime:, session_state.cc:70 SetGraph] Done saving OrtValue mappings. +2020-08-09 09:37:41.560252 [I:onnxruntime:, session_state_initializer.cc:178 SaveInitializedTensors] Saving initialized tensors. +2020-08-09 09:37:41.563467 [I:onnxruntime:, session_state_initializer.cc:223 SaveInitializedTensors] Done saving initialized tensors +2020-08-09 09:37:41.563979 [I:onnxruntime:, inference_session.cc:919 Initialize] Session successfully initialized. 
+Number of inputs = 1 +Input 0 : name=data_0 +Input 0 : type=1 +Input 0 : num_dims=4 +Input 0 : dim 0=1 +Input 0 : dim 1=3 +Input 0 : dim 2=224 +Input 0 : dim 3=224 +2020-08-09 09:37:41.573127 [I:onnxruntime:, sequential_executor.cc:145 Execute] Begin execution +2020-08-09 09:37:41.573183 [I:onnxruntime:test, bfc_arena.cc:259 AllocateRawInternal] Extending BFCArena for Cpu. bin_num:13 rounded_bytes:3154176 +2020-08-09 09:37:41.573197 [I:onnxruntime:test, bfc_arena.cc:143 Extend] Extended allocation by 4194304 bytes. +2020-08-09 09:37:41.573203 [I:onnxruntime:test, bfc_arena.cc:147 Extend] Total allocated bytes: 9137152 +2020-08-09 09:37:41.573212 [I:onnxruntime:test, bfc_arena.cc:150 Extend] Allocated memory at 0x7fb7d6cb7000 to 0x7fb7d70b7000 +2020-08-09 09:37:41.573248 [I:onnxruntime:test, bfc_arena.cc:259 AllocateRawInternal] Extending BFCArena for Cpu. bin_num:8 rounded_bytes:65536 +2020-08-09 09:37:41.573256 [I:onnxruntime:test, bfc_arena.cc:143 Extend] Extended allocation by 4194304 bytes. +2020-08-09 09:37:41.573262 [I:onnxruntime:test, bfc_arena.cc:147 Extend] Total allocated bytes: 13331456 +2020-08-09 09:37:41.573268 [I:onnxruntime:test, bfc_arena.cc:150 Extend] Allocated memory at 0x7fb7d70b7000 to 0x7fb7d74b7000 +Score for class [0] = 0.000045440644 +Score for class [1] = 0.0038458651 +Score for class [2] = 0.00012494653 +Score for class [3] = 0.0011804523 +Score for class [4] = 0.0013169361 +Done! +``` + +### onnxruntime + +To run this example ([`onnxruntime/examples/sample.rs`](onnxruntime/examples/sample.rs)): + +```sh +# Download the model (SqueezeNet 1.0, ONNX version: 1.3, Opset version: 8) +❯ curl -LO "https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-8.onnx" +❯ cargo run --example sample +[...] + Finished dev [unoptimized + debuginfo] target(s) in 13.62s + Running `target/debug/examples/sample` +Uninitialized environment found, initializing it with name "test". +2020-08-09 09:34:37.395577 [I:onnxruntime:, inference_session.cc:174 ConstructorCommon] Creating and using per session threadpools since use_per_session_threads_ is true +2020-08-09 09:34:37.399253 [I:onnxruntime:, inference_session.cc:830 Initialize] Initializing session. +2020-08-09 09:34:37.399284 [I:onnxruntime:, inference_session.cc:848 Initialize] Adding default CPU execution provider. +2020-08-09 09:34:37.399313 [I:onnxruntime:test, bfc_arena.cc:15 BFCArena] Creating BFCArena for Cpu +2020-08-09 09:34:37.399335 [V:onnxruntime:test, bfc_arena.cc:32 BFCArena] Creating 21 bins of max chunk size 256 to 268435456 +2020-08-09 09:34:37.410516 [I:onnxruntime:, reshape_fusion.cc:37 ApplyImpl] Total fused reshape node count: 0 +2020-08-09 09:34:37.417478 [I:onnxruntime:, reshape_fusion.cc:37 ApplyImpl] Total fused reshape node count: 0 +2020-08-09 09:34:37.420131 [I:onnxruntime:, reshape_fusion.cc:37 ApplyImpl] Total fused reshape node count: 0 +2020-08-09 09:34:37.422623 [V:onnxruntime:, inference_session.cc:671 TransformGraph] Node placements +2020-08-09 09:34:37.428863 [V:onnxruntime:, inference_session.cc:673 TransformGraph] All nodes have been placed on [CPUExecutionProvider]. +2020-08-09 09:34:37.428954 [I:onnxruntime:, session_state.cc:25 SetGraph] SaveMLValueNameIndexMapping +2020-08-09 09:34:37.429079 [I:onnxruntime:, session_state.cc:70 SetGraph] Done saving OrtValue mappings. +2020-08-09 09:34:37.429925 [I:onnxruntime:, session_state_initializer.cc:178 SaveInitializedTensors] Saving initialized tensors. 
+2020-08-09 09:34:37.436300 [I:onnxruntime:, session_state_initializer.cc:223 SaveInitializedTensors] Done saving initialized tensors +2020-08-09 09:34:37.437255 [I:onnxruntime:, inference_session.cc:919 Initialize] Session successfully initialized. +Dropping the session options. +2020-08-09 09:34:37.448956 [I:onnxruntime:, sequential_executor.cc:145 Execute] Begin execution +2020-08-09 09:34:37.449041 [I:onnxruntime:test, bfc_arena.cc:259 AllocateRawInternal] Extending BFCArena for Cpu. bin_num:13 rounded_bytes:3154176 +2020-08-09 09:34:37.449072 [I:onnxruntime:test, bfc_arena.cc:143 Extend] Extended allocation by 4194304 bytes. +2020-08-09 09:34:37.449087 [I:onnxruntime:test, bfc_arena.cc:147 Extend] Total allocated bytes: 9137152 +2020-08-09 09:34:37.449104 [I:onnxruntime:test, bfc_arena.cc:150 Extend] Allocated memory at 0x7fb3b9585000 to 0x7fb3b9985000 +2020-08-09 09:34:37.449176 [I:onnxruntime:test, bfc_arena.cc:259 AllocateRawInternal] Extending BFCArena for Cpu. bin_num:8 rounded_bytes:65536 +2020-08-09 09:34:37.449196 [I:onnxruntime:test, bfc_arena.cc:143 Extend] Extended allocation by 4194304 bytes. +2020-08-09 09:34:37.449209 [I:onnxruntime:test, bfc_arena.cc:147 Extend] Total allocated bytes: 13331456 +2020-08-09 09:34:37.449222 [I:onnxruntime:test, bfc_arena.cc:150 Extend] Allocated memory at 0x7fb3b9985000 to 0x7fb3b9d85000 +Dropping Tensor. +Score for class [0] = 0.000045440578 +Score for class [1] = 0.0038458686 +Score for class [2] = 0.0001249467 +Score for class [3] = 0.0011804511 +Score for class [4] = 0.00131694 +Dropping TensorFromOrt. +Dropping the session. +Dropping the memory information. +Dropping the environment. +``` + +See also the integration tests ([`onnxruntime/tests/integration_tests.rs`](onnxruntime/tests/integration_tests.rs)) +that performs simple model download and inference, validating the results. + +## License + +The Rust bindings are licensed under either of + +* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + http://www.apache.org/licenses/LICENSE-2.0) +* MIT license ([LICENSE-MIT](LICENSE-MIT) or + http://opensource.org/licenses/MIT) + +at your option. diff --git a/rust/onnxruntime-sys/Cargo.toml b/rust/onnxruntime-sys/Cargo.toml new file mode 100644 index 0000000000000..4806e6ca2953c --- /dev/null +++ b/rust/onnxruntime-sys/Cargo.toml @@ -0,0 +1,35 @@ +[package] +authors = ["Nicolas Bigaouette "] +edition = "2018" +name = "onnxruntime-sys" +version = "0.0.14" + +links = "onnxruntime" + +description = "Unsafe wrapper around Microsoft's ONNX Runtime" +documentation = "https://docs.rs/onnxruntime-sys" +homepage = "https://github.com/microsoft/onnxruntime" +license = "MIT OR Apache-2.0" +readme = "../README.md" +repository = "https://github.com/microsoft/onnxruntime" + +categories = ["science"] +keywords = ["neuralnetworks", "onnx", "bindings"] + +[dependencies] +libloading = "0.7" + +[build-dependencies] +bindgen = "0.63" +cmake = "0.1" + +# Used on unix +flate2 = "1.0" +tar = "0.4" +ureq = "2.1" + +# Used on Windows +zip = "0.6" + +[features] +default = [] diff --git a/rust/onnxruntime-sys/build.rs b/rust/onnxruntime-sys/build.rs new file mode 100644 index 0000000000000..82d1e4278015c --- /dev/null +++ b/rust/onnxruntime-sys/build.rs @@ -0,0 +1,429 @@ +#![allow(dead_code)] + +use std::{ + borrow::Cow, + env, fs, + io::{self, Read, Write}, + path::{Path, PathBuf}, + str::FromStr, +}; + +/// ONNX Runtime version +/// +/// WARNING: If version is changed, bindings for all platforms will have to be re-generated. 
+/// To do so, run this: +/// cargo build --package onnxruntime-sys --features generate-bindings +const ORT_VERSION: &str = include_str!("../../VERSION_NUMBER"); + +/// Base Url from which to download pre-built releases/ +const ORT_RELEASE_BASE_URL: &str = "https://github.com/microsoft/onnxruntime/releases/download"; + +/// Environment variable selecting which strategy to use for finding the library +/// Possibilities: +/// * "download": Download a pre-built library. This is the default if `ORT_STRATEGY` is not set. +/// * "system": Use installed library. Use `ORT_LIB_LOCATION` to point to proper location. +/// * "compile": Download source and compile (TODO). +const ORT_RUST_ENV_STRATEGY: &str = "ORT_RUST_STRATEGY"; + +/// Name of environment variable that, if present, contains the location of a pre-built library. +/// Only used if `ORT_STRATEGY=system`. +const ORT_RUST_ENV_SYSTEM_LIB_LOCATION: &str = "ORT_RUST_LIB_LOCATION"; +/// Name of environment variable that, if present, controls whether to use CUDA or not. +const ORT_RUST_ENV_GPU: &str = "ORT_RUST_USE_CUDA"; + +/// Subdirectory (of the 'target' directory) into which to extract the prebuilt library. +const ORT_PREBUILT_EXTRACT_DIR: &str = "onnxruntime"; + +fn main() { + let libort_install_dir = prepare_libort_dir(); + + let include_dir = libort_install_dir.join("include"); + let lib_dir = libort_install_dir.join("lib"); + + println!("Include directory: {:?}", include_dir); + println!("Lib directory: {:?}", lib_dir); + + // Tell cargo to tell rustc to link onnxruntime shared library. + println!("cargo:rustc-link-lib=onnxruntime"); + println!("cargo:rustc-link-search=native={}", lib_dir.display()); + + println!("cargo:rerun-if-env-changed={}", ORT_RUST_ENV_STRATEGY); + println!("cargo:rerun-if-env-changed={}", ORT_RUST_ENV_GPU); + println!( + "cargo:rerun-if-env-changed={}", + ORT_RUST_ENV_SYSTEM_LIB_LOCATION + ); + + generate_bindings(&include_dir); +} + +fn generate_bindings(include_dir: &Path) { + let clang_args = &[ + format!("-I{}", include_dir.display()), + format!( + "-I{}", + include_dir + .join("onnxruntime") + .join("core") + .join("session") + .display() + ), + ]; + + let path = include_dir + .join("onnxruntime") + .join("core") + .join("session") + .join("onnxruntime_c_api.h"); + + // The bindgen::Builder is the main entry point + // to bindgen, and lets you build up options for + // the resulting bindings. + let bindings = bindgen::Builder::default() + // The input header we would like to generate + // bindings for. + .header(path.to_string_lossy().to_string()) + // The current working directory is 'onnxruntime-sys' + .clang_args(clang_args) + // Tell cargo to invalidate the built crate whenever any of the + // included header files changed. + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + .dynamic_library_name("onnxruntime") + .allowlist_type("Ort.*") + .allowlist_type("Onnx.*") + .allowlist_type("ONNX.*") + .allowlist_function("Ort.*") + .allowlist_var("ORT.*") + // Set `size_t` to be translated to `usize` for win32 compatibility. + .size_t_is_usize(true) + // Format using rustfmt + .rustfmt_bindings(true) + .rustified_enum(".*") + // Finish the builder and generate the bindings. + .generate() + // Unwrap the Result and panic on failure. 
+ .expect("Unable to generate bindings"); + + let generated_file = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs"); + println!("cargo:rerun-if-changed={:?}", generated_file); + bindings + .write_to_file(&generated_file) + .expect("Couldn't write bindings!"); +} + +fn download
<P>
(source_url: &str, target_file: P) +where + P: AsRef, +{ + let resp = ureq::get(source_url) + .timeout(std::time::Duration::from_secs(300)) + .call() + .unwrap_or_else(|err| panic!("ERROR: Failed to download {}: {:?}", source_url, err)); + + let len = resp + .header("Content-Length") + .and_then(|s| s.parse::().ok()) + .unwrap(); + let mut reader = resp.into_reader(); + // FIXME: Save directly to the file + let mut buffer = vec![]; + let read_len = reader.read_to_end(&mut buffer).unwrap(); + assert_eq!(buffer.len(), len); + assert_eq!(buffer.len(), read_len); + + let f = fs::File::create(&target_file).unwrap(); + let mut writer = io::BufWriter::new(f); + writer.write_all(&buffer).unwrap(); +} + +fn extract_archive(filename: &Path, output: &Path) { + match filename.extension().map(std::ffi::OsStr::to_str) { + Some(Some("zip")) => extract_zip(filename, output), + Some(Some("tgz")) => extract_tgz(filename, output), + _ => unimplemented!(), + } +} + +fn extract_tgz(filename: &Path, output: &Path) { + let file = fs::File::open(&filename).unwrap(); + let buf = io::BufReader::new(file); + let tar = flate2::read::GzDecoder::new(buf); + let mut archive = tar::Archive::new(tar); + archive.unpack(output).unwrap(); +} + +fn extract_zip(filename: &Path, outpath: &Path) { + let file = fs::File::open(&filename).unwrap(); + let buf = io::BufReader::new(file); + let mut archive = zip::ZipArchive::new(buf).unwrap(); + for i in 0..archive.len() { + let mut file = archive.by_index(i).unwrap(); + #[allow(deprecated)] + let outpath = outpath.join(file.sanitized_name()); + if !file.name().ends_with('/') { + println!( + "File {} extracted to \"{}\" ({} bytes)", + i, + outpath.as_path().display(), + file.size() + ); + if let Some(p) = outpath.parent() { + if !p.exists() { + fs::create_dir_all(&p).unwrap(); + } + } + let mut outfile = fs::File::create(&outpath).unwrap(); + io::copy(&mut file, &mut outfile).unwrap(); + } + } +} + +trait OnnxPrebuiltArchive { + fn as_onnx_str(&self) -> Cow; +} + +#[derive(Debug)] +enum Architecture { + X86, + X86_64, + Arm, + Arm64, +} + +impl FromStr for Architecture { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "x86" => Ok(Architecture::X86), + "x86_64" => Ok(Architecture::X86_64), + "arm" => Ok(Architecture::Arm), + "aarch64" => Ok(Architecture::Arm64), + _ => Err(format!("Unsupported architecture: {}", s)), + } + } +} + +impl OnnxPrebuiltArchive for Architecture { + fn as_onnx_str(&self) -> Cow { + match self { + Architecture::X86 => Cow::from("x86"), + Architecture::X86_64 => Cow::from("x64"), + Architecture::Arm => Cow::from("arm"), + Architecture::Arm64 => Cow::from("arm64"), + } + } +} + +#[derive(Debug)] +#[allow(clippy::enum_variant_names)] +enum Os { + Windows, + Linux, + MacOs, +} + +impl Os { + fn archive_extension(&self) -> &'static str { + match self { + Os::Windows => "zip", + Os::Linux => "tgz", + Os::MacOs => "tgz", + } + } +} + +impl FromStr for Os { + type Err = String; + + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "windows" => Ok(Os::Windows), + "macos" => Ok(Os::MacOs), + "linux" => Ok(Os::Linux), + _ => Err(format!("Unsupported os: {}", s)), + } + } +} + +impl OnnxPrebuiltArchive for Os { + fn as_onnx_str(&self) -> Cow { + match self { + Os::Windows => Cow::from("win"), + Os::Linux => Cow::from("linux"), + Os::MacOs => Cow::from("osx"), + } + } +} + +#[derive(Debug, PartialEq, Eq)] +enum Accelerator { + Cpu, + Cuda, +} + +impl FromStr for Accelerator { + type Err = String; + + fn 
from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "1" | "yes" | "true" | "on" => Ok(Accelerator::Cuda), + _ => Ok(Accelerator::Cpu), + } + } +} + +impl OnnxPrebuiltArchive for Accelerator { + fn as_onnx_str(&self) -> Cow { + match self { + Accelerator::Cpu => Cow::from(""), + Accelerator::Cuda => Cow::from("gpu"), + } + } +} + +#[derive(Debug)] +struct Triplet { + os: Os, + arch: Architecture, + accelerator: Accelerator, +} + +impl OnnxPrebuiltArchive for Triplet { + fn as_onnx_str(&self) -> Cow { + match (&self.os, &self.arch, &self.accelerator) { + // onnxruntime-win-x86-1.11.1.zip + // onnxruntime-win-x64-1.11.1.zip + // onnxruntime-win-arm-1.11.1.zip + // onnxruntime-win-arm64-1.11.1.zip + // onnxruntime-linux-x64-1.11.1.tgz + // onnxruntime-osx-x86_64-1.11.1.tgz + // onnxruntime-osx-arm64-1.11.1.tgz + ( + Os::Windows, + Architecture::X86 | Architecture::X86_64 | Architecture::Arm | Architecture::Arm64, + Accelerator::Cpu, + ) + | (Os::MacOs, Architecture::Arm64, Accelerator::Cpu) + | (Os::Linux, Architecture::X86_64, Accelerator::Cpu) => Cow::from(format!( + "{}-{}", + self.os.as_onnx_str(), + self.arch.as_onnx_str() + )), + (Os::MacOs, Architecture::X86_64, Accelerator::Cpu) => Cow::from(format!( + "{}-x86_{}", + self.os.as_onnx_str(), + self.arch.as_onnx_str().trim_start_matches('x') + )), + // onnxruntime-win-x64-gpu-1.11.1.zip + // onnxruntime-linux-x64-gpu-1.11.1.tgz + (Os::Linux | Os::Windows, Architecture::X86_64, Accelerator::Cuda) => { + Cow::from(format!( + "{}-{}-{}", + self.os.as_onnx_str(), + self.arch.as_onnx_str(), + self.accelerator.as_onnx_str(), + )) + } + _ => { + panic!( + "Unsupported prebuilt triplet: {:?}, {:?}, {:?}. Please use {}=system and {}=/path/to/onnxruntime", + self.os, self.arch, self.accelerator, ORT_RUST_ENV_STRATEGY, ORT_RUST_ENV_SYSTEM_LIB_LOCATION + ); + } + } + } +} + +fn prebuilt_archive_url() -> (PathBuf, String) { + let triplet = Triplet { + os: env::var("CARGO_CFG_TARGET_OS") + .expect("Unable to get TARGET_OS") + .parse() + .unwrap(), + arch: env::var("CARGO_CFG_TARGET_ARCH") + .expect("Unable to get TARGET_ARCH") + .parse() + .unwrap(), + accelerator: env::var(ORT_RUST_ENV_GPU) + .unwrap_or_default() + .parse() + .unwrap(), + }; + + let prebuilt_archive = format!( + "onnxruntime-{}-{}.{}", + triplet.as_onnx_str(), + ORT_VERSION, + triplet.os.archive_extension() + ); + let prebuilt_url = format!( + "{}/v{}/{}", + ORT_RELEASE_BASE_URL, ORT_VERSION, prebuilt_archive + ); + + (PathBuf::from(prebuilt_archive), prebuilt_url) +} + +fn prepare_libort_dir_prebuilt() -> PathBuf { + let (prebuilt_archive, prebuilt_url) = prebuilt_archive_url(); + + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let extract_dir = out_dir.join(ORT_PREBUILT_EXTRACT_DIR); + let downloaded_file = out_dir.join(&prebuilt_archive); + + println!("cargo:rerun-if-changed={}", downloaded_file.display()); + + if !downloaded_file.exists() { + println!("Creating directory {:?}", out_dir); + fs::create_dir_all(&out_dir).unwrap(); + + println!( + "Downloading {} into {}", + prebuilt_url, + downloaded_file.display() + ); + download(&prebuilt_url, &downloaded_file); + } + + if !extract_dir.exists() { + println!("Extracting to {}...", extract_dir.display()); + extract_archive(&downloaded_file, &extract_dir); + } + + extract_dir.join(prebuilt_archive.file_stem().unwrap()) +} + +fn prepare_libort_dir() -> PathBuf { + let strategy = env::var(ORT_RUST_ENV_STRATEGY); + println!( + "strategy: {:?}", + strategy.as_ref().map_or_else(|_| "unknown", String::as_str) 
+ ); + match strategy.as_ref().map(String::as_str) { + Ok("download") => prepare_libort_dir_prebuilt(), + Ok("system") => PathBuf::from(match env::var(ORT_RUST_ENV_SYSTEM_LIB_LOCATION) { + Ok(p) => p, + Err(e) => { + panic!( + "Could not get value of environment variable {:?}: {:?}", + ORT_RUST_ENV_SYSTEM_LIB_LOCATION, e + ); + } + }), + Ok("compile") | Err(_) => prepare_libort_dir_compiled(), + _ => panic!("Unknown value for {:?}", ORT_RUST_ENV_STRATEGY), + } +} + +fn prepare_libort_dir_compiled() -> PathBuf { + let mut config = cmake::Config::new("../../cmake"); + + config.define("onnxruntime_BUILD_SHARED_LIB", "ON"); + + if env::var(ORT_RUST_ENV_GPU).unwrap_or_default().parse() == Ok(Accelerator::Cuda) { + config.define("onnxruntime_USE_CUDA", "ON"); + } + + config.build() +} diff --git a/rust/onnxruntime-sys/examples/c_api_sample.rs b/rust/onnxruntime-sys/examples/c_api_sample.rs new file mode 100644 index 0000000000000..499f1548de396 --- /dev/null +++ b/rust/onnxruntime-sys/examples/c_api_sample.rs @@ -0,0 +1,395 @@ +#![allow(non_snake_case)] + +use std::env::args; +#[cfg(not(target_family = "windows"))] +use std::os::unix::ffi::OsStrExt; +#[cfg(target_family = "windows")] +use std::os::windows::ffi::OsStrExt; + +use onnxruntime_sys::{ + onnxruntime, GraphOptimizationLevel, ONNXTensorElementDataType, OrtAllocator, OrtAllocatorType, + OrtApi, OrtEnv, OrtLoggingLevel, OrtMemType, OrtMemoryInfo, OrtRunOptions, OrtSession, + OrtSessionOptions, OrtStatus, OrtTensorTypeAndShapeInfo, OrtTypeInfo, OrtValue, + ORT_API_VERSION, +}; + +// https://github.com/microsoft/onnxruntime/blob/v1.4.0/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp + +fn main() { + let onnxruntime_path = args() + .nth(1) + .expect("This example expects a path to the ONNXRuntime shared library"); + + let (_, g_ort) = unsafe { + let ort = onnxruntime::new(onnxruntime_path); + + let ort = ort.expect("Error initializing onnxruntime"); + let g_ort = ort.OrtGetApiBase().as_ref().unwrap().GetApi.unwrap()(ORT_API_VERSION); + + (ort, g_ort) + }; + assert_ne!(g_ort, std::ptr::null_mut()); + + //************************************************************************* + // initialize enviroment...one enviroment per process + // enviroment maintains thread pools and other state info + let mut env_ptr: *mut OrtEnv = std::ptr::null_mut(); + let env_name = std::ffi::CString::new("test").unwrap(); + let status = unsafe { + g_ort.as_ref().unwrap().CreateEnv.unwrap()( + OrtLoggingLevel::ORT_LOGGING_LEVEL_VERBOSE, + env_name.as_ptr(), + &mut env_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(env_ptr, std::ptr::null_mut()); + + // initialize session options if needed + let mut session_options_ptr: *mut OrtSessionOptions = std::ptr::null_mut(); + let status = + unsafe { g_ort.as_ref().unwrap().CreateSessionOptions.unwrap()(&mut session_options_ptr) }; + CheckStatus(g_ort, status).unwrap(); + unsafe { g_ort.as_ref().unwrap().SetIntraOpNumThreads.unwrap()(session_options_ptr, 1) }; + assert_ne!(session_options_ptr, std::ptr::null_mut()); + + // Sets graph optimization level + unsafe { + g_ort + .as_ref() + .unwrap() + .SetSessionGraphOptimizationLevel + .unwrap()( + session_options_ptr, + GraphOptimizationLevel::ORT_ENABLE_BASIC, + ) + }; + + // Optionally add more execution providers via session_options + // E.g. 
for CUDA include cuda_provider_factory.h and uncomment the following line: + // OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, 0); + + //************************************************************************* + // create session and load model into memory + // NOTE: Original C version loaded SqueezeNet 1.0 (ONNX version: 1.3, Opset version: 8, + // https://github.com/onnx/models/blob/main/vision/classification/squeezenet/model/squeezenet1.0-8.onnx) + // Download it: + // curl -LO "https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-8.onnx" + // Reference: https://github.com/onnx/models/tree/main/vision/classification/squeezenet#model + let model_path = std::ffi::OsString::from("squeezenet1.0-8.onnx"); + + #[cfg(target_family = "windows")] + let model_path: Vec = model_path + .encode_wide() + .chain(std::iter::once(0)) // Make sure we have a null terminated string + .collect(); + #[cfg(not(target_family = "windows"))] + let model_path: Vec = model_path + .as_bytes() + .iter() + .chain(std::iter::once(&b'\0')) // Make sure we have a null terminated string + .map(|b| *b as std::os::raw::c_char) + .collect(); + + let mut session_ptr: *mut OrtSession = std::ptr::null_mut(); + + println!("Using Onnxruntime C API"); + let status = unsafe { + g_ort.as_ref().unwrap().CreateSession.unwrap()( + env_ptr, + model_path.as_ptr(), + session_options_ptr, + &mut session_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(session_ptr, std::ptr::null_mut()); + + //************************************************************************* + // print model input layer (node names, types, shape etc.) + // size_t num_input_nodes; + let mut allocator_ptr: *mut OrtAllocator = std::ptr::null_mut(); + let status = unsafe { + g_ort + .as_ref() + .unwrap() + .GetAllocatorWithDefaultOptions + .unwrap()(&mut allocator_ptr) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(allocator_ptr, std::ptr::null_mut()); + + // print number of model input nodes + let mut num_input_nodes: usize = 0; + let status = unsafe { + g_ort.as_ref().unwrap().SessionGetInputCount.unwrap()(session_ptr, &mut num_input_nodes) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(num_input_nodes, 0); + println!("Number of inputs = {:?}", num_input_nodes); + let mut input_node_names: Vec<&str> = Vec::new(); + let mut input_node_dims: Vec = Vec::new(); // simplify... this model has only 1 input node {1, 3, 224, 224}. + // Otherwise need vector> + + // iterate over all input nodes + for i in 0..num_input_nodes { + // print input node names + let mut input_name: *mut i8 = std::ptr::null_mut(); + let status = unsafe { + g_ort.as_ref().unwrap().SessionGetInputName.unwrap()( + session_ptr, + i, + allocator_ptr, + &mut input_name, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(input_name, std::ptr::null_mut()); + + // WARNING: The C function SessionGetInputName allocates memory for the string. + // We cannot let Rust free that string, the C side must free the string. + // We thus convert the pointer to a string slice (&str). 
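+    // Note that this example never hands the name string back to the allocator (a harmless
+    // leak for a one-shot sample); a long-running program would release it once done with it,
+    // e.g. through the OrtApi AllocatorFree call.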
+ let input_name = char_p_to_str(input_name).unwrap(); + println!("Input {} : name={}", i, input_name); + input_node_names.push(input_name); + + // print input node types + let mut typeinfo_ptr: *mut OrtTypeInfo = std::ptr::null_mut(); + let status = unsafe { + g_ort.as_ref().unwrap().SessionGetInputTypeInfo.unwrap()( + session_ptr, + i, + &mut typeinfo_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(typeinfo_ptr, std::ptr::null_mut()); + + let mut tensor_info_ptr: *const OrtTensorTypeAndShapeInfo = std::ptr::null_mut(); + let status = unsafe { + g_ort.as_ref().unwrap().CastTypeInfoToTensorInfo.unwrap()( + typeinfo_ptr, + &mut tensor_info_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(tensor_info_ptr, std::ptr::null_mut()); + + let mut type_: ONNXTensorElementDataType = + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + let status = unsafe { + g_ort.as_ref().unwrap().GetTensorElementType.unwrap()(tensor_info_ptr, &mut type_) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!( + type_, + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED + ); + + println!("Input {} : type={}", i, type_ as i32); + + // print input shapes/dims + let mut num_dims = 0; + let status = unsafe { + g_ort.as_ref().unwrap().GetDimensionsCount.unwrap()(tensor_info_ptr, &mut num_dims) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(num_dims, 0); + + println!("Input {} : num_dims={}", i, num_dims); + input_node_dims.resize_with(num_dims as usize, Default::default); + let status = unsafe { + g_ort.as_ref().unwrap().GetDimensions.unwrap()( + tensor_info_ptr, + input_node_dims.as_mut_ptr(), + num_dims, + ) + }; + CheckStatus(g_ort, status).unwrap(); + + for j in 0..num_dims { + println!("Input {} : dim {}={}", i, j, input_node_dims[j as usize]); + } + + unsafe { g_ort.as_ref().unwrap().ReleaseTypeInfo.unwrap()(typeinfo_ptr) }; + } + + // Results should be... + // Number of inputs = 1 + // Input 0 : name = data_0 + // Input 0 : type = 1 + // Input 0 : num_dims = 4 + // Input 0 : dim 0 = 1 + // Input 0 : dim 1 = 3 + // Input 0 : dim 2 = 224 + // Input 0 : dim 3 = 224 + + //************************************************************************* + // Similar operations to get output node information. + // Use OrtSessionGetOutputCount(), OrtSessionGetOutputName() + // OrtSessionGetOutputTypeInfo() as shown above. + + //************************************************************************* + // Score the model using sample data, and inspect values + + let input_tensor_size = 224 * 224 * 3; // simplify ... using known dim values to calculate size + // use OrtGetTensorShapeElementCount() to get official size! + + let output_node_names = &["softmaxout_1"]; + + // initialize input data with values in [0.0, 1.0] + let mut input_tensor_values: Vec = (0..input_tensor_size) + .map(|i| (i as f32) / ((input_tensor_size + 1) as f32)) + .collect(); + + // create input tensor object from data values + let mut memory_info_ptr: *mut OrtMemoryInfo = std::ptr::null_mut(); + let status = unsafe { + g_ort.as_ref().unwrap().CreateCpuMemoryInfo.unwrap()( + OrtAllocatorType::OrtArenaAllocator, + OrtMemType::OrtMemTypeDefault, + &mut memory_info_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(memory_info_ptr, std::ptr::null_mut()); + + // FIXME: Check me! 
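+    // The block below wraps the existing `input_tensor_values` buffer in an OrtValue via
+    // CreateTensorWithDataAsOrtValue; the C API does not copy the data, so the Rust Vec
+    // must stay alive and unmoved until the tensor is released near the end of main.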
+ let mut input_tensor_ptr: *mut OrtValue = std::ptr::null_mut(); + let input_tensor_ptr_ptr: *mut *mut OrtValue = &mut input_tensor_ptr; + let input_tensor_values_ptr: *mut std::ffi::c_void = + input_tensor_values.as_mut_ptr().cast::(); + assert_ne!(input_tensor_values_ptr, std::ptr::null_mut()); + + let shape: *const i64 = input_node_dims.as_ptr(); + assert_ne!(shape, std::ptr::null_mut()); + + let status = unsafe { + g_ort + .as_ref() + .unwrap() + .CreateTensorWithDataAsOrtValue + .unwrap()( + memory_info_ptr, + input_tensor_values_ptr, + input_tensor_size * std::mem::size_of::(), + shape, + 4, + ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + input_tensor_ptr_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(input_tensor_ptr, std::ptr::null_mut()); + + let mut is_tensor = 0; + let status = + unsafe { g_ort.as_ref().unwrap().IsTensor.unwrap()(input_tensor_ptr, &mut is_tensor) }; + CheckStatus(g_ort, status).unwrap(); + assert_eq!(is_tensor, 1); + + let input_tensor_ptr2: *const OrtValue = input_tensor_ptr as *const OrtValue; + let input_tensor_ptr3: *const *const OrtValue = &input_tensor_ptr2; + + unsafe { g_ort.as_ref().unwrap().ReleaseMemoryInfo.unwrap()(memory_info_ptr) }; + + // score model & input tensor, get back output tensor + + let input_node_names_cstring: Vec = input_node_names + .into_iter() + .map(|n| std::ffi::CString::new(n).unwrap()) + .collect(); + let input_node_names_ptr: Vec<*const i8> = input_node_names_cstring + .into_iter() + .map(|n| n.into_raw() as *const i8) + .collect(); + let input_node_names_ptr_ptr: *const *const i8 = input_node_names_ptr.as_ptr(); + + let output_node_names_cstring: Vec = output_node_names + .iter() + .map(|n| std::ffi::CString::new(n.clone()).unwrap()) + .collect(); + let output_node_names_ptr: Vec<*const i8> = output_node_names_cstring + .iter() + .map(|n| n.as_ptr().cast::()) + .collect(); + let output_node_names_ptr_ptr: *const *const i8 = output_node_names_ptr.as_ptr(); + + let _input_node_names_cstring = + unsafe { std::ffi::CString::from_raw(input_node_names_ptr[0] as *mut i8) }; + let run_options_ptr: *const OrtRunOptions = std::ptr::null(); + let mut output_tensor_ptr: *mut OrtValue = std::ptr::null_mut(); + let output_tensor_ptr_ptr: *mut *mut OrtValue = &mut output_tensor_ptr; + + let status = unsafe { + g_ort.as_ref().unwrap().Run.unwrap()( + session_ptr, + run_options_ptr, + input_node_names_ptr_ptr, + input_tensor_ptr3, + 1, + output_node_names_ptr_ptr, + 1, + output_tensor_ptr_ptr, + ) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(output_tensor_ptr, std::ptr::null_mut()); + + let mut is_tensor = 0; + let status = + unsafe { g_ort.as_ref().unwrap().IsTensor.unwrap()(output_tensor_ptr, &mut is_tensor) }; + CheckStatus(g_ort, status).unwrap(); + assert_eq!(is_tensor, 1); + + // Get pointer to output tensor float values + let mut floatarr: *mut f32 = std::ptr::null_mut(); + let floatarr_ptr: *mut *mut f32 = &mut floatarr; + let floatarr_ptr_void: *mut *mut std::ffi::c_void = + floatarr_ptr.cast::<*mut std::ffi::c_void>(); + let status = unsafe { + g_ort.as_ref().unwrap().GetTensorMutableData.unwrap()(output_tensor_ptr, floatarr_ptr_void) + }; + CheckStatus(g_ort, status).unwrap(); + assert_ne!(floatarr, std::ptr::null_mut()); + + assert!((unsafe { *floatarr.offset(0) } - 0.000_045).abs() < 1e-6); + + // score the model, and print scores for first 5 classes + // NOTE: The C ONNX Runtime allocated the array, we shouldn't drop the vec + // but let C de-allocate instead. 
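+    // A read-only view via std::slice::from_raw_parts(floatarr, 5) would avoid taking
+    // ownership here at all; the Vec::from_raw_parts approach below is only sound because
+    // the std::mem::forget call afterwards stops Rust from freeing memory that ONNX Runtime owns.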
+ let floatarr_vec: Vec = unsafe { Vec::from_raw_parts(floatarr, 5, 5) }; + for i in 0..5 { + println!("Score for class [{}] = {}", i, floatarr_vec[i]); + } + std::mem::forget(floatarr_vec); + + // Results should be as below... + // Score for class[0] = 0.000045 + // Score for class[1] = 0.003846 + // Score for class[2] = 0.000125 + // Score for class[3] = 0.001180 + // Score for class[4] = 0.001317 + + unsafe { g_ort.as_ref().unwrap().ReleaseValue.unwrap()(output_tensor_ptr) }; + unsafe { g_ort.as_ref().unwrap().ReleaseValue.unwrap()(input_tensor_ptr) }; + unsafe { g_ort.as_ref().unwrap().ReleaseSession.unwrap()(session_ptr) }; + unsafe { g_ort.as_ref().unwrap().ReleaseSessionOptions.unwrap()(session_options_ptr) }; + unsafe { g_ort.as_ref().unwrap().ReleaseEnv.unwrap()(env_ptr) }; + + println!("Done!"); +} + +fn CheckStatus(g_ort: *const OrtApi, status: *const OrtStatus) -> Result<(), String> { + if status != std::ptr::null() { + let raw = unsafe { g_ort.as_ref().unwrap().GetErrorMessage.unwrap()(status) }; + Err(char_p_to_str(raw).unwrap().to_string()) + } else { + Ok(()) + } +} + +fn char_p_to_str<'a>(raw: *const i8) -> Result<&'a str, std::str::Utf8Error> { + let c_str = unsafe { std::ffi::CStr::from_ptr(raw as *mut i8) }; + c_str.to_str() +} diff --git a/rust/onnxruntime-sys/src/lib.rs b/rust/onnxruntime-sys/src/lib.rs new file mode 100644 index 0000000000000..c1ba5c347a036 --- /dev/null +++ b/rust/onnxruntime-sys/src/lib.rs @@ -0,0 +1,15 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +// Disable clippy and `u128` not being FFI-safe (see #1) +#![allow(clippy::all)] +#![allow(improper_ctypes)] + +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +#[cfg(target_os = "windows")] +pub type OnnxEnumInt = i32; +#[cfg(not(target_os = "windows"))] +pub type OnnxEnumInt = u32; + +pub use libloading::library_filename; diff --git a/rust/onnxruntime/Cargo.toml b/rust/onnxruntime/Cargo.toml new file mode 100644 index 0000000000000..d52904c5e50a0 --- /dev/null +++ b/rust/onnxruntime/Cargo.toml @@ -0,0 +1,43 @@ +[package] +authors = ["Nicolas Bigaouette "] +edition = "2018" +name = "onnxruntime" +version = "0.0.14" + +description = "Wrapper around Microsoft's ONNX Runtime" +documentation = "https://docs.rs/onnxruntime" +homepage = "https://onnxruntime.ai/" +license = "MIT OR Apache-2.0" +readme = "../README.md" +repository = "https://github.com/microsoft/onnxruntime" + +categories = ["science"] +keywords = ["neuralnetworks", "onnx", "bindings"] + +[[test]] +name = "integration_tests" +required-features = ["model-fetching"] + +[dependencies] +libloading = "0.7" +ndarray = "0.15" +once_cell = "1.17" +onnxruntime-sys = { version = "0.0.14", path = "../onnxruntime-sys" } +thiserror = "1.0" +tracing = "0.1" + +# Enabled with 'model-fetching' feature +ureq = { version = "2.1", optional = true } + +[dev-dependencies] +image = "0.24" +test-log = { version = "0.2", default-features = false, features = ["trace"] } +tracing-subscriber = "0.2" +ureq = "2.1" + +[features] +# Fetch model from ONNX Model Zoo (https://github.com/onnx/models) +model-fetching = ["ureq"] + +[package.metadata.docs.rs] +features = ["model-fetching"] diff --git a/rust/onnxruntime/examples/issue22.rs b/rust/onnxruntime/examples/issue22.rs new file mode 100644 index 0000000000000..6c96e899fa774 --- /dev/null +++ b/rust/onnxruntime/examples/issue22.rs @@ -0,0 +1,55 @@ +//! Example reproducing issue #22. +//! +//! `model.onnx` available to download here: +//! 
https://drive.google.com/file/d/1FmL-Wpm06V-8wgRqvV3Skey_X98Ue4D_/view?usp=sharing + +use ndarray::Array2; +use onnxruntime::{environment::Environment, GraphOptimizationLevel, LoggingLevel}; +use std::env::var; +use tracing::Level; +use tracing_subscriber::FmtSubscriber; + +fn main() { + // a builder for `FmtSubscriber`. + let subscriber = FmtSubscriber::builder() + // all spans/events with a level higher than TRACE (e.g, debug, info, warn, etc.) + // will be written to stdout. + .with_max_level(Level::TRACE) + // completes the builder. + .finish(); + + tracing::subscriber::set_global_default(subscriber).expect("setting default subscriber failed"); + + let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); + + let builder = Environment::builder() + .with_name("env") + .with_log_level(LoggingLevel::Warning); + + let builder = if let Some(path) = path.clone() { + builder.with_library_path(path) + } else { + builder + }; + + let env = builder.build().unwrap(); + let session = env + .new_session_builder() + .unwrap() + .with_graph_optimization_level(GraphOptimizationLevel::Basic) + .unwrap() + .with_model_from_file("model.onnx") + .unwrap(); + + println!("{:#?}", session.inputs); + println!("{:#?}", session.outputs); + + let input_ids = Array2::::from_shape_vec((1, 3), vec![1, 2, 3]).unwrap(); + let attention_mask = Array2::::from_shape_vec((1, 3), vec![1, 1, 1]).unwrap(); + + let inputs = vec![input_ids.into(), attention_mask.into()]; + + let outputs = session.run(inputs).unwrap(); + + print!("outputs: {:#?}", outputs[0].float_array().unwrap()); +} diff --git a/rust/onnxruntime/examples/print_structure.rs b/rust/onnxruntime/examples/print_structure.rs new file mode 100644 index 0000000000000..ce38218189616 --- /dev/null +++ b/rust/onnxruntime/examples/print_structure.rs @@ -0,0 +1,47 @@ +//! Display the input and output structure of an ONNX model. +use onnxruntime::{environment, LoggingLevel}; +use std::{env::var, error::Error}; + +fn main() -> Result<(), Box> { + let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); + + let builder = environment::Environment::builder() + .with_name("onnx_metadata") + .with_log_level(LoggingLevel::Verbose); + + let builder = if let Some(path) = path.clone() { + builder.with_library_path(path) + } else { + builder + }; + + let environment = builder.build().unwrap(); + + // provide path to .onnx model on disk + let path = std::env::args() + .nth(1) + .expect("Must provide an .onnx file as the first arg"); + + let session = environment + .new_session_builder()? + .with_graph_optimization_level(onnxruntime::GraphOptimizationLevel::Basic)? 
+ .with_model_from_file(path)?; + + println!("Inputs:"); + for (index, input) in session.inputs.iter().enumerate() { + println!( + " {}:\n name = {}\n type = {:?}\n dimensions = {:?}", + index, input.name, input.input_type, input.dimensions + ) + } + + println!("Outputs:"); + for (index, output) in session.outputs.iter().enumerate() { + println!( + " {}:\n name = {}\n type = {:?}\n dimensions = {:?}", + index, output.name, output.output_type, output.dimensions + ); + } + + Ok(()) +} diff --git a/rust/onnxruntime/examples/sample.rs b/rust/onnxruntime/examples/sample.rs new file mode 100644 index 0000000000000..9af5cf733ccae --- /dev/null +++ b/rust/onnxruntime/examples/sample.rs @@ -0,0 +1,83 @@ +#![forbid(unsafe_code)] + +use onnxruntime::{environment::Environment, ndarray::Array, GraphOptimizationLevel, LoggingLevel}; +use std::env::var; +use tracing::Level; +use tracing_subscriber::FmtSubscriber; + +type Error = Box; + +fn main() { + if let Err(e) = run() { + eprintln!("Error: {}", e); + std::process::exit(1); + } +} + +fn run() -> Result<(), Error> { + // Setup the example's log level. + // NOTE: ONNX Runtime's log level is controlled separately when building the environment. + let subscriber = FmtSubscriber::builder() + .with_max_level(Level::TRACE) + .finish(); + + tracing::subscriber::set_global_default(subscriber).expect("setting default subscriber failed"); + + let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); + + let builder = Environment::builder() + .with_name("test") + .with_log_level(LoggingLevel::Warning); + + let builder = if let Some(path) = path.clone() { + builder.with_library_path(path) + } else { + builder + }; + + let environment = builder.build().unwrap(); + + let session = environment + .new_session_builder()? + .with_graph_optimization_level(GraphOptimizationLevel::Basic)? + .with_intra_op_num_threads(1)? + // NOTE: The example uses SqueezeNet 1.0 (ONNX version: 1.3, Opset version: 8), + // _not_ SqueezeNet 1.1 as downloaded by '.with_model_downloaded(ImageClassification::SqueezeNet)' + // Obtain it with: + // curl -LO "https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.0-8.onnx" + .with_model_from_file("squeezenet1.0-8.onnx")?; + + let input0_shape: Vec = session.inputs[0] + .dimensions() + .map(std::option::Option::unwrap) + .collect(); + let output0_shape: Vec = session.outputs[0] + .dimensions() + .map(std::option::Option::unwrap) + .collect(); + + assert_eq!(input0_shape, [1, 3, 224, 224]); + assert_eq!(output0_shape, [1, 1000, 1, 1]); + + // initialize input data with values in [0.0, 1.0] + let n: u32 = session.inputs[0] + .dimensions + .iter() + .map(|d| d.unwrap()) + .product(); + let array = Array::linspace(0.0_f32, 1.0, n as usize) + .into_shape(input0_shape) + .unwrap(); + let input_tensor_values = vec![array.into()]; + + let outputs = session.run(input_tensor_values)?; + + let output = outputs[0].float_array().unwrap(); + + assert_eq!(output.shape(), output0_shape.as_slice()); + for i in 0..5 { + println!("Score for class [{}] = {}", i, output[[0, i, 0, 0]]); + } + + Ok(()) +} diff --git a/rust/onnxruntime/src/download.rs b/rust/onnxruntime/src/download.rs new file mode 100644 index 0000000000000..0b600f3786ada --- /dev/null +++ b/rust/onnxruntime/src/download.rs @@ -0,0 +1,113 @@ +//! Module controlling models downloadable from ONNX Model Zoom +//! +//! Pre-trained models are available from the +//! [ONNX Model Zoo](https://github.com/onnx/models). +//! +//! 
A pre-trained model can be downloaded automatically using the +//! [`SessionBuilder`](../session/struct.SessionBuilder.html)'s +//! [`with_model_downloaded()`](../session/struct.SessionBuilder.html#method.with_model_downloaded) method. +//! +//! See [`AvailableOnnxModel`](enum.AvailableOnnxModel.html) for the different models available +//! to download. + +#[cfg(feature = "model-fetching")] +use std::{ + fs, io, + path::{Path, PathBuf}, + time::Duration, +}; + +#[cfg(feature = "model-fetching")] +use crate::error::{OrtDownloadError, Result}; + +#[cfg(feature = "model-fetching")] +use tracing::info; + +pub mod language; +pub mod vision; + +/// Available pre-trained models to download from [ONNX Model Zoo](https://github.com/onnx/models). +/// +/// According to [ONNX Model Zoo](https://github.com/onnx/models)'s GitHub page: +/// +/// > The ONNX Model Zoo is a collection of pre-trained, state-of-the-art models in the ONNX format +/// > contributed by community members like you. +#[derive(Debug, Clone)] +pub enum AvailableOnnxModel { + /// Computer vision model + Vision(vision::Vision), + /// Natural language model + Language(language::Language), +} + +trait ModelUrl { + fn fetch_url(&self) -> &'static str; +} + +impl ModelUrl for AvailableOnnxModel { + fn fetch_url(&self) -> &'static str { + match self { + AvailableOnnxModel::Vision(model) => model.fetch_url(), + AvailableOnnxModel::Language(model) => model.fetch_url(), + } + } +} + +impl AvailableOnnxModel { + #[cfg(feature = "model-fetching")] + #[tracing::instrument] + pub(crate) fn download_to
<P>
(&self, download_dir: P) -> Result + where + P: AsRef + std::fmt::Debug, + { + let url = self.fetch_url(); + + let model_filename = PathBuf::from(url.split('/').last().unwrap()); + let model_filepath = download_dir.as_ref().join(model_filename); + + if model_filepath.exists() { + info!( + model_filepath = format!("{}", model_filepath.display()).as_str(), + "File already exists, not re-downloading.", + ); + Ok(model_filepath) + } else { + info!( + model_filepath = format!("{}", model_filepath.display()).as_str(), + url = format!("{:?}", url).as_str(), + "Downloading file, please wait....", + ); + + let resp = ureq::get(url) + .timeout(Duration::from_secs(180)) // 3 minutes + .call() + .map_err(Box::new) + .map_err(OrtDownloadError::UreqError)?; + + assert!(resp.has("Content-Length")); + let len = resp + .header("Content-Length") + .and_then(|s| s.parse::().ok()) + .unwrap(); + info!(len, "Downloading {} bytes...", len); + + let mut reader = resp.into_reader(); + + let f = fs::File::create(&model_filepath).unwrap(); + let mut writer = io::BufWriter::new(f); + + let bytes_io_count = + io::copy(&mut reader, &mut writer).map_err(OrtDownloadError::IoError)?; + + if bytes_io_count == len as u64 { + Ok(model_filepath) + } else { + Err(OrtDownloadError::CopyError { + expected: len as u64, + io: bytes_io_count, + } + .into()) + } + } + } +} diff --git a/rust/onnxruntime/src/download/language.rs b/rust/onnxruntime/src/download/language.rs new file mode 100644 index 0000000000000..9bf068cf379ef --- /dev/null +++ b/rust/onnxruntime/src/download/language.rs @@ -0,0 +1,25 @@ +//! Module defining natural language models available to download. +//! +//! See [https://github.com/onnx/models#machine_comprehension](https://github.com/onnx/models#machine_comprehension). + +use super::ModelUrl; + +pub mod machine_comprehension; + +// Re-exports +pub use machine_comprehension::MachineComprehension; + +/// Natural language models +#[derive(Debug, Clone)] +pub enum Language { + /// Machine comprehension + MachineComprehension(MachineComprehension), +} + +impl ModelUrl for Language { + fn fetch_url(&self) -> &'static str { + match self { + Language::MachineComprehension(variant) => variant.fetch_url(), + } + } +} diff --git a/rust/onnxruntime/src/download/language/machine_comprehension.rs b/rust/onnxruntime/src/download/language/machine_comprehension.rs new file mode 100644 index 0000000000000..76143aacd8b35 --- /dev/null +++ b/rust/onnxruntime/src/download/language/machine_comprehension.rs @@ -0,0 +1,127 @@ +//! Module defining machine comprehension models available to download. +//! +//! See [https://github.com/onnx/models#machine_comprehension](https://github.com/onnx/models#machine_comprehension) + +// Acronyms are specific ONNX model names and contains upper cases +#![allow(clippy::upper_case_acronyms)] + +use crate::download::{language::Language, AvailableOnnxModel, ModelUrl}; + +/// Machine Comprehension +/// +/// > This subset of natural language processing models that answer questions about a given context paragraph. +/// +/// Source: [https://github.com/onnx/models#machine_comprehension](https://github.com/onnx/models#machine_comprehension) +#[derive(Debug, Clone)] +pub enum MachineComprehension { + /// Answers a query about a given context paragraph. + /// + /// > This model is a neural network for answering a query about a given context paragraph. 
+ /// + /// Source: [https://github.com/onnx/models/tree/main/text/machine_comprehension/bidirectional_attention_flow](https://github.com/onnx/models/tree/main/text/machine_comprehension/bidirectional_attention_flow) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + BiDAF, + /// Answers questions based on the context of the given input paragraph. + /// + /// Source: [https://github.com/onnx/models/tree/main/text/machine_comprehension/bert-squad](https://github.com/onnx/models/tree/main/text/machine_comprehension/bert-squad) + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 10. + BERTSquad, + /// Large transformer-based model that predicts sentiment based on given input text. + /// + /// > Transformer-based language model for text generation. + /// + /// Source: [https://github.com/onnx/models/tree/main/text/machine_comprehension/roberta](https://github.com/onnx/models/tree/main/text/machine_comprehension/roberta) + RoBERTa(RoBERTa), + /// Large transformer-based language model that given a sequence of words within some text, predicts the next word. + /// + /// Source: [https://github.com/onnx/models/tree/main/text/machine_comprehension/gpt-2](https://github.com/onnx/models/tree/main/text/machine_comprehension/gpt-2) + GPT2(GPT2), +} + +/// Large transformer-based model that predicts sentiment based on given input text. +/// +/// > Transformer-based language model for text generation. +/// +/// Source: [https://github.com/onnx/models/tree/main/text/machine_comprehension/roberta](https://github.com/onnx/models/tree/main/text/machine_comprehension/roberta) +#[derive(Debug, Clone)] +pub enum RoBERTa { + /// Variant with input is a sequence of words as a string. Example: "Text to encode: Hello, World" + /// + /// Variant downloaded: ONNX Version 1.6 with Opset Version 11. + RoBERTaBase, + /// Variant with input is a sequence of words as a string including sentiment. Example: "This film is so good" + /// + /// Variant downloaded: ONNX Version 1.6 with Opset Version 9. + RoBERTaSequenceClassification, +} + +/// Large transformer-based language model that given a sequence of words within some text, predicts the next word. +/// +/// > Transformer-based language model for text generation. +/// +/// Source: [https://github.com/onnx/models/tree/main/text/machine_comprehension/gpt-2](https://github.com/onnx/models/tree/main/text/machine_comprehension/gpt-2) +/// +/// Variant downloaded: ONNX Version 1.6 with Opset Version 10. +#[derive(Debug, Clone)] +pub enum GPT2 { + /// Pure GPT2 + GPT2, + /// GPT2 + script changes + /// + /// See [https://github.com/onnx/models/blob/main/text/machine_comprehension/gpt-2/dependencies/GPT2-export.py](https://github.com/onnx/models/blob/main/text/machine_comprehension/gpt-2/dependencies/GPT2-export.py) + /// for the script changes. 
+ GPT2LmHead, +} + +impl ModelUrl for MachineComprehension { + fn fetch_url(&self) -> &'static str { + match self { + MachineComprehension::BiDAF => "https://github.com/onnx/models/raw/main/text/machine_comprehension/bidirectional_attention_flow/model/bidaf-9.onnx", + MachineComprehension::BERTSquad => "https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx", + MachineComprehension::RoBERTa(variant) => variant.fetch_url(), + MachineComprehension::GPT2(variant) => variant.fetch_url(), + } + } +} + +impl ModelUrl for RoBERTa { + fn fetch_url(&self) -> &'static str { + match self { + RoBERTa::RoBERTaBase => "https://github.com/onnx/models/raw/main/text/machine_comprehension/roberta/model/roberta-base-11.onnx", + RoBERTa::RoBERTaSequenceClassification => "https://github.com/onnx/models/raw/main/text/machine_comprehension/roberta/model/roberta-sequence-classification-9.onnx", + } + } +} + +impl ModelUrl for GPT2 { + fn fetch_url(&self) -> &'static str { + match self { + GPT2::GPT2 => "https://github.com/onnx/models/raw/main/text/machine_comprehension/gpt-2/model/gpt2-10.onnx", + GPT2::GPT2LmHead => "https://github.com/onnx/models/raw/main/text/machine_comprehension/gpt-2/model/gpt2-lm-head-10.onnx", + } + } +} + +impl From for AvailableOnnxModel { + fn from(model: MachineComprehension) -> Self { + AvailableOnnxModel::Language(Language::MachineComprehension(model)) + } +} + +impl From for AvailableOnnxModel { + fn from(model: RoBERTa) -> Self { + AvailableOnnxModel::Language(Language::MachineComprehension( + MachineComprehension::RoBERTa(model), + )) + } +} + +impl From for AvailableOnnxModel { + fn from(model: GPT2) -> Self { + AvailableOnnxModel::Language(Language::MachineComprehension(MachineComprehension::GPT2( + model, + ))) + } +} diff --git a/rust/onnxruntime/src/download/vision.rs b/rust/onnxruntime/src/download/vision.rs new file mode 100644 index 0000000000000..bc4d385b46fed --- /dev/null +++ b/rust/onnxruntime/src/download/vision.rs @@ -0,0 +1,45 @@ +//! Module defining computer vision models available to download. +//! +//! 
See [https://github.com/onnx/models#image_classification](https://github.com/onnx/models#image_classification) + +use super::ModelUrl; + +pub mod body_face_gesture_analysis; +pub mod domain_based_image_classification; +pub mod image_classification; +pub mod image_manipulation; +pub mod object_detection_image_segmentation; + +// Re-exports +pub use body_face_gesture_analysis::BodyFaceGestureAnalysis; +pub use domain_based_image_classification::DomainBasedImageClassification; +pub use image_classification::ImageClassification; +pub use image_manipulation::ImageManipulation; +pub use object_detection_image_segmentation::ObjectDetectionImageSegmentation; + +/// Computer vision model +#[derive(Debug, Clone)] +pub enum Vision { + /// Domain-based Image Classification + DomainBasedImageClassification(DomainBasedImageClassification), + /// Image classification model + ImageClassification(ImageClassification), + /// Object Detection & Image Segmentation + ObjectDetectionImageSegmentation(ObjectDetectionImageSegmentation), + /// Body, Face & Gesture Analysis + BodyFaceGestureAnalysis(BodyFaceGestureAnalysis), + /// Image Manipulation + ImageManipulation(ImageManipulation), +} + +impl ModelUrl for Vision { + fn fetch_url(&self) -> &'static str { + match self { + Vision::DomainBasedImageClassification(variant) => variant.fetch_url(), + Vision::ImageClassification(variant) => variant.fetch_url(), + Vision::ObjectDetectionImageSegmentation(variant) => variant.fetch_url(), + Vision::BodyFaceGestureAnalysis(variant) => variant.fetch_url(), + Vision::ImageManipulation(variant) => variant.fetch_url(), + } + } +} diff --git a/rust/onnxruntime/src/download/vision/body_face_gesture_analysis.rs b/rust/onnxruntime/src/download/vision/body_face_gesture_analysis.rs new file mode 100644 index 0000000000000..1916f85776076 --- /dev/null +++ b/rust/onnxruntime/src/download/vision/body_face_gesture_analysis.rs @@ -0,0 +1,43 @@ +//! Module defining body, face and gesture analysis models available to download. +//! +//! See [https://github.com/onnx/models#body_analysis](https://github.com/onnx/models#body_analysis) + +use crate::download::{vision::Vision, AvailableOnnxModel, ModelUrl}; + +/// Body, Face & Gesture Analysis +/// +/// > Face detection models identify and/or recognize human faces and emotions in given images. Body and Gesture +/// > Analysis models identify gender and age in given image. +/// +/// Source: [https://github.com/onnx/models#body_analysis](https://github.com/onnx/models#body_analysis) +#[derive(Debug, Clone)] +pub enum BodyFaceGestureAnalysis { + /// A CNN based model for face recognition which learns discriminative features of faces and produces + /// embeddings for input face images. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/body_analysis/arcface](https://github.com/onnx/models/tree/main/vision/body_analysis/arcface) + /// + /// Variant downloaded: ONNX Version 1.3 with Opset Version 8. + ArcFace, + /// Deep CNN for emotion recognition trained on images of faces. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/body_analysis/emotion_ferplus](https://github.com/onnx/models/tree/main/vision/body_analysis/emotion_ferplus) + /// + /// Variant downloaded: ONNX Version 1.3 with Opset Version 8. 
+ EmotionFerPlus, +} + +impl ModelUrl for BodyFaceGestureAnalysis { + fn fetch_url(&self) -> &'static str { + match self { + BodyFaceGestureAnalysis::ArcFace => "https://github.com/onnx/models/raw/main/vision/body_analysis/arcface/model/arcfaceresnet100-8.onnx", + BodyFaceGestureAnalysis::EmotionFerPlus => "https://github.com/onnx/models/raw/main/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-8.onnx", + } + } +} + +impl From for AvailableOnnxModel { + fn from(model: BodyFaceGestureAnalysis) -> Self { + AvailableOnnxModel::Vision(Vision::BodyFaceGestureAnalysis(model)) + } +} diff --git a/rust/onnxruntime/src/download/vision/domain_based_image_classification.rs b/rust/onnxruntime/src/download/vision/domain_based_image_classification.rs new file mode 100644 index 0000000000000..78387bf175795 --- /dev/null +++ b/rust/onnxruntime/src/download/vision/domain_based_image_classification.rs @@ -0,0 +1,30 @@ +//! Module defining domain-based image classification models available to download. +//! +//! See [https://github.com/onnx/models#domain-based-image-classification-](https://github.com/onnx/models#domain-based-image-classification-) + +use crate::download::{vision::Vision, AvailableOnnxModel, ModelUrl}; + +/// Image classification model +#[derive(Debug, Clone)] +pub enum DomainBasedImageClassification { + /// Handwritten digits prediction using CNN + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/mnist](https://github.com/onnx/models/tree/main/vision/classification/mnist) + /// + /// Variant downloaded: ONNX Version 1.3 with Opset Version 8. + Mnist, +} + +impl ModelUrl for DomainBasedImageClassification { + fn fetch_url(&self) -> &'static str { + match self { + DomainBasedImageClassification::Mnist => "https://github.com/onnx/models/raw/main/vision/classification/mnist/model/mnist-8.onnx", + } + } +} + +impl From for AvailableOnnxModel { + fn from(model: DomainBasedImageClassification) -> Self { + AvailableOnnxModel::Vision(Vision::DomainBasedImageClassification(model)) + } +} diff --git a/rust/onnxruntime/src/download/vision/image_classification.rs b/rust/onnxruntime/src/download/vision/image_classification.rs new file mode 100644 index 0000000000000..7806a75547a42 --- /dev/null +++ b/rust/onnxruntime/src/download/vision/image_classification.rs @@ -0,0 +1,350 @@ +//! Module defining image classification models available to download. +//! +//! See [https://github.com/onnx/models#image_classification](https://github.com/onnx/models#image_classification) + +// Acronyms are specific ONNX model names and contains upper cases +#![allow(clippy::upper_case_acronyms)] + +use crate::download::{vision::Vision, AvailableOnnxModel, ModelUrl}; + +/// Image classification model +/// +/// > This collection of models take images as input, then classifies the major objects in the images +/// > into 1000 object categories such as keyboard, mouse, pencil, and many animals. +/// +/// Source: [https://github.com/onnx/models#image-classification-](https://github.com/onnx/models#image-classification-) +#[derive(Debug, Clone)] +pub enum ImageClassification { + /// Image classification aimed for mobile targets. + /// + /// > MobileNet models perform image classification - they take images as input and classify the major + /// > object in the image into a set of pre-defined classes. They are trained on ImageNet dataset which + /// > contains images from 1000 classes. 
MobileNet models are also very efficient in terms of speed and + /// > size and hence are ideal for embedded and mobile applications. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/mobilenet](https://github.com/onnx/models/tree/main/vision/classification/mobilenet) + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + MobileNet, + /// Image classification, trained on ImageNet with 1000 classes. + /// + /// > ResNet models provide very high accuracies with affordable model sizes. They are ideal for cases when + /// > high accuracy of classification is required. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/resnet](https://github.com/onnx/models/tree/main/vision/classification/resnet) + ResNet(ResNet), + /// A small CNN with AlexNet level accuracy on ImageNet with 50x fewer parameters. + /// + /// > SqueezeNet is a small CNN which achieves AlexNet level accuracy on ImageNet with 50x fewer parameters. + /// > SqueezeNet requires less communication across servers during distributed training, less bandwidth to + /// > export a new model from the cloud to an autonomous car and more feasible to deploy on FPGAs and other + /// > hardware with limited memory. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/squeezenet](https://github.com/onnx/models/tree/main/vision/classification/squeezenet) + /// + /// Variant downloaded: SqueezeNet v1.1, ONNX Version 1.2.1 with Opset Version 7. + SqueezeNet, + /// Image classification, trained on ImageNet with 1000 classes. + /// + /// > VGG models provide very high accuracies but at the cost of increased model sizes. They are ideal for + /// > cases when high accuracy of classification is essential and there are limited constraints on model sizes. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/vgg](https://github.com/onnx/models/tree/main/vision/classification/vgg) + Vgg(Vgg), + /// Convolutional neural network for classification, which competed in the ImageNet Large Scale Visual Recognition Challenge in 2012. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/alexnet](https://github.com/onnx/models/tree/main/vision/classification/alexnet) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + AlexNet, + /// Convolutional neural network for classification, which competed in the ImageNet Large Scale Visual Recognition Challenge in 2014. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/googlenet](https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/googlenet) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + GoogleNet, + /// Variant of AlexNet, it's the name of a convolutional neural network for classification, which competed in the ImageNet Large Scale Visual Recognition Challenge in 2012. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/caffenet](https://github.com/onnx/models/tree/main/vision/classification/caffenet) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + CaffeNet, + /// Convolutional neural network for detection. + /// + /// > This model was made by transplanting the R-CNN SVM classifiers into a fc-rcnn classification layer. 
+ /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/rcnn_ilsvrc13](https://github.com/onnx/models/tree/main/vision/classification/rcnn_ilsvrc13) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + RcnnIlsvrc13, + /// Convolutional neural network for classification. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/rcnn_ilsvrc13](https://github.com/onnx/models/tree/main/vision/classification/rcnn_ilsvrc13) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + DenseNet121, + /// Google's Inception + Inception(InceptionVersion), + /// Computationally efficient CNN architecture designed specifically for mobile devices with very limited computing power. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/shufflenet](https://github.com/onnx/models/tree/main/vision/classification/shufflenet) + ShuffleNet(ShuffleNetVersion), + /// Deep convolutional networks for classification. + /// + /// > This model's 4th layer has 512 maps instead of 1024 maps mentioned in the paper. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/zfnet-512](https://github.com/onnx/models/tree/main/vision/classification/zfnet-512) + ZFNet512, + /// Image classification model that achieves state-of-the-art accuracy. + /// + /// > It is designed to run on mobile CPU, GPU, and EdgeTPU devices, allowing for applications on mobile and loT, where computational resources are limited. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/efficientnet-lite4](https://github.com/onnx/models/tree/main/vision/classification/efficientnet-lite4) + /// + /// Variant downloaded: ONNX Version 1.7.0 with Opset Version 11. + EfficientNetLite4, +} + +/// Google's Inception +#[derive(Debug, Clone)] +pub enum InceptionVersion { + /// Google's Inception v1 + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/inception_v1](https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/inception_v1) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + V1, + /// Google's Inception v2 + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/inception_v2](https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/inception_v2) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + V2, +} + +/// ResNet +/// +/// Source: [https://github.com/onnx/models/tree/main/vision/classification/resnet](https://github.com/onnx/models/tree/main/vision/classification/resnet) +#[derive(Debug, Clone)] +pub enum ResNet { + /// ResNet v1 + V1(ResNetV1), + /// ResNet v2 + V2(ResNetV2), +} +/// ResNet v1 +/// +/// Source: [https://github.com/onnx/models/tree/main/vision/classification/resnet](https://github.com/onnx/models/tree/main/vision/classification/resnet) +#[derive(Debug, Clone)] +pub enum ResNetV1 { + /// ResNet18 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet18, + /// ResNet34 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet34, + /// ResNet50 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. 
+ ResNet50, + /// ResNet101 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet101, + /// ResNet152 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet152, +} +/// ResNet v2 +/// +/// Source: [https://github.com/onnx/models/tree/main/vision/classification/resnet](https://github.com/onnx/models/tree/main/vision/classification/resnet) +#[derive(Debug, Clone)] +pub enum ResNetV2 { + /// ResNet18 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet18, + /// ResNet34 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet34, + /// ResNet50 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet50, + /// ResNet101 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet101, + /// ResNet152 + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + ResNet152, +} + +/// ResNet +/// +/// Source: [https://github.com/onnx/models/tree/main/vision/classification/resnet](https://github.com/onnx/models/tree/main/vision/classification/resnet) +#[derive(Debug, Clone)] +pub enum Vgg { + /// VGG with 16 convolutional layers + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + Vgg16, + /// VGG with 16 convolutional layers, with batch normalization applied after each convolutional layer. + /// + /// The batch normalization leads to better convergence and slightly better accuracies. + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + Vgg16Bn, + /// VGG with 19 convolutional layers + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + Vgg19, + /// VGG with 19 convolutional layers, with batch normalization applied after each convolutional layer. + /// + /// The batch normalization leads to better convergence and slightly better accuracies. + /// + /// Variant downloaded: ONNX Version 1.2.1 with Opset Version 7. + Vgg19Bn, +} + +/// Computationally efficient CNN architecture designed specifically for mobile devices with very limited computing power. +/// +/// Source: [https://github.com/onnx/models/tree/main/vision/classification/shufflenet](https://github.com/onnx/models/tree/main/vision/classification/shufflenet) +#[derive(Debug, Clone)] +pub enum ShuffleNetVersion { + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/shufflenet](https://github.com/onnx/models/tree/main/vision/classification/shufflenet) + /// + /// Variant downloaded: ONNX Version 1.4 with Opset Version 9. + V1, + /// ShuffleNetV2 is an improved architecture that is the state-of-the-art in terms of speed and accuracy tradeoff used for image classification. + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/classification/shufflenet](https://github.com/onnx/models/tree/main/vision/classification/shufflenet) + /// + /// Variant downloaded: ONNX Version 1.6 with Opset Version 10. 
+ V2, +} + +impl ModelUrl for ImageClassification { + fn fetch_url(&self) -> &'static str { + match self { + ImageClassification::MobileNet => "https://github.com/onnx/models/raw/main/vision/classification/mobilenet/model/mobilenetv2-7.onnx", + ImageClassification::SqueezeNet => "https://github.com/onnx/models/raw/main/vision/classification/squeezenet/model/squeezenet1.1-7.onnx", + ImageClassification::Inception(version) => version.fetch_url(), + ImageClassification::ResNet(version) => version.fetch_url(), + ImageClassification::Vgg(variant) => variant.fetch_url(), + ImageClassification::AlexNet => "https://github.com/onnx/models/raw/main/vision/classification/alexnet/model/bvlcalexnet-9.onnx", + ImageClassification::GoogleNet => "https://github.com/onnx/models/raw/main/vision/classification/inception_and_googlenet/googlenet/model/googlenet-9.onnx", + ImageClassification::CaffeNet => "https://github.com/onnx/models/raw/main/vision/classification/caffenet/model/caffenet-9.onnx", + ImageClassification::RcnnIlsvrc13 => "https://github.com/onnx/models/raw/main/vision/classification/rcnn_ilsvrc13/model/rcnn-ilsvrc13-9.onnx", + ImageClassification::DenseNet121 => "https://github.com/onnx/models/raw/main/vision/classification/densenet-121/model/densenet-9.onnx", + ImageClassification::ShuffleNet(version) => version.fetch_url(), + ImageClassification::ZFNet512 => "https://github.com/onnx/models/raw/main/vision/classification/zfnet-512/model/zfnet512-9.onnx", + ImageClassification::EfficientNetLite4 => "https://github.com/onnx/models/raw/main/vision/classification/efficientnet-lite4/model/efficientnet-lite4.onnx" + } + } +} + +impl ModelUrl for InceptionVersion { + fn fetch_url(&self) -> &'static str { + match self { + InceptionVersion::V1 => "https://github.com/onnx/models/raw/main/vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-9.onnx", + InceptionVersion::V2 => "https://github.com/onnx/models/raw/main/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-9.onnx", + } + } +} + +impl ModelUrl for ResNet { + fn fetch_url(&self) -> &'static str { + match self { + ResNet::V1(variant) => variant.fetch_url(), + ResNet::V2(variant) => variant.fetch_url(), + } + } +} + +impl ModelUrl for ResNetV1 { + fn fetch_url(&self) -> &'static str { + match self { + ResNetV1::ResNet18 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v1-7.onnx", + ResNetV1::ResNet34 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet34-v1-7.onnx", + ResNetV1::ResNet50 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v1-7.onnx", + ResNetV1::ResNet101 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet101-v1-7.onnx", + ResNetV1::ResNet152 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet152-v1-7.onnx", + } + } +} + +impl ModelUrl for ResNetV2 { + fn fetch_url(&self) -> &'static str { + match self { + ResNetV2::ResNet18 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet18-v2-7.onnx", + ResNetV2::ResNet34 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet34-v2-7.onnx", + ResNetV2::ResNet50 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet50-v2-7.onnx", 
+ ResNetV2::ResNet101 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet101-v2-7.onnx", + ResNetV2::ResNet152 => "https://github.com/onnx/models/raw/main/vision/classification/resnet/model/resnet152-v2-7.onnx", + } + } +} + +impl ModelUrl for Vgg { + fn fetch_url(&self) -> &'static str { + match self { + Vgg::Vgg16 => "https://github.com/onnx/models/raw/main/vision/classification/vgg/model/vgg16-7.onnx", + Vgg::Vgg16Bn => "https://github.com/onnx/models/raw/main/vision/classification/vgg/model/vgg16-bn-7.onnx", + Vgg::Vgg19 => "https://github.com/onnx/models/raw/main/vision/classification/vgg/model/vgg19-7.onnx", + Vgg::Vgg19Bn => "https://github.com/onnx/models/raw/main/vision/classification/vgg/model/vgg19-bn-7.onnx", + } + } +} + +impl ModelUrl for ShuffleNetVersion { + fn fetch_url(&self) -> &'static str { + match self { + ShuffleNetVersion::V1 => "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-9.onnx", + ShuffleNetVersion::V2 => "https://github.com/onnx/models/raw/main/vision/classification/shufflenet/model/shufflenet-v2-10.onnx", + } + } +} + +impl From for AvailableOnnxModel { + fn from(model: ImageClassification) -> Self { + AvailableOnnxModel::Vision(Vision::ImageClassification(model)) + } +} + +impl From for AvailableOnnxModel { + fn from(variant: ResNet) -> Self { + AvailableOnnxModel::Vision(Vision::ImageClassification(ImageClassification::ResNet( + variant, + ))) + } +} + +impl From for AvailableOnnxModel { + fn from(variant: Vgg) -> Self { + AvailableOnnxModel::Vision(Vision::ImageClassification(ImageClassification::Vgg( + variant, + ))) + } +} + +impl From for AvailableOnnxModel { + fn from(variant: InceptionVersion) -> Self { + AvailableOnnxModel::Vision(Vision::ImageClassification(ImageClassification::Inception( + variant, + ))) + } +} + +impl From for AvailableOnnxModel { + fn from(variant: ShuffleNetVersion) -> Self { + AvailableOnnxModel::Vision(Vision::ImageClassification( + ImageClassification::ShuffleNet(variant), + )) + } +} diff --git a/rust/onnxruntime/src/download/vision/image_manipulation.rs b/rust/onnxruntime/src/download/vision/image_manipulation.rs new file mode 100644 index 0000000000000..4a67e429133d1 --- /dev/null +++ b/rust/onnxruntime/src/download/vision/image_manipulation.rs @@ -0,0 +1,86 @@ +//! Module defining image manipulation models available to download. +//! +//! See [https://github.com/onnx/models#image_manipulation](https://github.com/onnx/models#image_manipulation) + +use crate::download::{vision::Vision, AvailableOnnxModel, ModelUrl}; + +/// Image Manipulation +/// +/// > Image manipulation models use neural networks to transform input images to modified output images. Some +/// > popular models in this category involve style transfer or enhancing images by increasing resolution. +/// +/// Source: [https://github.com/onnx/models#image_manipulation](https://github.com/onnx/models#image_manipulation) +#[derive(Debug, Clone)] +pub enum ImageManipulation { + /// Super Resolution + /// + /// > The Super Resolution machine learning model sharpens and upscales the input image to refine the + /// > details and improve quality. 
+ /// + /// Source: [https://github.com/onnx/models/tree/main/vision/super_resolution/sub_pixel_cnn_2016](https://github.com/onnx/models/tree/main/vision/super_resolution/sub_pixel_cnn_2016) + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 10. + SuperResolution, + /// Fast Neural Style Transfer + /// + /// > This artistic style transfer model mixes the content of an image with the style of another image. + /// > Examples of the styles can be seen + /// > [in this PyTorch example](https://github.com/pytorch/examples/tree/main/fast_neural_style#models). + /// + /// Source: [https://github.com/onnx/models/tree/main/vision/style_transfer/fast_neural_style](https://github.com/onnx/models/tree/main/vision/style_transfer/fast_neural_style) + FastNeuralStyleTransfer(FastNeuralStyleTransferStyle), +} + +/// Fast Neural Style Transfer Style +/// +/// Source: [https://github.com/onnx/models/tree/main/vision/style_transfer/fast_neural_style](https://github.com/onnx/models/tree/main/vision/style_transfer/fast_neural_style) +/// +/// Variant downloaded: ONNX Version 1.4 with Opset Version 9. +#[derive(Debug, Clone)] +pub enum FastNeuralStyleTransferStyle { + /// Mosaic style + Mosaic, + /// Candy style + Candy, + /// RainPrincess style + RainPrincess, + /// Udnie style + Udnie, + /// Pointilism style + Pointilism, +} + +impl ModelUrl for ImageManipulation { + fn fetch_url(&self) -> &'static str { + match self { + ImageManipulation::SuperResolution => "https://github.com/onnx/models/raw/main/vision/super_resolution/sub_pixel_cnn_2016/model/super-resolution-10.onnx", + ImageManipulation::FastNeuralStyleTransfer(style) => style.fetch_url(), + } + } +} + +impl ModelUrl for FastNeuralStyleTransferStyle { + fn fetch_url(&self) -> &'static str { + match self { + FastNeuralStyleTransferStyle::Mosaic => "https://github.com/onnx/models/raw/main/vision/style_transfer/fast_neural_style/model/mosaic-9.onnx", + FastNeuralStyleTransferStyle::Candy => "https://github.com/onnx/models/raw/main/vision/style_transfer/fast_neural_style/model/candy-9.onnx", + FastNeuralStyleTransferStyle::RainPrincess => "https://github.com/onnx/models/raw/main/vision/style_transfer/fast_neural_style/model/rain-princess-9.onnx", + FastNeuralStyleTransferStyle::Udnie => "https://github.com/onnx/models/raw/main/vision/style_transfer/fast_neural_style/model/udnie-9.onnx", + FastNeuralStyleTransferStyle::Pointilism => "https://github.com/onnx/models/raw/main/vision/style_transfer/fast_neural_style/model/pointilism-9.onnx", + } + } +} + +impl From for AvailableOnnxModel { + fn from(model: ImageManipulation) -> Self { + AvailableOnnxModel::Vision(Vision::ImageManipulation(model)) + } +} + +impl From for AvailableOnnxModel { + fn from(style: FastNeuralStyleTransferStyle) -> Self { + AvailableOnnxModel::Vision(Vision::ImageManipulation( + ImageManipulation::FastNeuralStyleTransfer(style), + )) + } +} diff --git a/rust/onnxruntime/src/download/vision/object_detection_image_segmentation.rs b/rust/onnxruntime/src/download/vision/object_detection_image_segmentation.rs new file mode 100644 index 0000000000000..ff95154c20c21 --- /dev/null +++ b/rust/onnxruntime/src/download/vision/object_detection_image_segmentation.rs @@ -0,0 +1,107 @@ +//! Module defining object detection and image segmentation models available to download. +//! +//! 
See [https://github.com/onnx/models#object_detection](https://github.com/onnx/models#object_detection) + +// Acronyms are specific ONNX model names and contains upper cases +#![allow(clippy::upper_case_acronyms)] + +use crate::download::{vision::Vision, AvailableOnnxModel, ModelUrl}; + +/// Object Detection & Image Segmentation +/// +/// > Object detection models detect the presence of multiple objects in an image and segment out areas of the +/// > image where the objects are detected. Semantic segmentation models partition an input image by labeling each pixel +/// > into a set of pre-defined categories. +/// +/// Source: [https://github.com/onnx/models#object_detection](https://github.com/onnx/models#object_detection) +#[derive(Debug, Clone)] +pub enum ObjectDetectionImageSegmentation { + /// A real-time CNN for object detection that detects 20 different classes. A smaller version of the + /// more complex full YOLOv2 network. + /// + /// Variant downloaded: ONNX Version 1.3 with Opset Version 8. + TinyYoloV2, + /// Single Stage Detector: real-time CNN for object detection that detects 80 different classes. + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 10. + Ssd, + /// A variant of MobileNet that uses the Single Shot Detector (SSD) model framework. The model detects 80 + /// different object classes and locates up to 10 objects in an image. + /// + /// Variant downloaded: ONNX Version 1.7.0 with Opset Version 10. + SSDMobileNetV1, + /// Increases efficiency from R-CNN by connecting a RPN with a CNN to create a single, unified network for + /// object detection that detects 80 different classes. + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 10. + FasterRcnn, + /// A real-time neural network for object instance segmentation that detects 80 different classes. Extends + /// Faster R-CNN as each of the 300 elected ROIs go through 3 parallel branches of the network: label + /// prediction, bounding box prediction and mask prediction. + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 10. + MaskRcnn, + /// A real-time dense detector network for object detection that addresses class imbalance through Focal Loss. + /// RetinaNet is able to match the speed of previous one-stage detectors and defines the state-of-the-art in + /// two-stage detectors (surpassing R-CNN). + /// + /// Variant downloaded: ONNX Version 1.6.0 with Opset Version 9. + RetinaNet, + /// A CNN model for real-time object detection system that can detect over 9000 object categories. It uses a + /// single network evaluation, enabling it to be more than 1000x faster than R-CNN and 100x faster than + /// Faster R-CNN. + /// + /// Variant downloaded: ONNX Version 1.3 with Opset Version 8. + YoloV2, + /// A CNN model for real-time object detection system that can detect over 9000 object categories. It uses + /// a single network evaluation, enabling it to be more than 1000x faster than R-CNN and 100x faster than + /// Faster R-CNN. This model is trained with COCO dataset and contains 80 classes. + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 9. + YoloV2Coco, + /// A deep CNN model for real-time object detection that detects 80 different classes. A little bigger than + /// YOLOv2 but still very fast. As accurate as SSD but 3 times faster. + /// + /// Variant downloaded: ONNX Version 1.5 with Opset Version 10. + YoloV3, + /// A smaller version of YOLOv3 model. 
+ /// + /// Variant downloaded: ONNX Version 1.6 with Opset Version 11. + TinyYoloV3, + /// Optimizes the speed and accuracy of object detection. Two times faster than EfficientDet. It improves + /// YOLOv3's AP and FPS by 10% and 12%, respectively, with mAP50 of 52.32 on the COCO 2017 dataset and + /// FPS of 41.7 on Tesla 100. + /// + /// Variant downloaded: ONNX Version 1.6 with Opset Version 11. + YoloV4, + /// Deep CNN based pixel-wise semantic segmentation model with >80% mIOU (mean Intersection Over Union). + /// Trained on cityscapes dataset, which can be effectively implemented in self driving vehicle systems. + /// + /// Variant downloaded: ONNX Version 1.2.2 with Opset Version 7. + Duc, +} + +impl ModelUrl for ObjectDetectionImageSegmentation { + fn fetch_url(&self) -> &'static str { + match self { + ObjectDetectionImageSegmentation::TinyYoloV2 => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/tiny-yolov2/model/tinyyolov2-8.onnx", + ObjectDetectionImageSegmentation::Ssd => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/ssd/model/ssd-10.onnx", + ObjectDetectionImageSegmentation::SSDMobileNetV1 => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/ssd-mobilenetv1/model/ssd_mobilenet_v1_10.onnx", + ObjectDetectionImageSegmentation::FasterRcnn => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/faster-rcnn/model/FasterRCNN-10.onnx", + ObjectDetectionImageSegmentation::MaskRcnn => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/mask-rcnn/model/MaskRCNN-10.onnx", + ObjectDetectionImageSegmentation::RetinaNet => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/retinanet/model/retinanet-9.onnx", + ObjectDetectionImageSegmentation::YoloV2 => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/yolov2/model/yolov2-voc-8.onnx", + ObjectDetectionImageSegmentation::YoloV2Coco => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/yolov2-coco/model/yolov2-coco-9.onnx", + ObjectDetectionImageSegmentation::YoloV3 => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/yolov3/model/yolov3-10.onnx", + ObjectDetectionImageSegmentation::TinyYoloV3 => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/tiny-yolov3/model/tiny-yolov3-11.onnx", + ObjectDetectionImageSegmentation::YoloV4 => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/yolov4/model/yolov4.onnx", + ObjectDetectionImageSegmentation::Duc => "https://github.com/onnx/models/raw/main/vision/object_detection_segmentation/duc/model/ResNet101-DUC-7.onnx", + } + } +} + +impl From for AvailableOnnxModel { + fn from(model: ObjectDetectionImageSegmentation) -> Self { + AvailableOnnxModel::Vision(Vision::ObjectDetectionImageSegmentation(model)) + } +} diff --git a/rust/onnxruntime/src/environment.rs b/rust/onnxruntime/src/environment.rs new file mode 100644 index 0000000000000..04c34ab38c7b9 --- /dev/null +++ b/rust/onnxruntime/src/environment.rs @@ -0,0 +1,373 @@ +//! 
Module containing environment types + +use crate::{ + error::{status_to_result, OrtError, Result}, + onnxruntime::custom_logger, + session::SessionBuilder, + LoggingLevel, +}; +use once_cell::sync::OnceCell; +use onnxruntime_sys as sys; +use onnxruntime_sys::library_filename; +use std::{ + ffi::CString, + ptr::{null, null_mut}, + sync::{Arc, Mutex, MutexGuard}, +}; +use sys::{onnxruntime, ORT_API_VERSION}; +use tracing::{debug, warn}; + +pub(crate) static ENV: OnceCell>> = OnceCell::new(); + +pub(crate) static LIB: OnceCell = OnceCell::new(); + +#[derive(Debug)] +pub(crate) struct _EnvironmentSingleton { + name: CString, + pub(crate) env_ptr: *mut sys::OrtEnv, + + pub api: *const sys::OrtApi, +} + +impl _EnvironmentSingleton { + pub(crate) unsafe fn api(&self) -> sys::OrtApi { + *self.api + } +} + +unsafe impl Send for _EnvironmentSingleton {} + +unsafe impl Sync for _EnvironmentSingleton {} + +/// An [`Environment`](session/struct.Environment.html) is the main entry point of the ONNX Runtime. +/// +/// Only one ONNXRuntime environment can be created per process. The `onnxruntime` crate +/// uses a singleton (through `lazy_static!()`) to enforce this. +/// +/// Once an environment is created, a [`Session`](../session/struct.Session.html) +/// can be obtained from it. +/// +/// **NOTE**: While the [`Environment`](environment/struct.Environment.html) constructor takes a `name` parameter +/// to name the environment, only the first name will be considered if many environments +/// are created. +/// +/// # Example +/// +/// ```no_run +/// # use std::error::Error; +/// # use std::env::var; +/// # use onnxruntime::{environment::Environment, LoggingLevel}; +/// # fn main() -> Result<(), Box> { +/// # let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); +/// +/// let builder = Environment::builder() +/// .with_name("test") +/// .with_log_level(LoggingLevel::Warning); +/// +/// let builder = if let Some(path) = path { +/// builder.with_library_path(path) +/// } else { +/// builder +/// }; +/// let environment = builder.build()?; +/// # Ok(()) +/// # } +/// ``` +pub struct Environment { + pub(crate) env: _Environment, +} + +#[derive(Debug, Clone)] +pub(crate) struct _Environment { + env: Arc>, +} + +impl _Environment { + pub(crate) fn env(&self) -> MutexGuard<_EnvironmentSingleton> { + self.env.lock().expect("The lock is poisoned") + } +} + +impl std::fmt::Debug for Environment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.env.fmt(f) + } +} + +impl Environment { + /// Create a new environment builder using default values + /// (name: `default`, log level: [`LoggingLevel::Warning`](../enum.LoggingLevel.html#variant.Warning)) + #[must_use] + pub fn builder() -> EnvBuilder { + EnvBuilder { + name: "default".into(), + log_level: LoggingLevel::Warning, + path: None, + } + } + + /// Return the name of the current environment + #[must_use] + pub fn name(&self) -> String { + self.env().name.to_str().unwrap().to_string() + } + + pub(crate) fn env(&self) -> MutexGuard<_EnvironmentSingleton> { + self.env.env() + } + + #[tracing::instrument] + fn new(name: &str, log_level: LoggingLevel, path: Option) -> Result { + let lib = if let Some(path) = path { + LIB.get_or_try_init(|| unsafe { onnxruntime::new(path) })? + } else { + LIB.get_or_try_init(|| unsafe { onnxruntime::new(library_filename("onnxruntime")) })? 
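+            // When no explicit path is supplied, `library_filename("onnxruntime")` expands to the
+            // platform-specific shared-library name (e.g. `libonnxruntime.so` on Linux,
+            // `libonnxruntime.dylib` on macOS, `onnxruntime.dll` on Windows), so the runtime is
+            // resolved through the operating system's normal library search path.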
+ }; + let env = ENV.get_or_try_init(|| { + debug!("Environment not yet initialized, creating a new one."); + + let api = unsafe { (*lib.OrtGetApiBase()).GetApi.unwrap()(ORT_API_VERSION) }; + + let mut env_ptr: *mut sys::OrtEnv = std::ptr::null_mut(); + + let logging_function: sys::OrtLoggingFunction = Some(custom_logger); + // FIXME: What should go here? + let logger_param: *mut std::ffi::c_void = std::ptr::null_mut(); + + let cname = CString::new(name).unwrap(); + unsafe { + let create_env_with_custom_logger = (*api).CreateEnvWithCustomLogger.unwrap(); + let status = create_env_with_custom_logger( + logging_function, + logger_param, + log_level.into(), + cname.as_ptr(), + &mut env_ptr, + ); + + status_to_result(status).map_err(OrtError::Environment)?; + } + debug!( + env_ptr = format!("{:?}", env_ptr).as_str(), + "Environment created." + ); + + Ok::<_, OrtError>(Arc::new(Mutex::new(_EnvironmentSingleton { + name: cname, + env_ptr, + api, + }))) + })?; + + let mut guard = env.lock().expect("Lock is poisoned"); + + if guard.env_ptr.is_null() || guard.api.is_null() { + debug!("Environment not yet initialized, creating a new one."); + + let api = unsafe { (*lib.OrtGetApiBase()).GetApi.unwrap()(ORT_API_VERSION) }; + + let mut env_ptr: *mut sys::OrtEnv = std::ptr::null_mut(); + + let logging_function: sys::OrtLoggingFunction = Some(custom_logger); + // FIXME: What should go here? + let logger_param: *mut std::ffi::c_void = std::ptr::null_mut(); + + let cname = CString::new(name).unwrap(); + unsafe { + let create_env_with_custom_logger = (*api).CreateEnvWithCustomLogger.unwrap(); + let status = create_env_with_custom_logger( + logging_function, + logger_param, + log_level.into(), + cname.as_ptr(), + &mut env_ptr, + ); + + status_to_result(status).map_err(OrtError::Environment)?; + } + debug!( + env_ptr = format!("{:?}", env_ptr).as_str(), + "Environment created." + ); + + guard.env_ptr = env_ptr; + guard.api = api; + guard.name = cname; + } + + Ok(Environment { + env: _Environment { env: env.clone() }, + }) + } + + /// Create a new [`SessionBuilder`](../session/struct.SessionBuilder.html) + /// used to create a new ONNXRuntime session. + pub fn new_session_builder(&self) -> Result { + SessionBuilder::new(self) + } +} + +impl Drop for Environment { + fn drop(&mut self) { + if Arc::strong_count(ENV.get().unwrap()) == 2 { + let env = &mut *ENV.get().unwrap().lock().expect("Lock is poisoned"); + + unsafe { + let release_env = env.api().ReleaseEnv.unwrap(); + release_env(env.env_ptr); + + env.api = null(); + + env.env_ptr = null_mut(); + env.name = CString::default(); + }; + } + } +} + +/// Struct used to build an environment [`Environment`](environment/struct.Environment.html) +/// +/// This is the crate's main entry point. An environment _must_ be created +/// as the first step. An [`Environment`](environment/struct.Environment.html) can only be built +/// using `EnvBuilder` to configure it. +/// +/// **NOTE**: If the same configuration method (for example [`with_name()`](struct.EnvBuilder.html#method.with_name)) +/// is called multiple times, the last value will have precedence. +pub struct EnvBuilder { + name: String, + log_level: LoggingLevel, + path: Option, +} + +impl EnvBuilder { + /// Configure the environment with a given name + /// + /// **NOTE**: Since ONNXRuntime can only define one environment per process, + /// creating multiple environments using multiple `EnvBuilder` will + /// end up re-using the same environment internally; a new one will _not_ + /// be created. 
New parameters will be ignored. + pub fn with_name(mut self, name: S) -> EnvBuilder + where + S: Into, + { + self.name = name.into(); + self + } + + /// Add a library path to the Onnxruntime shared library. + /// + /// **Note**: The library path can be an absolute path or relative (to the executable) path. + /// If no library path is specified, it is expected that the OS can find the Onnxruntime shared + /// library in the normal manner to that OS. + pub fn with_library_path>(mut self, path: P) -> EnvBuilder { + self.path = Some(path.into()); + self + } + + /// Configure the environment with a given log level + /// + /// **NOTE**: Since ONNXRuntime can only define one environment per process, + /// creating multiple environments using multiple `EnvBuilder` will + /// end up re-using the same environment internally; a new one will _not_ + /// be created. New parameters will be ignored. + #[must_use] + pub fn with_log_level(mut self, log_level: LoggingLevel) -> EnvBuilder { + self.log_level = log_level; + self + } + + /// Commit the configuration to a new [`Environment`](environment/struct.Environment.html) + pub fn build(self) -> Result { + Environment::new(&self.name, self.log_level, self.path) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::env::var; + + use super::*; + use test_log::test; + + pub(crate) static ONNX_RUNTIME_LIBRARY_PATH: &str = "RUST_ONNXRUNTIME_LIBRARY_PATH"; + + #[test] + fn sequential_environment_creation() { + let first_name: String = "sequential_environment_creation".into(); + + let path = var(ONNX_RUNTIME_LIBRARY_PATH).ok(); + + let builder = Environment::builder() + .with_name(first_name.clone()) + .with_log_level(LoggingLevel::Warning); + + let builder = if let Some(path) = path.clone() { + builder.with_library_path(path) + } else { + builder + }; + + let env = builder.build().unwrap(); + + let mut prev_env_ptr = env.env().env_ptr; + + for i in 0..10 { + let name = format!("sequential_environment_creation: {}", i); + let builder = Environment::builder() + .with_name(name.clone()) + .with_log_level(LoggingLevel::Warning); + + let builder = if let Some(ref path) = path { + builder.with_library_path(path) + } else { + builder + }; + + let env = builder.build().unwrap(); + let next_env_ptr = env.env().env_ptr; + assert_eq!(next_env_ptr, prev_env_ptr); + prev_env_ptr = next_env_ptr; + } + } + + #[test] + fn concurrent_environment_creations() { + let initial_name = "concurrent_environment_creation"; + + let path = var(ONNX_RUNTIME_LIBRARY_PATH).ok(); + + let main_env = Environment::new(initial_name, LoggingLevel::Warning, path.clone()).unwrap(); + let main_env_ptr = main_env.env().env_ptr as usize; + + let children: Vec<_> = (0..10) + .map(|t| { + let path = path.clone(); + + std::thread::spawn(move || { + let name = format!("concurrent_environment_creation: {}", t); + let builder = Environment::builder() + .with_name(name.clone()) + .with_log_level(LoggingLevel::Warning); + + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + let env = builder.build().unwrap(); + + assert_eq!(env.env().env_ptr as usize, main_env_ptr); + }) + }) + .collect(); + + assert_eq!(main_env.env().env_ptr as usize, main_env_ptr); + + let res: Vec> = children + .into_iter() + .map(std::thread::JoinHandle::join) + .collect(); + assert!(res.into_iter().all(|r| std::result::Result::is_ok(&r))); + } +} diff --git a/rust/onnxruntime/src/error.rs b/rust/onnxruntime/src/error.rs new file mode 100644 index 0000000000000..fc44e2b33930e --- 
/dev/null +++ b/rust/onnxruntime/src/error.rs @@ -0,0 +1,249 @@ +//! Module containing error definitions. + +use std::{io, path::PathBuf}; + +use thiserror::Error; + +use onnxruntime_sys as sys; + +use crate::{char_p_to_string, environment::ENV}; + +/// Type alias for the `Result` +pub type Result = std::result::Result; + +/// Error type centralizing all possible errors +#[non_exhaustive] +#[derive(Error, Debug)] +pub enum OrtError { + /// For errors with libloading + #[error("Failed to load or call onnxruntime library {0}")] + Library(#[from] libloading::Error), + /// The C API can message to the caller using a C `char *` which needs to be converted + /// to Rust's `String`. This operation can fail. + #[error("Failed to construct String")] + StringConversion(OrtApiError), + // FIXME: Move these to another enum (they are C API calls errors) + /// An error occurred when creating an ONNXRuntime environment + #[error("Failed to create environment: {0}")] + Environment(OrtApiError), + /// Error occurred when creating an ONNXRuntime session options + #[error("Failed to create session options: {0}")] + SessionOptions(OrtApiError), + /// Error occurred when creating an ONNXRuntime session + #[error("Failed to create session: {0}")] + Session(OrtApiError), + /// Error occurred when creating an ONNXRuntime allocator + #[error("Failed to get allocator: {0}")] + Allocator(OrtApiError), + /// Error occurred when counting ONNXRuntime input or output count + #[error("Failed to get input or output count: {0}")] + InOutCount(OrtApiError), + /// Error occurred when getting ONNXRuntime input name + #[error("Failed to get input name: {0}")] + InputName(OrtApiError), + /// Error occurred when getting ONNXRuntime type information + #[error("Failed to get type info: {0}")] + GetTypeInfo(OrtApiError), + /// Error occurred when casting ONNXRuntime type information to tensor information + #[error("Failed to cast type info to tensor info: {0}")] + CastTypeInfoToTensorInfo(OrtApiError), + /// Error occurred when getting tensor elements type + #[error("Failed to get tensor element type: {0}")] + TensorElementType(OrtApiError), + /// Error occurred when getting ONNXRuntime dimensions count + #[error("Failed to get dimensions count: {0}")] + GetDimensionsCount(OrtApiError), + /// Error occurred when getting ONNXRuntime dimensions + #[error("Failed to get dimensions: {0}")] + GetDimensions(OrtApiError), + /// Error occurred when creating CPU memory information + #[error("Failed to get dimensions: {0}")] + CreateCpuMemoryInfo(OrtApiError), + /// Error occurred when creating ONNXRuntime tensor + #[error("Failed to create tensor: {0}")] + CreateTensor(OrtApiError), + /// Error occurred when creating ONNXRuntime tensor with specific data + #[error("Failed to create tensor with data: {0}")] + CreateTensorWithData(OrtApiError), + /// Error occurred when filling a tensor with string data + #[error("Failed to fill string tensor: {0}")] + FillStringTensor(OrtApiError), + /// Error occurred when checking if ONNXRuntime tensor was properly initialized + #[error("Failed to check if tensor: {0}")] + IsTensor(OrtApiError), + /// Error occurred when getting tensor type and shape + #[error("Failed to get tensor type and shape: {0}")] + GetTensorTypeAndShape(OrtApiError), + /// Error occurred when ONNXRuntime inference operation was called + #[error("Failed to run: {0}")] + Run(OrtApiError), + /// Error occurred when extracting data from an ONNXRuntime tensor into an C array to be used as an `ndarray::ArrayView` + #[error("Failed to get 
tensor data: {0}")] + GetTensorMutableData(OrtApiError), + + /// Error occurred when downloading a pre-trained ONNX model from the [ONNX Model Zoo](https://github.com/onnx/models) + #[error("Failed to download ONNX model: {0}")] + DownloadError(#[from] OrtDownloadError), + + /// Dimensions of input data and ONNX model loaded from file do not match + #[error("Dimensions do not match: {0:?}")] + NonMatchingDimensions(NonMatchingDimensionsError), + /// File does not exists + #[error("File {filename:?} does not exists")] + FileDoesNotExists { + /// Path which does not exists + filename: PathBuf, + }, + /// Path is an invalid UTF-8 + #[error("Path {path:?} cannot be converted to UTF-8")] + NonUtf8Path { + /// Path with invalid UTF-8 + path: PathBuf, + }, + /// Attempt to build a Rust `CString` from a null pointer + #[error("Failed to build CString when original contains null: {0}")] + CStringNulError(#[from] std::ffi::NulError), + #[error("{0} pointer should be null")] + /// Ort Pointer should have been null + PointerShouldBeNull(String), + /// Ort pointer should not have been null + #[error("{0} pointer should not be null")] + PointerShouldNotBeNull(String), + /// ONNXRuntime Model has invalid dimensions + #[error("Invalid dimensions")] + InvalidDimensions, + /// The runtime type was undefined + #[error("Undefined Tensor Element Type")] + UndefinedTensorElementType, + /// Error occurred when checking if ONNXRuntime tensor was properly initialized + #[error("Failed to check if tensor")] + IsTensorCheck, +} + +/// Error used when dimensions of input (from model and from inference call) +/// do not match (as they should). +#[non_exhaustive] +#[derive(Error, Debug)] +pub enum NonMatchingDimensionsError { + /// Number of inputs from model does not match number of inputs from inference call + #[error("Non-matching number of inputs: {inference_input_count:?} for input vs {model_input_count:?} for model (inputs: {inference_input:?}, model: {model_input:?})")] + InputsCount { + /// Number of input dimensions used by inference call + inference_input_count: usize, + /// Number of input dimensions defined in model + model_input_count: usize, + /// Input dimensions used by inference call + inference_input: Vec>, + /// Input dimensions defined in model + model_input: Vec>>, + }, + /// Inputs length from model does not match the expected input from inference call + #[error("Different input lengths: Expected Input: {model_input:?} vs Received Input: {inference_input:?}")] + InputsLength { + /// Input dimensions used by inference call + inference_input: Vec>, + /// Input dimensions defined in model + model_input: Vec>>, + }, +} + +/// Error details when ONNXRuntime C API fail +#[non_exhaustive] +#[derive(Error, Debug)] +pub enum OrtApiError { + /// Details as reported by the ONNXRuntime C API in case of error + #[error("Error calling ONNX Runtime C function: {0}")] + Msg(String), + /// Details as reported by the ONNXRuntime C API in case of error cannot be converted to UTF-8 + #[error("Error calling ONNX Runtime C function and failed to convert error message to UTF-8")] + IntoStringError(std::ffi::IntoStringError), +} + +/// Error from downloading pre-trained model from the [ONNX Model Zoo](https://github.com/onnx/models). 
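+///
+/// A minimal sketch of telling download failures apart from other runtime errors; the
+/// `describe` helper below is illustrative and not part of this crate:
+///
+/// ```no_run
+/// use onnxruntime::error::{OrtDownloadError, OrtError};
+///
+/// fn describe(err: &OrtError) -> &'static str {
+///     match err {
+///         OrtError::DownloadError(OrtDownloadError::ContentLengthError) => "missing content-length",
+///         OrtError::DownloadError(_) => "model download failed",
+///         _ => "other ONNX Runtime error",
+///     }
+/// }
+/// ```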
+#[non_exhaustive] +#[derive(Error, Debug)] +pub enum OrtDownloadError { + /// Generic input/output error + #[error("Error downloading data to file: {0}")] + IoError(#[from] io::Error), + #[cfg(feature = "model-fetching")] + /// Download error by ureq + #[error("Error downloading data to file: {0}")] + UreqError(#[from] Box), + /// Error getting content-length from an HTTP GET request + #[error("Error getting content-length")] + ContentLengthError, + /// Mismatch between amount of downloaded and expected bytes + #[error("Error copying data to file: expected {expected} length, received {io}")] + CopyError { + /// Expected amount of bytes to download + expected: u64, + /// Number of bytes read from network and written to file + io: u64, + }, +} + +/// Wrapper type around a ONNXRuntime C API's `OrtStatus` pointer +/// +/// This wrapper exists to facilitate conversion from C raw pointers to Rust error types +pub struct OrtStatusWrapper(*const sys::OrtStatus); + +impl From<*const sys::OrtStatus> for OrtStatusWrapper { + fn from(status: *const sys::OrtStatus) -> Self { + OrtStatusWrapper(status) + } +} + +pub(crate) fn assert_null_pointer(ptr: *const T, name: &str) -> Result<()> { + ptr.is_null() + .then_some(()) + .ok_or_else(|| OrtError::PointerShouldBeNull(name.to_owned())) +} + +pub(crate) fn assert_not_null_pointer(ptr: *const T, name: &str) -> Result<()> { + (!ptr.is_null()) + .then_some(()) + .ok_or_else(|| OrtError::PointerShouldBeNull(name.to_owned())) +} + +impl From for std::result::Result<(), OrtApiError> { + fn from(status: OrtStatusWrapper) -> Self { + if status.0.is_null() { + Ok(()) + } else { + let raw: *const i8 = unsafe { + ENV.get() + .unwrap() + .lock() + .unwrap() + .api() + .GetErrorMessage + .unwrap()(status.0) + }; + match char_p_to_string(raw) { + Ok(msg) => Err(OrtApiError::Msg(msg)), + Err(err) => match err { + OrtError::StringConversion(OrtApiError::IntoStringError(e)) => { + Err(OrtApiError::IntoStringError(e)) + } + _ => unreachable!(), + }, + } + } + } +} + +pub(crate) fn status_to_result( + status: *const sys::OrtStatus, +) -> std::result::Result<(), OrtApiError> { + let status_wrapper: OrtStatusWrapper = status.into(); + status_wrapper.into() +} + +/// A wrapper around a function on `OrtApi` that maps the status code into [`OrtApiError`] +pub(crate) unsafe fn call_ort(mut f: F) -> std::result::Result<(), OrtApiError> +where + F: FnMut(sys::OrtApi) -> *const sys::OrtStatus, +{ + status_to_result(f(ENV.get().unwrap().lock().unwrap().api())) +} diff --git a/rust/onnxruntime/src/lib.rs b/rust/onnxruntime/src/lib.rs new file mode 100644 index 0000000000000..ce4721ef4240f --- /dev/null +++ b/rust/onnxruntime/src/lib.rs @@ -0,0 +1,560 @@ +#![warn(missing_docs)] + +//! ONNX Runtime +//! +//! This crate is a (safe) wrapper around Microsoft's [ONNX Runtime](https://github.com/microsoft/onnxruntime/) +//! through its C API. +//! +//! From its [GitHub page](https://github.com/microsoft/onnxruntime/): +//! +//! > ONNX Runtime is a cross-platform, high performance ML inferencing and training accelerator. +//! +//! The (highly) unsafe [C API](https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h) +//! is wrapped using bindgen as [`onnxruntime-sys`](https://crates.io/crates/onnxruntime-sys). +//! +//! The unsafe bindings are wrapped in this crate to expose a safe API. +//! +//! For now, efforts are concentrated on the inference API. Training is _not_ supported. +//! +//! # Example +//! +//! 
The C++ example that uses the C API +//! ([`C_Api_Sample.cpp`](https://github.com/microsoft/onnxruntime/blob/v1.3.1/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests.Capi/C_Api_Sample.cpp)) +//! was ported to +//! [`onnxruntime`](https://github.com/nbigaouette/onnxruntime-rs/blob/main/onnxruntime/examples/sample.rs). +//! +//! First, an environment must be created using and [`EnvBuilder`](environment/struct.EnvBuilder.html): +//! +//! ```no_run +//! # use std::error::Error; +//! # use std::env::var; +//! # use onnxruntime::{environment::Environment, LoggingLevel}; +//! # fn main() -> Result<(), Box> { +//! # let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); +//! +//! let builder = Environment::builder() +//! .with_name("test") +//! .with_log_level(LoggingLevel::Warning); +//! +//! let builder = if let Some(path) = path { +//! builder.with_library_path(path) +//! } else { +//! builder +//! }; +//! let environment = builder.build()?; +//! Ok(()) +//! } +//! ``` +//! +//! Then a [`Session`](session/struct.Session.html) is created from the environment, some options and an ONNX model file: +//! +//! ```no_run +//! # use std::error::Error; +//! # use std::env::var; +//! # use onnxruntime::{environment::Environment, LoggingLevel, GraphOptimizationLevel}; +//! # fn main() -> Result<(), Box> { +//! # let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); +//! # +//! # let builder = Environment::builder() +//! # .with_name("test") +//! # .with_log_level(LoggingLevel::Warning); +//! # +//! # let builder = if let Some(path) = path { +//! # builder.with_library_path(path) +//! # } else { +//! # builder +//! # }; +//! # let environment = builder.build()?; +//! let mut session = environment +//! .new_session_builder()? +//! .with_graph_optimization_level(GraphOptimizationLevel::Basic)? +//! .with_intra_op_num_threads(1)? +//! .with_model_from_file("squeezenet.onnx")?; +//! # Ok(()) +//! # } +//! ``` +#![cfg_attr( + feature = "model-fetching", + doc = r##" +Instead of loading a model from file using [`with_model_from_file()`](session/struct.SessionBuilder.html#method.with_model_from_file), +a model can be fetched directly from the [ONNX Model Zoo](https://github.com/onnx/models) using +[`with_model_downloaded()`](session/struct.SessionBuilder.html#method.with_model_downloaded) method +(requires the `model-fetching` feature). + +```no_run +# use std::error::Error; +# use std::env::var; +# use onnxruntime::{environment::Environment, download::vision::ImageClassification, LoggingLevel, GraphOptimizationLevel}; +# fn main() -> Result<(), Box> { +# let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); +# +# let builder = Environment::builder() +# .with_name("test") +# .with_log_level(LoggingLevel::Warning); +# +# let builder = if let Some(path) = path { +# builder.with_library_path(path) +# } else { +# builder +# }; +# let environment = builder.build()?; + +let mut session = environment + .new_session_builder()? + .with_graph_optimization_level(GraphOptimizationLevel::Basic)? + .with_intra_op_num_threads(1)? + .with_model_downloaded(ImageClassification::SqueezeNet)?; +# Ok(()) +# } +``` + +See [`AvailableOnnxModel`](download/enum.AvailableOnnxModel.html) for the different models available +to download. +"## +)] +//! +//! Inference will be run on data passed as an [`ndarray::Array`](https://docs.rs/ndarray/latest/ndarray/type.Array.html). +//! +//! ```no_run +//! # use std::error::Error; +//! # use std::env::var; +//! 
# use onnxruntime::{environment::Environment, LoggingLevel, GraphOptimizationLevel, tensor::construct::ConstructTensor}; +//! # fn main() -> Result<(), Box> { +//! # let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); +//! # +//! # let builder = Environment::builder() +//! # .with_name("test") +//! # .with_log_level(LoggingLevel::Warning); +//! # +//! # let builder = if let Some(path) = path { +//! # builder.with_library_path(path) +//! # } else { +//! # builder +//! # }; +//! # let environment = builder.build()?; +//! # let mut session = environment +//! # .new_session_builder()? +//! # .with_graph_optimization_level(GraphOptimizationLevel::Basic)? +//! # .with_intra_op_num_threads(1)? +//! # .with_model_from_file("squeezenet.onnx")?; +//! let array = ndarray::Array::linspace(0.0_f32, 1.0, 100); +//! // Multiple inputs and outputs are possible +//! let input_tensor = vec![array.into()]; +//! let outputs = session.run(input_tensor)?; +//! # Ok(()) +//! # } +//! ``` +//! +//! The outputs are of type [`OrtOwnedTensor`](tensor/ort_owned_tensor/struct.OrtOwnedTensor.html)s inside a vector, +//! with the same length as the inputs. +//! +//! See the [`sample.rs`](https://github.com/nbigaouette/onnxruntime-rs/blob/main/onnxruntime/examples/sample.rs) +//! example for more details. + +use onnxruntime_sys as sys; + +// Make functions `extern "stdcall"` for Windows 32bit. +// This behaviors like `extern "system"`. +#[cfg(all(target_os = "windows", target_arch = "x86"))] +macro_rules! extern_system_fn { + ($(#[$meta:meta])* fn $($tt:tt)*) => ($(#[$meta])* extern "stdcall" fn $($tt)*); + ($(#[$meta:meta])* $vis:vis fn $($tt:tt)*) => ($(#[$meta])* $vis extern "stdcall" fn $($tt)*); + ($(#[$meta:meta])* unsafe fn $($tt:tt)*) => ($(#[$meta])* unsafe extern "stdcall" fn $($tt)*); + ($(#[$meta:meta])* $vis:vis unsafe fn $($tt:tt)*) => ($(#[$meta])* $vis unsafe extern "stdcall" fn $($tt)*); +} + +// Make functions `extern "C"` for normal targets. +// This behaviors like `extern "system"`. +#[cfg(not(all(target_os = "windows", target_arch = "x86")))] +macro_rules! extern_system_fn { + ($(#[$meta:meta])* fn $($tt:tt)*) => ($(#[$meta])* extern "C" fn $($tt)*); + ($(#[$meta:meta])* $vis:vis fn $($tt:tt)*) => ($(#[$meta])* $vis extern "C" fn $($tt)*); + ($(#[$meta:meta])* unsafe fn $($tt:tt)*) => ($(#[$meta])* unsafe extern "C" fn $($tt)*); + ($(#[$meta:meta])* $vis:vis unsafe fn $($tt:tt)*) => ($(#[$meta])* $vis unsafe extern "C" fn $($tt)*); +} + +pub mod download; +pub mod environment; +pub mod error; +mod memory; +pub mod session; +pub mod tensor; + +// Re-export +pub use error::{OrtApiError, OrtError, Result}; +use sys::OnnxEnumInt; + +// Re-export ndarray as it's part of the public API anyway +pub use ndarray; + +fn char_p_to_string(raw: *const i8) -> Result { + let c_string = unsafe { std::ffi::CStr::from_ptr(raw as *mut i8).to_owned() }; + + match c_string.into_string() { + Ok(string) => Ok(string), + Err(e) => Err(OrtApiError::IntoStringError(e)), + } + .map_err(OrtError::StringConversion) +} + +mod onnxruntime { + //! Module containing a custom logger, used to catch the runtime's own logging and send it + //! to Rust's tracing logging instead. + + use std::ffi::CStr; + use tracing::{debug, error, info, span, trace, warn, Level}; + + use onnxruntime_sys as sys; + + /// Runtime's logging sends the code location where the log happened, will be parsed to this struct. 
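+    ///
+    /// A hypothetical location string such as `"onnxruntime/core/framework/execution_frame.cc:42 GetOutput"`
+    /// splits into `file = "onnxruntime/core/framework/execution_frame.cc"`, `line_number = "42"`
+    /// and `function = "GetOutput"`.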
+ #[derive(Debug)] + struct CodeLocation<'a> { + file: &'a str, + line_number: &'a str, + function: &'a str, + } + + impl<'a> From<&'a str> for CodeLocation<'a> { + fn from(code_location: &'a str) -> Self { + let mut splitter = code_location.split(' '); + let file_and_line_number = splitter.next().unwrap_or(""); + let function = splitter.next().unwrap_or(""); + let mut file_and_line_number_splitter = file_and_line_number.split(':'); + let file = file_and_line_number_splitter + .next() + .unwrap_or(""); + let line_number = file_and_line_number_splitter + .next() + .unwrap_or(""); + + CodeLocation { + file, + line_number, + function, + } + } + } + + extern_system_fn! { + /// Callback from C that will handle the logging, forwarding the runtime's logs to the tracing crate. + pub(crate) fn custom_logger( + _params: *mut std::ffi::c_void, + severity: sys::OrtLoggingLevel, + category: *const i8, + logid: *const i8, + code_location: *const i8, + message: *const i8, + ) { + let log_level = match severity { + sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_VERBOSE => Level::TRACE, + sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_INFO => Level::DEBUG, + sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING => Level::INFO, + sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR => Level::WARN, + sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_FATAL => Level::ERROR, + }; + + assert_ne!(category, std::ptr::null()); + let category = unsafe { CStr::from_ptr(category) }; + assert_ne!(code_location, std::ptr::null()); + let code_location = unsafe { CStr::from_ptr(code_location) } + .to_str() + .unwrap_or("unknown"); + assert_ne!(message, std::ptr::null()); + let message = unsafe { CStr::from_ptr(message) }; + + assert_ne!(logid, std::ptr::null()); + let logid = unsafe { CStr::from_ptr(logid) }; + + // Parse the code location + let code_location: CodeLocation = code_location.into(); + + let span = span!( + Level::TRACE, + "onnxruntime", + category = category.to_str().unwrap_or(""), + file = code_location.file, + line_number = code_location.line_number, + function = code_location.function, + logid = logid.to_str().unwrap_or(""), + ); + let _enter = span.enter(); + + match log_level { + Level::TRACE => trace!("{:?}", message), + Level::DEBUG => debug!("{:?}", message), + Level::INFO => info!("{:?}", message), + Level::WARN => warn!("{:?}", message), + Level::ERROR => error!("{:?}", message), + } + } + } +} + +/// Logging level of the ONNX Runtime C API +#[derive(Debug, Clone, Copy)] +#[cfg_attr(not(windows), repr(u32))] +#[cfg_attr(windows, repr(i32))] +pub enum LoggingLevel { + /// Verbose log level + Verbose = sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_VERBOSE as OnnxEnumInt, + /// Info log level + Info = sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_INFO as OnnxEnumInt, + /// Warning log level + Warning = sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING as OnnxEnumInt, + /// Error log level + Error = sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR as OnnxEnumInt, + /// Fatal log level + Fatal = sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_FATAL as OnnxEnumInt, +} + +impl From for sys::OrtLoggingLevel { + fn from(val: LoggingLevel) -> Self { + match val { + LoggingLevel::Verbose => sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_VERBOSE, + LoggingLevel::Info => sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_INFO, + LoggingLevel::Warning => sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, + LoggingLevel::Error => sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_ERROR, + LoggingLevel::Fatal => sys::OrtLoggingLevel::ORT_LOGGING_LEVEL_FATAL, + } + } +} + +/// Optimization level performed 
by ONNX Runtime of the loaded graph +/// +/// See the [official documentation](https://github.com/microsoft/onnxruntime/blob/main/docs/ONNX_Runtime_Graph_Optimizations.md) +/// for more information on the different optimization levels. +#[derive(Debug)] +#[cfg_attr(not(windows), repr(u32))] +#[cfg_attr(windows, repr(i32))] +pub enum GraphOptimizationLevel { + /// Disable optimization + DisableAll = sys::GraphOptimizationLevel::ORT_DISABLE_ALL as OnnxEnumInt, + /// Basic optimization + Basic = sys::GraphOptimizationLevel::ORT_ENABLE_BASIC as OnnxEnumInt, + /// Extended optimization + Extended = sys::GraphOptimizationLevel::ORT_ENABLE_EXTENDED as OnnxEnumInt, + /// Add optimization + All = sys::GraphOptimizationLevel::ORT_ENABLE_ALL as OnnxEnumInt, +} + +impl From for sys::GraphOptimizationLevel { + fn from(val: GraphOptimizationLevel) -> Self { + use GraphOptimizationLevel::{All, Basic, DisableAll, Extended}; + match val { + DisableAll => sys::GraphOptimizationLevel::ORT_DISABLE_ALL, + Basic => sys::GraphOptimizationLevel::ORT_ENABLE_BASIC, + Extended => sys::GraphOptimizationLevel::ORT_ENABLE_EXTENDED, + All => sys::GraphOptimizationLevel::ORT_ENABLE_ALL, + } + } +} + +// FIXME: Use https://docs.rs/bindgen/0.54.1/bindgen/struct.Builder.html#method.rustified_enum +// FIXME: Add tests to cover the commented out types +/// Enum mapping ONNX Runtime's supported tensor types +#[derive(Debug)] +#[cfg_attr(not(windows), repr(u32))] +#[cfg_attr(windows, repr(i32))] +pub enum TensorElementDataType { + /// 32-bit floating point, equivalent to Rust's `f32` + Float = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT as OnnxEnumInt, + /// Unsigned 8-bit int, equivalent to Rust's `u8` + Uint8 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 as OnnxEnumInt, + /// Signed 8-bit int, equivalent to Rust's `i8` + Int8 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 as OnnxEnumInt, + /// Unsigned 16-bit int, equivalent to Rust's `u16` + Uint16 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 as OnnxEnumInt, + /// Signed 16-bit int, equivalent to Rust's `i16` + Int16 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 as OnnxEnumInt, + /// Signed 32-bit int, equivalent to Rust's `i32` + Int32 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 as OnnxEnumInt, + /// Signed 64-bit int, equivalent to Rust's `i64` + Int64 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 as OnnxEnumInt, + /// String, equivalent to Rust's `String` + String = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING as OnnxEnumInt, + // /// Boolean, equivalent to Rust's `bool` + // Bool = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL as OnnxEnumInt, + // /// 16-bit floating point, equivalent to Rust's `f16` + // Float16 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 as OnnxEnumInt, + /// 64-bit floating point, equivalent to Rust's `f64` + Double = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE as OnnxEnumInt, + /// Unsigned 32-bit int, equivalent to Rust's `u32` + Uint32 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 as OnnxEnumInt, + /// Unsigned 64-bit int, equivalent to Rust's `u64` + Uint64 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 as OnnxEnumInt, + // /// Complex 64-bit floating point, equivalent to Rust's `???` + // Complex64 = 
sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 as OnnxEnumInt, + // /// Complex 128-bit floating point, equivalent to Rust's `???` + // Complex128 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 as OnnxEnumInt, + // /// Brain 16-bit floating point + // Bfloat16 = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 as OnnxEnumInt, +} + +impl From for sys::ONNXTensorElementDataType { + fn from(val: TensorElementDataType) -> Self { + use TensorElementDataType::{ + Double, Float, Int16, Int32, Int64, Int8, String, Uint16, Uint32, Uint64, Uint8, + }; + match val { + Float => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, + Uint8 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, + Int8 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, + Uint16 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16, + Int16 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16, + Int32 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, + Int64 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, + String => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING, + // Bool => { + // sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL + // } + // Float16 => { + // sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 + // } + Double => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, + Uint32 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32, + Uint64 => sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64, + // Complex64 => { + // sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 + // } + // Complex128 => { + // sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 + // } + // Bfloat16 => { + // sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 + // } + } + } +} + +/// Trait used to map Rust types (for example `f32`) to ONNX types (for example `Float`) +pub trait TypeToTensorElementDataType { + /// Return the ONNX type for a Rust type + fn tensor_element_data_type() -> TensorElementDataType; + + /// If the type is `String`, returns `Some` with utf8 contents, else `None`. + fn try_utf8_bytes(&self) -> Option<&[u8]>; +} + +macro_rules! impl_type_trait { + ($type_:ty, $variant:ident) => { + impl TypeToTensorElementDataType for $type_ { + fn tensor_element_data_type() -> TensorElementDataType { + // unsafe { std::mem::transmute(TensorElementDataType::$variant) } + TensorElementDataType::$variant + } + + fn try_utf8_bytes(&self) -> Option<&[u8]> { + None + } + } + }; +} + +impl_type_trait!(f32, Float); +impl_type_trait!(u8, Uint8); +impl_type_trait!(i8, Int8); +impl_type_trait!(u16, Uint16); +impl_type_trait!(i16, Int16); +impl_type_trait!(i32, Int32); +impl_type_trait!(i64, Int64); +// impl_type_trait!(bool, Bool); +// impl_type_trait!(f16, Float16); +impl_type_trait!(f64, Double); +impl_type_trait!(u32, Uint32); +impl_type_trait!(u64, Uint64); +// impl_type_trait!(, Complex64); +// impl_type_trait!(, Complex128); +// impl_type_trait!(, Bfloat16); + +/// Adapter for common Rust string types to Onnx strings. 
+/// +/// It should be easy to use both `String` and `&str` as [`TensorElementDataType::String`] data, but +/// we can't define an automatic implementation for anything that implements `AsRef` as it +/// would conflict with the implementations of [`TypeToTensorElementDataType`] for primitive numeric +/// types (which might implement `AsRef` at some point in the future). +pub trait Utf8Data { + /// Returns the utf8 contents. + fn utf8_bytes(&self) -> &[u8]; +} + +impl Utf8Data for String { + fn utf8_bytes(&self) -> &[u8] { + self.as_bytes() + } +} + +impl<'a> Utf8Data for &'a str { + fn utf8_bytes(&self) -> &[u8] { + self.as_bytes() + } +} + +impl TypeToTensorElementDataType for T { + fn tensor_element_data_type() -> TensorElementDataType { + TensorElementDataType::String + } + + fn try_utf8_bytes(&self) -> Option<&[u8]> { + Some(self.utf8_bytes()) + } +} + +/// Allocator type +#[derive(Debug, Clone)] +#[repr(i32)] +pub enum AllocatorType { + // Invalid = sys::OrtAllocatorType::Invalid as i32, + /// Device allocator + Device = sys::OrtAllocatorType::OrtDeviceAllocator as i32, + /// Arena allocator + Arena = sys::OrtAllocatorType::OrtArenaAllocator as i32, +} + +impl From for sys::OrtAllocatorType { + fn from(val: AllocatorType) -> Self { + use AllocatorType::{Arena, Device}; + match val { + // Invalid => sys::OrtAllocatorType::Invalid, + Device => sys::OrtAllocatorType::OrtDeviceAllocator, + Arena => sys::OrtAllocatorType::OrtArenaAllocator, + } + } +} + +/// Memory type +/// +/// Only support ONNX's default type for now. +#[derive(Debug, Clone)] +#[repr(i32)] +pub enum MemType { + // FIXME: C API's `OrtMemType_OrtMemTypeCPU` defines it equal to `OrtMemType_OrtMemTypeCPUOutput`. How to handle this?? + // CPUInput = sys::OrtMemType::OrtMemTypeCPUInput as i32, + // CPUOutput = sys::OrtMemType::OrtMemTypeCPUOutput as i32, + // CPU = sys::OrtMemType::OrtMemTypeCPU as i32, + /// Default memory type + Default = sys::OrtMemType::OrtMemTypeDefault as i32, +} + +impl From for sys::OrtMemType { + fn from(val: MemType) -> Self { + use MemType::Default; + match val { + // CPUInput => sys::OrtMemType::OrtMemTypeCPUInput, + // CPUOutput => sys::OrtMemType::OrtMemTypeCPUOutput, + // CPU => sys::OrtMemType::OrtMemTypeCPU, + Default => sys::OrtMemType::OrtMemTypeDefault, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_char_p_to_string() { + let s = std::ffi::CString::new("foo").unwrap(); + let ptr = s.as_c_str().as_ptr(); + assert_eq!("foo", char_p_to_string(ptr).unwrap()); + } +} diff --git a/rust/onnxruntime/src/memory.rs b/rust/onnxruntime/src/memory.rs new file mode 100644 index 0000000000000..1688d433fe276 --- /dev/null +++ b/rust/onnxruntime/src/memory.rs @@ -0,0 +1,81 @@ +use tracing::debug; + +use onnxruntime_sys as sys; + +use crate::{ + environment::{Environment, _Environment}, + error::{assert_not_null_pointer, status_to_result, OrtError, Result}, + AllocatorType, MemType, +}; + +use tracing::error; + +#[derive(Debug)] +pub struct MemoryInfo { + pub ptr: *mut sys::OrtMemoryInfo, + env: _Environment, +} + +impl MemoryInfo { + #[tracing::instrument] + pub fn new(allocator: AllocatorType, memory_type: MemType, env: &Environment) -> Result { + debug!("Creating new memory info."); + let mut memory_info_ptr: *mut sys::OrtMemoryInfo = std::ptr::null_mut(); + let status = unsafe { + env.env().api().CreateCpuMemoryInfo.unwrap()( + allocator.into(), + memory_type.into(), + &mut memory_info_ptr, + ) + }; + status_to_result(status).map_err(OrtError::CreateCpuMemoryInfo)?; + 
assert_not_null_pointer(memory_info_ptr, "MemoryInfo")?; + + Ok(Self { + ptr: memory_info_ptr, + env: env.env.clone(), + }) + } +} + +impl Drop for MemoryInfo { + #[tracing::instrument] + fn drop(&mut self) { + if self.ptr.is_null() { + error!("MemoryInfo pointer is null, not dropping."); + } else { + debug!("Dropping the memory information."); + unsafe { self.env.env().api().ReleaseMemoryInfo.unwrap()(self.ptr) }; + } + + self.ptr = std::ptr::null_mut(); + } +} + +#[cfg(test)] +mod tests { + use std::env::var; + + use super::*; + use crate::{environment::tests::ONNX_RUNTIME_LIBRARY_PATH, LoggingLevel}; + use test_log::test; + + #[test] + fn memory_info_constructor_destructor() { + let path = var(ONNX_RUNTIME_LIBRARY_PATH).ok(); + + let builder = Environment::builder() + .with_name("test") + .with_log_level(LoggingLevel::Warning); + + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + let env = builder.build().unwrap(); + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, &env).unwrap(); + std::mem::drop(memory_info); + } +} diff --git a/rust/onnxruntime/src/session.rs b/rust/onnxruntime/src/session.rs new file mode 100644 index 0000000000000..326426e35982c --- /dev/null +++ b/rust/onnxruntime/src/session.rs @@ -0,0 +1,806 @@ +//! Module containing session types + +use std::{convert::TryFrom, ffi::CString, fmt::Debug, path::Path}; + +#[cfg(not(target_family = "windows"))] +use std::os::unix::ffi::OsStrExt; +#[cfg(target_family = "windows")] +use std::os::windows::ffi::OsStrExt; + +#[cfg(feature = "model-fetching")] +use std::env; + +use crate::{ + char_p_to_string, + environment::{Environment, _Environment}, + error::{ + assert_not_null_pointer, assert_null_pointer, status_to_result, NonMatchingDimensionsError, + OrtApiError, OrtError, Result, + }, + memory::MemoryInfo, + tensor::{ + construct::ConstructTensor, + ort_output_tensor::{OrtOutput, OrtOwnedTensorExtractor}, + OrtOutputTensor, + }, + AllocatorType, GraphOptimizationLevel, MemType, TensorElementDataType, +}; +use onnxruntime_sys as sys; + +use tracing::{debug, error}; + +#[cfg(feature = "model-fetching")] +use crate::{download::AvailableOnnxModel, error::OrtDownloadError}; + +/// Type used to create a session using the _builder pattern_ +/// +/// A `SessionBuilder` is created by calling the +/// [`Environment::new_session_builder()`](../env/struct.Environment.html#method.new_session_builder) +/// method on the environment. +/// +/// Once created, use the different methods to configure the session. +/// +/// Once configured, use the [`SessionBuilder::with_model_from_file()`](../session/struct.SessionBuilder.html#method.with_model_from_file) +/// method to "commit" the builder configuration into a [`Session`](../session/struct.Session.html). +/// +/// # Example +/// +/// ```no_run +/// # use std::error::Error; +/// # use std::env::var; +/// # use onnxruntime::{environment::Environment, LoggingLevel, GraphOptimizationLevel}; +/// # fn main() -> Result<(), Box> { +/// # let path = var("RUST_ONNXRUNTIME_LIBRARY_PATH").ok(); +/// +/// let builder = Environment::builder() +/// .with_name("test") +/// .with_log_level(LoggingLevel::Warning); +/// +/// let builder = if let Some(path) = path { +/// builder.with_library_path(path) +/// } else { +/// builder +/// }; +/// let environment = builder.build()?; +/// +/// let mut session = environment +/// .new_session_builder()? +/// .with_graph_optimization_level(GraphOptimizationLevel::Basic)? 
+/// .with_intra_op_num_threads(1)? +/// .with_model_from_file("squeezenet.onnx")?; +/// # Ok(()) +/// # } +/// ``` +#[derive(Debug)] +pub struct SessionBuilder<'a> { + env: &'a Environment, + session_options_ptr: *mut sys::OrtSessionOptions, + + allocator: AllocatorType, + memory_type: MemType, +} + +impl<'a> Drop for SessionBuilder<'a> { + #[tracing::instrument] + fn drop(&mut self) { + if self.session_options_ptr.is_null() { + error!("Session options pointer is null, not dropping"); + } else { + debug!("Dropping the session options."); + unsafe { + self.env.env().api().ReleaseSessionOptions.unwrap()(self.session_options_ptr) + }; + } + } +} + +impl<'a> SessionBuilder<'a> { + pub(crate) fn new(env: &'a Environment) -> Result> { + let mut session_options_ptr: *mut sys::OrtSessionOptions = std::ptr::null_mut(); + let status = + unsafe { env.env().api().CreateSessionOptions.unwrap()(&mut session_options_ptr) }; + + status_to_result(status).map_err(OrtError::SessionOptions)?; + assert_null_pointer(status, "SessionStatus")?; + assert_not_null_pointer(session_options_ptr, "SessionOptions")?; + + Ok(SessionBuilder { + env, + session_options_ptr, + allocator: AllocatorType::Arena, + memory_type: MemType::Default, + }) + } + + /// Configure the session to use a number of threads + pub fn with_intra_op_num_threads(self, num_threads: i16) -> Result> { + // FIXME: Pre-built binaries use OpenMP, set env variable instead + + // We use a u16 in the builder to cover the 16-bits positive values of a i32. + let num_threads = i32::from(num_threads); + let status = unsafe { + self.env.env().api().SetIntraOpNumThreads.unwrap()( + self.session_options_ptr, + num_threads, + ) + }; + status_to_result(status).map_err(OrtError::SessionOptions)?; + assert_null_pointer(status, "SessionStatus")?; + Ok(self) + } + + /// Set the session's optimization level + pub fn with_graph_optimization_level( + self, + opt_level: GraphOptimizationLevel, + ) -> Result> { + // Sets graph optimization level + unsafe { + self.env + .env() + .api() + .SetSessionGraphOptimizationLevel + .unwrap()(self.session_options_ptr, opt_level.into()) + }; + Ok(self) + } + + /// Set the session's allocator + /// + /// Defaults to [`AllocatorType::Arena`](../enum.AllocatorType.html#variant.Arena) + pub fn with_allocator(mut self, allocator: AllocatorType) -> Result> { + self.allocator = allocator; + Ok(self) + } + + /// Set the session's memory type + /// + /// Defaults to [`MemType::Default`](../enum.MemType.html#variant.Default) + pub fn with_memory_type(mut self, memory_type: MemType) -> Result> { + self.memory_type = memory_type; + Ok(self) + } + + /// Download an ONNX pre-trained model from the [ONNX Model Zoo](https://github.com/onnx/models) and commit the session + #[cfg(feature = "model-fetching")] + pub fn with_model_downloaded(self, model: M) -> Result + where + M: Into, + { + self.with_model_downloaded_monomorphized(model.into()) + } + + #[cfg(feature = "model-fetching")] + fn with_model_downloaded_monomorphized(self, model: AvailableOnnxModel) -> Result { + let download_dir = env::current_dir().map_err(OrtDownloadError::IoError)?; + let downloaded_path = model.download_to(download_dir)?; + self.with_model_from_file(downloaded_path) + } + + // TODO: Add all functions changing the options. + // See all OrtApi methods taking a `options: *mut OrtSessionOptions`. + + /// Load an ONNX graph from a file and commit the session + pub fn with_model_from_file

(self, model_filepath_ref: P) -> Result + where + P: AsRef + 'a, + { + let model_filepath = model_filepath_ref.as_ref(); + let mut session_ptr: *mut sys::OrtSession = std::ptr::null_mut(); + + if !model_filepath.exists() { + return Err(OrtError::FileDoesNotExists { + filename: model_filepath.to_path_buf(), + }); + } + + // Build an OsString than a vector of bytes to pass to C + let model_path = std::ffi::OsString::from(model_filepath); + #[cfg(target_family = "windows")] + let model_path: Vec = model_path + .encode_wide() + .chain(std::iter::once(0)) // Make sure we have a null terminated string + .collect(); + #[cfg(not(target_family = "windows"))] + let model_path: Vec = model_path + .as_bytes() + .iter() + .chain(std::iter::once(&b'\0')) // Make sure we have a null terminated string + .map(|b| *b as std::os::raw::c_char) + .collect(); + + unsafe { + let api = self.env.env().api(); + + let status = api.CreateSession.unwrap()( + self.env.env().env_ptr, + model_path.as_ptr(), + self.session_options_ptr, + &mut session_ptr, + ); + + status_to_result(status).map_err(OrtError::Session)?; + assert_null_pointer(status, "SessionStatus")?; + assert_not_null_pointer(session_ptr, "Session")?; + }; + let mut allocator_ptr: *mut sys::OrtAllocator = std::ptr::null_mut(); + let status = unsafe { + self.env.env().api().GetAllocatorWithDefaultOptions.unwrap()(&mut allocator_ptr) + }; + status_to_result(status).map_err(OrtError::Allocator)?; + assert_null_pointer(status, "SessionStatus")?; + assert_not_null_pointer(allocator_ptr, "Allocator")?; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, &self.env)?; + unsafe { + // Extract input and output properties + let num_input_nodes = + dangerous::extract_inputs_count(session_ptr, self.env.env.clone())?; + let num_output_nodes = + dangerous::extract_outputs_count(session_ptr, self.env.env.clone())?; + let inputs = (0..num_input_nodes) + .map(|i| { + dangerous::extract_input(session_ptr, allocator_ptr, i, self.env.env.clone()) + }) + .collect::>>()?; + let outputs = (0..num_output_nodes) + .map(|i| { + dangerous::extract_output(session_ptr, allocator_ptr, i, self.env.env.clone()) + }) + .collect::>>()?; + + Ok(Session { + env: self.env.env.clone(), + session_ptr, + allocator_ptr, + memory_info, + inputs, + outputs, + }) + } + } + + /// Load an ONNX graph from memory and commit the session + pub fn with_model_from_memory(self, model_bytes: B) -> Result + where + B: AsRef<[u8]>, + { + self.with_model_from_memory_monomorphized(model_bytes.as_ref()) + } + + fn with_model_from_memory_monomorphized(self, model_bytes: &[u8]) -> Result { + let mut session_ptr: *mut sys::OrtSession = std::ptr::null_mut(); + unsafe { + let api = self.env.env().api(); + + let model_data = model_bytes.as_ptr().cast::(); + let model_data_length = model_bytes.len(); + let status = api.CreateSessionFromArray.unwrap()( + self.env.env().env_ptr, + model_data, + model_data_length, + self.session_options_ptr, + &mut session_ptr, + ); + + status_to_result(status).map_err(OrtError::Session)?; + assert_null_pointer(status, "SessionStatus")?; + assert_not_null_pointer(session_ptr, "Session")?; + }; + let mut allocator_ptr: *mut sys::OrtAllocator = std::ptr::null_mut(); + let status = unsafe { + self.env.env().api().GetAllocatorWithDefaultOptions.unwrap()(&mut allocator_ptr) + }; + status_to_result(status).map_err(OrtError::Allocator)?; + assert_null_pointer(status, "SessionStatus")?; + assert_not_null_pointer(allocator_ptr, "Allocator")?; + + let memory_info = 
MemoryInfo::new(AllocatorType::Arena, MemType::Default, &self.env)?; + unsafe { + // Extract input and output properties + let num_input_nodes = + dangerous::extract_inputs_count(session_ptr, self.env.env.clone())?; + let num_output_nodes = + dangerous::extract_outputs_count(session_ptr, self.env.env.clone())?; + let inputs = (0..num_input_nodes) + .map(|i| { + dangerous::extract_input(session_ptr, allocator_ptr, i, self.env.env.clone()) + }) + .collect::>>()?; + let outputs = (0..num_output_nodes) + .map(|i| { + dangerous::extract_output(session_ptr, allocator_ptr, i, self.env.env.clone()) + }) + .collect::>>()?; + + Ok(Session { + env: self.env.env.clone(), + session_ptr, + allocator_ptr, + memory_info, + inputs, + outputs, + }) + } + } +} + +/// Type storing the session information, built from an [`Environment`](environment/struct.Environment.html) +#[derive(Debug)] +pub struct Session { + env: _Environment, + session_ptr: *mut sys::OrtSession, + allocator_ptr: *mut sys::OrtAllocator, + memory_info: MemoryInfo, + /// Information about the ONNX's inputs as stored in loaded file + pub inputs: Vec, + /// Information about the ONNX's outputs as stored in loaded file + pub outputs: Vec, +} + +/// Information about an ONNX's input as stored in loaded file +#[derive(Debug)] +pub struct Input { + /// Name of the input layer + pub name: String, + /// Type of the input layer's elements + pub input_type: TensorElementDataType, + /// Shape of the input layer + /// + /// C API uses a i64 for the dimensions. We use an unsigned of the same range of the positive values. + pub dimensions: Vec>, +} + +/// Information about an ONNX's output as stored in loaded file +#[derive(Debug)] +pub struct Output { + /// Name of the output layer + pub name: String, + /// Type of the output layer's elements + pub output_type: TensorElementDataType, + /// Shape of the output layer + /// + /// C API uses a i64 for the dimensions. We use an unsigned of the same range of the positive values. + pub dimensions: Vec>, +} + +impl Input { + /// Return an iterator over the shape elements of the input layer + /// + /// Note: The member [`Input::dimensions`](struct.Input.html#structfield.dimensions) + /// stores `u32` (since ONNX uses `i64` but which cannot be negative) so the + /// iterator converts to `usize`. + pub fn dimensions(&self) -> impl Iterator> + '_ { + self.dimensions.iter().map(|d| d.map(|d2| d2 as usize)) + } +} + +impl Output { + /// Return an iterator over the shape elements of the output layer + /// + /// Note: The member [`Output::dimensions`](struct.Output.html#structfield.dimensions) + /// stores `u32` (since ONNX uses `i64` but which cannot be negative) so the + /// iterator converts to `usize`. + pub fn dimensions(&self) -> impl Iterator> + '_ { + self.dimensions.iter().map(|d| d.map(|d2| d2 as usize)) + } +} + +impl Drop for Session { + #[tracing::instrument] + fn drop(&mut self) { + debug!("Dropping the session."); + if self.session_ptr.is_null() { + error!("Session pointer is null, not dropping."); + } else { + unsafe { self.env.env().api().ReleaseSession.unwrap()(self.session_ptr) }; + } + + self.session_ptr = std::ptr::null_mut(); + self.allocator_ptr = std::ptr::null_mut(); + } +} + +unsafe impl Send for Session {} + +unsafe impl Sync for Session {} + +impl Session { + /// Run the input data through the ONNX graph, performing inference. + /// + /// Note that ONNX models can have multiple inputs; a `Vec<_>` is thus + /// used for the input data here. 
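+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of a call site; `model.onnx`, the `(1, 4)` shape and the
+    /// `f32` element type are placeholders and must match the actual model:
+    ///
+    /// ```no_run
+    /// # use std::error::Error;
+    /// # use onnxruntime::{environment::Environment, tensor::construct::ConstructTensor, LoggingLevel};
+    /// # fn main() -> Result<(), Box<dyn Error>> {
+    /// # let environment = Environment::builder()
+    /// #     .with_name("example")
+    /// #     .with_log_level(LoggingLevel::Warning)
+    /// #     .build()?;
+    /// let mut session = environment
+    ///     .new_session_builder()?
+    ///     .with_model_from_file("model.onnx")?;
+    ///
+    /// // Inputs are ndarray arrays, boxed as `ConstructTensor` trait objects.
+    /// let input = ndarray::Array::<f32, _>::zeros((1, 4));
+    /// let mut inputs: Vec<Box<dyn ConstructTensor>> = vec![Box::new(input)];
+    ///
+    /// let outputs = session.run(&mut inputs[..])?;
+    /// // `float_array()` returns `None` if the first output is not an f32 tensor.
+    /// let first = outputs[0].float_array().expect("f32 output expected");
+    /// println!("output shape: {:?}", first.shape());
+    /// # Ok(())
+    /// # }
+    /// ```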
+ pub fn run<'input, 'output>( + &'output self, + mut input_arrays: impl AsMut<[Box]> + 'input, + ) -> Result>> { + let mut output_tensor_extractors_ptrs: Vec<*mut sys::OrtValue> = + vec![std::ptr::null_mut(); self.outputs.len()]; + + let output_names_cstring: Vec = self + .outputs + .iter() + .map(|output| output.name.clone()) + .map(|n| CString::new(n).unwrap()) + .collect(); + let output_names_ptr: Vec<*const i8> = output_names_cstring + .iter() + .map(|n| n.as_ptr().cast::()) + .collect(); + + let input_names_ptr: Vec<*const i8> = self + .inputs + .iter() + .map(|input| input.name.clone()) + .map(|n| CString::new(n).unwrap()) + .map(|n| n.into_raw() as *const i8) + .collect(); + + { + let memory_info = &self.memory_info; + + let allocator = self.allocator_ptr; + + let arr = input_arrays.as_mut(); + + let input_tensors = arr + .into_iter() + .map(|v| v.construct(memory_info, allocator)) + .collect::>>()?; + + let input_arrays_shapes: Vec> = + input_tensors.iter().map(|v| v.shape().to_vec()).collect(); + + self.validate_input_shapes(&input_arrays_shapes)?; + + // Build arguments to Run() + + let input_ort_values: Vec<*const sys::OrtValue> = input_tensors + .iter() + .map(|input_array_ort| input_array_ort.ptr() as *const sys::OrtValue) + .collect(); + + let run_options_ptr: *const sys::OrtRunOptions = std::ptr::null(); + + let status = unsafe { + self.env.env().api().Run.unwrap()( + self.session_ptr, + run_options_ptr, + input_names_ptr.as_ptr(), + input_ort_values.as_ptr(), + input_ort_values.len(), + output_names_ptr.as_ptr(), + output_names_ptr.len(), + output_tensor_extractors_ptrs.as_mut_ptr(), + ) + }; + status_to_result(status).map_err(OrtError::Run)?; + } + + let outputs: Result> = output_tensor_extractors_ptrs + .into_iter() + .map(|ptr| { + let mut tensor_info_ptr: *mut sys::OrtTensorTypeAndShapeInfo = std::ptr::null_mut(); + let status = unsafe { + self.env.env().api().GetTensorTypeAndShape.unwrap()( + ptr, + &mut tensor_info_ptr as _, + ) + }; + status_to_result(status).map_err(OrtError::GetTensorTypeAndShape)?; + let dims = unsafe { get_tensor_dimensions(tensor_info_ptr, self.env.clone()) }; + + unsafe { + self.env.env().api().ReleaseTensorTypeAndShapeInfo.unwrap()(tensor_info_ptr) + }; + let dims: Vec<_> = dims?.iter().map(|&n| n as usize).collect(); + + let mut output_tensor_extractor = + OrtOwnedTensorExtractor::new(dims, self.env.clone()); + output_tensor_extractor.tensor_ptr = ptr; + + output_tensor_extractor.extract() + }) + .collect(); + + // Reconvert to CString so drop impl is called and memory is freed + let cstrings: Result> = input_names_ptr + .into_iter() + .map(|p| { + assert_not_null_pointer(p, "i8 for CString")?; + unsafe { Ok(CString::from_raw(p as *mut i8)) } + }) + .collect(); + cstrings?; + + outputs? 
+ .into_iter() + .map(|v| OrtOutput::try_from(v)) + .collect() + } + + fn validate_input_shapes(&self, input_array_shapes: &[Vec]) -> Result<()> { + // ****************************************************************** + // FIXME: Properly handle errors here + // Make sure all dimensions match (except dynamic ones) + + // Verify length of inputs + if input_array_shapes.len() != self.inputs.len() { + error!( + "Non-matching number of inputs: {} (inference) vs {} (model)", + input_array_shapes.len(), + self.inputs.len() + ); + return Err(OrtError::NonMatchingDimensions( + NonMatchingDimensionsError::InputsCount { + inference_input_count: 0, + model_input_count: 0, + inference_input: input_array_shapes.to_vec(), + model_input: self + .inputs + .iter() + .map(|input| input.dimensions.clone()) + .collect(), + }, + )); + } + + // Verify length of each individual inputs + let inputs_different_length = input_array_shapes + .iter() + .zip(self.inputs.iter()) + .any(|(l, r)| l.len() != r.dimensions.len()); + if inputs_different_length { + error!( + "Different input lengths: {:?} vs {:?}", + self.inputs, input_array_shapes + ); + return Err(OrtError::NonMatchingDimensions( + NonMatchingDimensionsError::InputsLength { + inference_input: input_array_shapes + .iter() + .map(|input_array| input_array.to_vec()) + .collect(), + model_input: self + .inputs + .iter() + .map(|input| input.dimensions.clone()) + .collect(), + }, + )); + } + + // Verify shape of each individual inputs + let inputs_different_shape = + input_array_shapes + .iter() + .zip(self.inputs.iter()) + .any(|(l, r)| { + let l_shape = l; + let r_shape = r.dimensions.as_slice(); + l_shape.iter().zip(r_shape.iter()).any(|(l2, r2)| match r2 { + Some(r3) => *r3 as usize != *l2, + None => false, // None means dynamic size; in that case shape always match + }) + }); + if inputs_different_shape { + error!( + "Different input lengths: {:?} vs {:?}", + self.inputs, input_array_shapes + ); + return Err(OrtError::NonMatchingDimensions( + NonMatchingDimensionsError::InputsLength { + inference_input: input_array_shapes + .iter() + .map(|input_array| input_array.to_vec()) + .collect(), + model_input: self + .inputs + .iter() + .map(|input| input.dimensions.clone()) + .collect(), + }, + )); + } + + Ok(()) + } +} + +unsafe fn get_tensor_dimensions( + tensor_info_ptr: *const sys::OrtTensorTypeAndShapeInfo, + env: _Environment, +) -> Result> { + let mut num_dims = 0; + let status = env.env().api().GetDimensionsCount.unwrap()(tensor_info_ptr, &mut num_dims); + status_to_result(status).map_err(OrtError::GetDimensionsCount)?; + (num_dims != 0) + .then_some(()) + .ok_or(OrtError::InvalidDimensions)?; + + let mut node_dims: Vec = vec![0; num_dims as usize]; + let status = env.env().api().GetDimensions.unwrap()( + tensor_info_ptr, + node_dims.as_mut_ptr(), // FIXME: UB? + num_dims, + ); + status_to_result(status).map_err(OrtError::GetDimensions)?; + Ok(node_dims) +} + +/// This module contains dangerous functions working on raw pointers. +/// Those functions are only to be used from inside the +/// `SessionBuilder::with_model_from_file()` method. 
+mod dangerous { + use super::{ + assert_not_null_pointer, assert_null_pointer, char_p_to_string, get_tensor_dimensions, + status_to_result, sys, Input, OrtApiError, OrtError, Output, Result, TensorElementDataType, + }; + + use crate::environment::_Environment; + + pub(super) unsafe fn extract_inputs_count( + session_ptr: *mut sys::OrtSession, + env: _Environment, + ) -> Result { + let f = env.env().api().SessionGetInputCount.unwrap(); + extract_io_count(f, session_ptr) + } + + pub(super) unsafe fn extract_outputs_count( + session_ptr: *mut sys::OrtSession, + env: _Environment, + ) -> Result { + let f = env.env().api().SessionGetOutputCount.unwrap(); + extract_io_count(f, session_ptr) + } + + fn extract_io_count( + f: extern_system_fn! { unsafe fn(*const sys::OrtSession, *mut usize) -> *mut sys::OrtStatus }, + session_ptr: *mut sys::OrtSession, + ) -> Result { + let mut num_nodes: usize = 0; + let status = unsafe { f(session_ptr, &mut num_nodes) }; + status_to_result(status).map_err(OrtError::InOutCount)?; + assert_null_pointer(status, "SessionStatus")?; + (num_nodes != 0).then_some(()).ok_or_else(|| { + OrtError::InOutCount(OrtApiError::Msg("No nodes in model".to_owned())) + })?; + Ok(num_nodes) + } + + unsafe fn extract_input_name( + session_ptr: *mut sys::OrtSession, + allocator_ptr: *mut sys::OrtAllocator, + i: usize, + env: _Environment, + ) -> Result { + let f = env.env().api().SessionGetInputName.unwrap(); + extract_io_name(f, session_ptr, allocator_ptr, i, env) + } + + unsafe fn extract_output_name( + session_ptr: *mut sys::OrtSession, + allocator_ptr: *mut sys::OrtAllocator, + i: usize, + env: _Environment, + ) -> Result { + let f = env.env().api().SessionGetOutputName.unwrap(); + extract_io_name(f, session_ptr, allocator_ptr, i, env) + } + + fn extract_io_name( + f: extern_system_fn! { unsafe fn( + *const sys::OrtSession, + usize, + *mut sys::OrtAllocator, + *mut *mut i8, + ) -> *mut sys::OrtStatus }, + session_ptr: *mut sys::OrtSession, + allocator_ptr: *mut sys::OrtAllocator, + i: usize, + env: _Environment, + ) -> Result { + let mut name_bytes: *mut i8 = std::ptr::null_mut(); + + let status = unsafe { f(session_ptr, i, allocator_ptr, &mut name_bytes) }; + status_to_result(status).map_err(OrtError::InputName)?; + assert_not_null_pointer(name_bytes, "InputName")?; + + let name = char_p_to_string(name_bytes)?; + + unsafe { + env.env().api().AllocatorFree.unwrap()( + allocator_ptr, + name_bytes as *mut std::ffi::c_void, + ) + }; + + Ok(name) + } + + pub(super) unsafe fn extract_input( + session_ptr: *mut sys::OrtSession, + allocator_ptr: *mut sys::OrtAllocator, + i: usize, + env: _Environment, + ) -> Result { + let input_name = extract_input_name(session_ptr, allocator_ptr, i, env.clone())?; + let f = env.env().api().SessionGetInputTypeInfo.unwrap(); + let (input_type, dimensions) = extract_io(f, session_ptr, i, env)?; + Ok(Input { + name: input_name, + input_type, + dimensions, + }) + } + + pub(super) unsafe fn extract_output( + session_ptr: *mut sys::OrtSession, + allocator_ptr: *mut sys::OrtAllocator, + i: usize, + env: _Environment, + ) -> Result { + let output_name = extract_output_name(session_ptr, allocator_ptr, i, env.clone())?; + let f = env.env().api().SessionGetOutputTypeInfo.unwrap(); + let (output_type, dimensions) = extract_io(f, session_ptr, i, env)?; + Ok(Output { + name: output_name, + output_type, + dimensions, + }) + } + + fn extract_io( + f: extern_system_fn! 
{ unsafe fn(
+            *const sys::OrtSession,
+            usize,
+            *mut *mut sys::OrtTypeInfo,
+        ) -> *mut sys::OrtStatus },
+        session_ptr: *mut sys::OrtSession,
+        i: usize,
+        env: _Environment,
+    ) -> Result<(TensorElementDataType, Vec<Option<u32>>)> {
+        let mut typeinfo_ptr: *mut sys::OrtTypeInfo = std::ptr::null_mut();
+
+        let status = unsafe { f(session_ptr, i, &mut typeinfo_ptr) };
+        status_to_result(status).map_err(OrtError::GetTypeInfo)?;
+        assert_not_null_pointer(typeinfo_ptr, "TypeInfo")?;
+
+        let mut tensor_info_ptr: *const sys::OrtTensorTypeAndShapeInfo = std::ptr::null_mut();
+        let status = unsafe {
+            env.env().api().CastTypeInfoToTensorInfo.unwrap()(typeinfo_ptr, &mut tensor_info_ptr)
+        };
+        status_to_result(status).map_err(OrtError::CastTypeInfoToTensorInfo)?;
+        assert_not_null_pointer(tensor_info_ptr, "TensorInfo")?;
+
+        let mut type_sys = sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
+        let status = unsafe {
+            env.env().api().GetTensorElementType.unwrap()(tensor_info_ptr, &mut type_sys)
+        };
+        status_to_result(status).map_err(OrtError::TensorElementType)?;
+        (type_sys != sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED)
+            .then_some(())
+            .ok_or(OrtError::UndefinedTensorElementType)?;
+        // This transmute should be safe since the value is read from GetTensorElementType, which we must trust.
+        let io_type: TensorElementDataType = unsafe { std::mem::transmute(type_sys) };
+
+        // info!("{} : type={}", i, type_);
+
+        let node_dims = unsafe { get_tensor_dimensions(tensor_info_ptr, env.clone())? };
+
+        // for j in 0..num_dims {
+        //     info!("{} : dim {}={}", i, j, node_dims[j as usize]);
+        // }
+
+        unsafe { env.env().api().ReleaseTypeInfo.unwrap()(typeinfo_ptr) };
+
+        Ok((
+            io_type,
+            node_dims
+                .into_iter()
+                .map(|d| if d == -1 { None } else { Some(d as u32) })
+                .collect(),
+        ))
+    }
+}
diff --git a/rust/onnxruntime/src/tensor.rs b/rust/onnxruntime/src/tensor.rs
new file mode 100644
index 0000000000000..0f383f3ad59b6
--- /dev/null
+++ b/rust/onnxruntime/src/tensor.rs
@@ -0,0 +1,31 @@
+//! Module containing tensor types.
+//!
+//! Two main types of tensors are available.
+//!
+//! The first one, [`Tensor`](struct.Tensor.html),
+//! is an _owned_ tensor that is backed by [`ndarray`](https://crates.io/crates/ndarray).
+//! This kind of tensor is used to pass input data for the inference.
+//!
+//! The second one, [`OrtOwnedTensor`](struct.OrtOwnedTensor.html), is used
+//! internally to pass to the ONNX Runtime inference execution to place
+//! its output values. It is built using an [`OrtOwnedTensorExtractor`](struct.OrtOwnedTensorExtractor.html)
+//! following the builder pattern.
+//!
+//! Once "extracted" from the runtime environment, this tensor will contain an
+//! [`ndarray::ArrayView`](https://docs.rs/ndarray/latest/ndarray/type.ArrayView.html)
+//! containing _a view_ of the data. When going out of scope, this tensor will free the required
+//! memory on the C side.
+//!
+//! **NOTE**: Tensors are not meant to be built directly. When performing inference,
+//! the [`Session::run()`](../session/struct.Session.html#method.run) method takes
+//! an `ndarray::Array` as input (taking ownership of it) and will convert it internally
+//! to a [`Tensor`](struct.Tensor.html). After inference, an [`OrtOwnedTensor`](struct.OrtOwnedTensor.html)
+//! will be returned by the method which can be derefed into its internal
+//! [`ndarray::ArrayView`](https://docs.rs/ndarray/latest/ndarray/type.ArrayView.html).
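+//!
+//! A short sketch of the output side (assuming `outputs` was returned by
+//! [`Session::run()`](../session/struct.Session.html#method.run) and the first
+//! output is an `f32` tensor; the variable names are illustrative):
+//!
+//! ```ignore
+//! // `float_array()` returns `None` if the output is not an f32 tensor.
+//! let view = outputs[0].float_array().expect("f32 output expected");
+//! // `view` derefs into an `ndarray::ArrayView<f32, IxDyn>`, so the usual
+//! // ndarray methods are available without copying the data.
+//! let total: f32 = view.iter().sum();
+//! ```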
+
+pub mod construct;
+pub mod ndarray_tensor;
+pub mod ort_input_tensor;
+pub mod ort_output_tensor;
+
+pub use ort_output_tensor::{OrtOutputTensor, WithOutputTensor};
diff --git a/rust/onnxruntime/src/tensor/construct.rs b/rust/onnxruntime/src/tensor/construct.rs
new file mode 100644
index 0000000000000..97f70b131ea0a
--- /dev/null
+++ b/rust/onnxruntime/src/tensor/construct.rs
@@ -0,0 +1,34 @@
+//! Module containing the `ConstructTensor` trait used to convert inputs for `Session::run`.
+
+use crate::{memory::MemoryInfo, OrtError};
+use onnxruntime_sys::{OrtAllocator, OrtValue};
+use std::fmt::Debug;
+
+/// The Input type for Rust onnxruntime Session::run
+pub trait ConstructTensor: Debug {
+    /// Construct an OrtTensor input using the `MemoryInfo` and a raw pointer to the `OrtAllocator`.
+    fn construct<'a>(
+        &'a mut self,
+        memory_info: &MemoryInfo,
+        allocator: *mut OrtAllocator,
+    ) -> Result<Box<dyn InputTensor + 'a>, OrtError>;
+}
+
+/// Allows the return value of ConstructTensor::construct
+/// to be generic.
+pub trait InputTensor {
+    /// The input tensor's shape
+    fn shape(&self) -> &[usize];
+
+    /// The input tensor's ptr
+    fn ptr(&self) -> *mut OrtValue;
+}
+
+impl<'a, T> From<T> for Box<dyn ConstructTensor + 'a>
+where
+    T: ConstructTensor + 'a,
+{
+    fn from(other: T) -> Self {
+        Box::new(other)
+    }
+}
diff --git a/rust/onnxruntime/src/tensor/ndarray_tensor.rs b/rust/onnxruntime/src/tensor/ndarray_tensor.rs
new file mode 100644
index 0000000000000..dea8d161b243b
--- /dev/null
+++ b/rust/onnxruntime/src/tensor/ndarray_tensor.rs
@@ -0,0 +1,210 @@
+//! Module containing a tensor trait extending [`ndarray::ArrayBase`](https://docs.rs/ndarray/latest/ndarray/struct.ArrayBase.html)
+
+use ndarray::{Array, ArrayBase};
+
+/// Trait extending [`ndarray::ArrayBase`](https://docs.rs/ndarray/latest/ndarray/struct.ArrayBase.html)
+/// with useful tensor operations.
+///
+/// # Generic
+///
+/// The trait is generic over:
+/// * `S`: [`ndarray::ArrayBase`](https://docs.rs/ndarray/latest/ndarray/struct.ArrayBase.html)'s data container
+/// * `T`: Type contained inside the tensor (for example `f32`)
+/// * `D`: Tensor's dimension ([`ndarray::Dimension`](https://docs.rs/ndarray/latest/ndarray/trait.Dimension.html))
+pub trait NdArrayTensor<S, T, D> {
+    /// Calculate the [softmax](https://en.wikipedia.org/wiki/Softmax_function) of the tensor along a given axis
+    ///
+    /// # Trait Bounds
+    ///
+    /// The function is generic and thus has some trait bounds:
+    /// * `D: ndarray::RemoveAxis`: The summation over an axis reduces the dimension of the tensor. A 0-D tensor thus
+    ///   cannot have a softmax calculated.
+    /// * `S: ndarray::RawData + ndarray::Data + ndarray::RawData<Elem = T>`: The storage of the tensor can be an owned
+    ///   array ([`ndarray::Array`](https://docs.rs/ndarray/latest/ndarray/type.Array.html)) or an array view
+    ///   ([`ndarray::ArrayView`](https://docs.rs/ndarray/latest/ndarray/type.ArrayView.html)).
+    /// * `<S as ndarray::RawData>::Elem: std::clone::Clone`: The elements of the tensor must be `Clone`.
+    /// * `T: ndarray::NdFloat + std::ops::SubAssign + std::ops::DivAssign`: The elements of the tensor must be workable
+    ///   as floats and must support `-=` and `/=` operations.
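+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of how the trait can be used (the input values and the
+    /// `1.0e-6` tolerance are illustrative only; the path assumes the default
+    /// module layout of this crate):
+    ///
+    /// ```no_run
+    /// use ndarray::{arr1, Axis};
+    /// use onnxruntime::tensor::ndarray_tensor::NdArrayTensor;
+    ///
+    /// let a = arr1(&[1.0_f32, 2.0, 3.0]);
+    /// let s = a.softmax(Axis(0));
+    /// // Softmax entries are positive and sum to (approximately) 1.
+    /// assert!((s.sum() - 1.0).abs() < 1.0e-6);
+    /// ```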
+ fn softmax(&self, axis: ndarray::Axis) -> Array + where + D: ndarray::RemoveAxis, + S: ndarray::Data + ndarray::RawData, + ::Elem: std::clone::Clone, + T: ndarray::NdFloat + std::ops::SubAssign + std::ops::DivAssign; +} + +impl NdArrayTensor for ArrayBase +where + D: ndarray::RemoveAxis, + S: ndarray::Data + ndarray::RawData, + ::Elem: std::clone::Clone, + T: ndarray::NdFloat + std::ops::SubAssign + std::ops::DivAssign, +{ + fn softmax(&self, axis: ndarray::Axis) -> Array { + let mut new_array: Array = self.to_owned(); + // FIXME: Change to non-overflowing formula + // e = np.exp(A - np.sum(A, axis=1, keepdims=True)) + // np.exp(a) / np.sum(np.exp(a)) + new_array.map_inplace(|v| *v = v.exp()); + let sum = new_array.sum_axis(axis).insert_axis(axis); + new_array /= ∑ + + new_array + } +} + +#[cfg(test)] +mod tests { + use super::*; + use ndarray::{arr1, arr2, arr3}; + use test_log::test; + + #[test] + fn softmax_1d() { + let array = arr1(&[1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0]); + + let expected_softmax = arr1(&[ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ]); + + let softmax = array.softmax(ndarray::Axis(0)); + + assert_eq!(softmax.shape(), expected_softmax.shape()); + + let diff = softmax - expected_softmax; + + assert!(diff.iter().all(|d| d.abs() < 1.0e-7)); + } + + #[test] + fn softmax_2d() { + let array = arr2(&[ + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + ]); + + let expected_softmax = arr2(&[ + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + ]); + + let softmax = array.softmax(ndarray::Axis(1)); + + assert_eq!(softmax.shape(), expected_softmax.shape()); + + let diff = softmax - expected_softmax; + + assert!(diff.iter().all(|d| d.abs() < 1.0e-7)); + } + + #[test] + fn softmax_3d() { + let array = arr3(&[ + [ + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + ], + [ + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + ], + [ + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + [1.0_f32, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0], + ], + ]); + + let expected_softmax = arr3(&[ + [ + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + ], + [ + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + ], + [ + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + [ + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + 0.474_833, + 0.023_640_54, + 0.064_261_66, + 0.174_681_3, + ], + ], + ]); + + let softmax = array.softmax(ndarray::Axis(2)); + + assert_eq!(softmax.shape(), expected_softmax.shape()); + + let diff = softmax - expected_softmax; + + assert!(diff.iter().all(|d| d.abs() < 1.0e-7)); + } +} diff --git a/rust/onnxruntime/src/tensor/ort_input_tensor.rs b/rust/onnxruntime/src/tensor/ort_input_tensor.rs new file mode 100644 index 0000000000000..f2cf0ee8a1d4a --- /dev/null +++ b/rust/onnxruntime/src/tensor/ort_input_tensor.rs @@ -0,0 +1,325 @@ 
+//! Module containing tensor with memory owned by Rust + +use super::construct::{ConstructTensor, InputTensor}; +use crate::{ + environment::ENV, + error::{assert_not_null_pointer, call_ort, status_to_result}, + memory::MemoryInfo, + OrtError, Result, TensorElementDataType, TypeToTensorElementDataType, +}; +use ndarray::{Array, Dimension}; +use onnxruntime_sys as sys; +use std::{ffi, fmt::Debug}; +use sys::OrtAllocator; +use tracing::{debug, error}; + +/// An Input tensor. +/// +/// This ties the lifetime of T to the OrtValue; it is used to copy an +/// [`ndarray::Array`](https://docs.rs/ndarray/latest/ndarray/type.Array.html) to the runtime's memory. +/// +/// **NOTE**: The type is not meant to be used directly, use an [`ndarray::Array`](https://docs.rs/ndarray/latest/ndarray/type.Array.html) +/// instead. +#[derive(Debug)] +pub struct OrtInputTensor +where + T: Debug, +{ + pub(crate) c_ptr: *mut sys::OrtValue, + pub(crate) shape: Vec, + #[allow(dead_code)] + item: T, +} + +impl OrtInputTensor +where + T: Debug, +{ + /// The shape of the OrtTensor. + pub fn shape(&self) -> &[usize] { + &self.shape + } +} + +impl ConstructTensor for Array +where + T: TypeToTensorElementDataType + Debug, + D: Dimension, +{ + fn construct<'a>( + &'a mut self, + memory_info: &MemoryInfo, + allocator_ptr: *mut OrtAllocator, + ) -> Result> { + // where onnxruntime will write the tensor data to + let mut tensor_ptr: *mut sys::OrtValue = std::ptr::null_mut(); + let tensor_ptr_ptr: *mut *mut sys::OrtValue = &mut tensor_ptr; + + let sh = self.shape().to_vec(); + + let shape: Vec = self.shape().iter().map(|d: &usize| *d as i64).collect(); + let shape_ptr: *const i64 = shape.as_ptr(); + let shape_len = self.shape().len(); + + match T::tensor_element_data_type() { + TensorElementDataType::Float + | TensorElementDataType::Uint8 + | TensorElementDataType::Int8 + | TensorElementDataType::Uint16 + | TensorElementDataType::Int16 + | TensorElementDataType::Int32 + | TensorElementDataType::Int64 + | TensorElementDataType::Double + | TensorElementDataType::Uint32 + | TensorElementDataType::Uint64 => { + let buffer_size = self.len() * std::mem::size_of::(); + + // primitive data is already suitably laid out in memory; provide it to + // onnxruntime as is + let tensor_values_ptr: *mut std::ffi::c_void = + self.as_mut_ptr().cast::(); + + assert_not_null_pointer(tensor_values_ptr, "TensorValues")?; + + unsafe { + call_ort(|ort| { + ort.CreateTensorWithDataAsOrtValue.unwrap()( + memory_info.ptr, + tensor_values_ptr, + buffer_size, + shape_ptr, + shape_len, + T::tensor_element_data_type().into(), + tensor_ptr_ptr, + ) + }) + } + .map_err(OrtError::CreateTensorWithData)?; + assert_not_null_pointer(tensor_ptr, "Tensor")?; + + let mut is_tensor = 0; + let status = unsafe { + ENV.get().unwrap().lock().unwrap().api().IsTensor.unwrap()( + tensor_ptr, + &mut is_tensor, + ) + }; + status_to_result(status).map_err(OrtError::IsTensor)?; + } + TensorElementDataType::String => { + // create tensor without data -- data is filled in later + unsafe { + call_ort(|ort| { + ort.CreateTensorAsOrtValue.unwrap()( + allocator_ptr, + shape_ptr, + shape_len, + T::tensor_element_data_type().into(), + tensor_ptr_ptr, + ) + }) + } + .map_err(OrtError::CreateTensor)?; + + // create null-terminated copies of each string, as per `FillStringTensor` docs + let null_terminated_copies: Vec = self + .iter() + .map(|elt| { + let slice = elt + .try_utf8_bytes() + .expect("String data type must provide utf8 bytes"); + ffi::CString::new(slice) + }) + .collect::, _>>() 
+ .map_err(OrtError::CStringNulError)?; + + let string_pointers = null_terminated_copies + .iter() + .map(|cstring| cstring.as_ptr()) + .collect::>(); + + unsafe { + call_ort(|ort| { + ort.FillStringTensor.unwrap()( + tensor_ptr, + string_pointers.as_ptr(), + string_pointers.len(), + ) + }) + } + .map_err(OrtError::FillStringTensor)?; + } + } + + assert_not_null_pointer(tensor_ptr, "Tensor")?; + + Ok(Box::new(OrtInputTensor { + c_ptr: tensor_ptr, + shape: sh, + item: self, + })) + } +} + +impl Drop for OrtInputTensor +where + T: Debug, +{ + #[tracing::instrument] + fn drop(&mut self) { + // We need to let the C part free + debug!("Dropping Tensor."); + if self.c_ptr.is_null() { + error!("Null pointer, not calling free."); + } else { + unsafe { + ENV.get() + .unwrap() + .lock() + .unwrap() + .api() + .ReleaseValue + .unwrap()(self.c_ptr) + } + } + + self.c_ptr = std::ptr::null_mut(); + } +} + +impl InputTensor for OrtInputTensor<&mut Array> +where + T: TypeToTensorElementDataType + Debug, + D: Dimension, +{ + fn ptr(&self) -> *mut sys::OrtValue { + self.c_ptr + } + + fn shape(&self) -> &[usize] { + &self.shape + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + environment::{tests::ONNX_RUNTIME_LIBRARY_PATH, Environment}, + AllocatorType, LoggingLevel, MemType, + }; + use ndarray::{arr0, arr1, arr2, arr3}; + use once_cell::sync::Lazy; + use std::env::var; + use test_log::test; + + static ENV: Lazy = Lazy::new(|| { + let path = var(ONNX_RUNTIME_LIBRARY_PATH).ok(); + + let builder = Environment::builder() + .with_name("test") + .with_log_level(LoggingLevel::Warning); + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + builder.build().unwrap() + }); + + #[test] + fn orttensor_from_array_0d_i32() { + let env = &*ENV; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, env).unwrap(); + let mut array = arr0::(123); + let tensor = array + .construct(&memory_info, ort_default_allocator()) + .unwrap(); + let expected_shape: &[usize] = &[]; + assert_eq!(tensor.shape(), expected_shape); + } + + #[test] + fn orttensor_from_array_1d_i32() { + let env = &*ENV; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, env).unwrap(); + let mut array = arr1(&[1_i32, 2, 3, 4, 5, 6]); + let tensor = array + .construct(&memory_info, ort_default_allocator()) + .unwrap(); + let expected_shape: &[usize] = &[6]; + assert_eq!(tensor.shape(), expected_shape); + } + + #[test] + fn orttensor_from_array_2d_i32() { + let env = &*ENV; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, env).unwrap(); + let mut array = arr2(&[[1_i32, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]); + let tensor = array + .construct(&memory_info, ort_default_allocator()) + .unwrap(); + assert_eq!(tensor.shape(), &[2, 6]); + } + + #[test] + fn orttensor_from_array_3d_i32() { + let env = &*ENV; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, env).unwrap(); + let mut array = arr3(&[ + [[1_i32, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + [[13, 14, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24]], + [[25, 26, 27, 28, 29, 30], [31, 32, 33, 34, 35, 36]], + ]); + let tensor = array + .construct(&memory_info, ort_default_allocator()) + .unwrap(); + assert_eq!(tensor.shape(), &[3, 2, 6]); + } + + #[test] + fn orttensor_from_array_1d_string() { + let env = &*ENV; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, env).unwrap(); + let mut array = arr1(&[ + 
String::from("foo"), + String::from("bar"), + String::from("baz"), + ]); + let tensor = array + .construct(&memory_info, ort_default_allocator()) + .unwrap(); + assert_eq!(tensor.shape(), &[3]); + } + + #[test] + fn orttensor_from_array_3d_str() { + let env = &*ENV; + + let memory_info = MemoryInfo::new(AllocatorType::Arena, MemType::Default, env).unwrap(); + let mut array = arr3(&[ + [["1", "2", "3"], ["4", "5", "6"]], + [["7", "8", "9"], ["10", "11", "12"]], + ]); + let tensor = array + .construct(&memory_info, ort_default_allocator()) + .unwrap(); + assert_eq!(tensor.shape(), &[2, 2, 3]); + } + + fn ort_default_allocator() -> *mut sys::OrtAllocator { + let mut allocator_ptr: *mut sys::OrtAllocator = std::ptr::null_mut(); + unsafe { + // this default non-arena allocator doesn't need to be deallocated + call_ort(|ort| ort.GetAllocatorWithDefaultOptions.unwrap()(&mut allocator_ptr)) + } + .unwrap(); + allocator_ptr + } +} diff --git a/rust/onnxruntime/src/tensor/ort_output_tensor.rs b/rust/onnxruntime/src/tensor/ort_output_tensor.rs new file mode 100644 index 0000000000000..5176a58c423ea --- /dev/null +++ b/rust/onnxruntime/src/tensor/ort_output_tensor.rs @@ -0,0 +1,347 @@ +//! Module containing tensor with memory owned by the ONNX Runtime + +use crate::{ + environment::{_Environment, ENV}, + error::status_to_result, + OrtError, Result, TypeToTensorElementDataType, +}; +use ndarray::ArrayView; +use onnxruntime_sys as sys; + +use std::{convert::TryFrom, fmt::Debug}; +use tracing::debug; + +/// Tensor containing data owned by the ONNX Runtime C library, used to return values from inference. +/// +/// This tensor type is returned by the [`Session::run()`](../session/struct.Session.html#method.run) method. +/// It is not meant to be created directly. +#[derive(Debug)] +pub struct OrtOutputTensor { + pub(crate) tensor_ptr: *mut sys::OrtValue, + pub(crate) shape: Vec, + env: _Environment, +} + +#[derive(Debug)] +pub(crate) struct OrtOwnedTensorExtractor { + pub(crate) tensor_ptr: *mut sys::OrtValue, + pub(crate) shape: Vec, + env: _Environment, +} + +impl OrtOwnedTensorExtractor { + pub(crate) fn new(shape: Vec, env: _Environment) -> OrtOwnedTensorExtractor { + OrtOwnedTensorExtractor { + tensor_ptr: std::ptr::null_mut(), + shape, + env, + } + } + + pub(crate) fn extract(self) -> Result { + // Note: Both tensor and array will point to the same data, nothing is copied. + // As such, there is no need too free the pointer used to create the ArrayView. + + assert_ne!(self.tensor_ptr, std::ptr::null_mut()); + + let mut is_tensor = 0; + let status = + unsafe { self.env.env().api().IsTensor.unwrap()(self.tensor_ptr, &mut is_tensor) }; + status_to_result(status).map_err(OrtError::IsTensor)?; + (is_tensor == 1) + .then_some(()) + .ok_or(OrtError::IsTensorCheck)?; + + Ok(OrtOutputTensor { + tensor_ptr: self.tensor_ptr, + shape: self.shape, + env: self.env, + }) + } +} + +impl Drop for OrtOutputTensor { + #[tracing::instrument] + fn drop(&mut self) { + debug!("Dropping OrtOwnedTensor."); + unsafe { self.env.env().api().ReleaseValue.unwrap()(self.tensor_ptr) } + + self.tensor_ptr = std::ptr::null_mut(); + } +} + +/// An Ouput tensor with the ptr and the item that will copy from the ptr. 
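+///
+/// Dereferencing gives the borrowed view directly; a sketch (assuming `t` is a
+/// `WithOutputTensor<'_, f32>` taken out of an `OrtOutput::Float`):
+///
+/// ```ignore
+/// // Deref target is `ndarray::ArrayView<'a, f32, IxDyn>`.
+/// let max = t.iter().copied().fold(f32::NEG_INFINITY, f32::max);
+/// println!("max value: {max}");
+/// ```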
+#[derive(Debug)] +pub struct WithOutputTensor<'a, T> { + #[allow(dead_code)] + pub(crate) tensor: OrtOutputTensor, + item: ArrayView<'a, T, ndarray::IxDyn>, +} + +impl<'a, T> std::ops::Deref for WithOutputTensor<'a, T> { + type Target = ArrayView<'a, T, ndarray::IxDyn>; + + fn deref(&self) -> &Self::Target { + &self.item + } +} + +impl<'a, T> TryFrom for WithOutputTensor<'a, T> +where + T: TypeToTensorElementDataType, +{ + type Error = OrtError; + + fn try_from(value: OrtOutputTensor) -> Result { + // Get pointer to output tensor float values + let mut output_array_ptr: *mut T = std::ptr::null_mut(); + let output_array_ptr_ptr: *mut *mut T = &mut output_array_ptr; + let output_array_ptr_ptr_void: *mut *mut std::ffi::c_void = + output_array_ptr_ptr.cast::<*mut std::ffi::c_void>(); + let status = unsafe { + ENV.get() + .unwrap() + .lock() + .unwrap() + .api() + .GetTensorMutableData + .unwrap()(value.tensor_ptr, output_array_ptr_ptr_void) + }; + status_to_result(status).map_err(OrtError::IsTensor)?; + assert_ne!(output_array_ptr, std::ptr::null_mut()); + + let array_view = + unsafe { ArrayView::from_shape_ptr(ndarray::IxDyn(&value.shape), output_array_ptr) }; + + Ok(WithOutputTensor { + tensor: value, + item: array_view, + }) + } +} + +/// The onnxruntime Run output type. +pub enum OrtOutput<'a> { + /// Tensor of f32s + Float(WithOutputTensor<'a, f32>), + /// Tensor of f64s + Double(WithOutputTensor<'a, f64>), + /// Tensor of u8s + UInt8(WithOutputTensor<'a, u8>), + /// Tensor of u16s + UInt16(WithOutputTensor<'a, u16>), + /// Tensor of u32s + UInt32(WithOutputTensor<'a, u32>), + /// Tensor of u64s + UInt64(WithOutputTensor<'a, u64>), + /// Tensor of i8s + Int8(WithOutputTensor<'a, i8>), + /// Tensor of i16s + Int16(WithOutputTensor<'a, i16>), + /// Tensor of i32s + Int32(WithOutputTensor<'a, i32>), + /// Tensor of i64s + Int64(WithOutputTensor<'a, i64>), + /// Tensor of Strings + String(WithOutputTensor<'a, String>), +} + +impl<'a> OrtOutput<'a> { + /// Return `WithOutputTensor<'a, f32>` which derefs into an `ArrayView`. + pub fn float_array(&self) -> Option<&WithOutputTensor<'a, f32>> { + if let Self::Float(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, f64>` which derefs into an `ArrayView`. + pub fn double_array(&self) -> Option<&WithOutputTensor<'a, f64>> { + if let Self::Double(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, u8>` which derefs into an `ArrayView`. + pub fn uint8_array(&self) -> Option<&WithOutputTensor<'a, u8>> { + if let Self::UInt8(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, u16>` which derefs into an `ArrayView`. + pub fn uint16_array(&self) -> Option<&WithOutputTensor<'a, u16>> { + if let Self::UInt16(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, u32>` which derefs into an `ArrayView`. + pub fn uint32_array(&self) -> Option<&WithOutputTensor<'a, u32>> { + if let Self::UInt32(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, u64>` which derefs into an `ArrayView`. + pub fn uint64_array(&self) -> Option<&WithOutputTensor<'a, u64>> { + if let Self::UInt64(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, i8>` which derefs into an `ArrayView`. 
+ pub fn int8_array(&self) -> Option<&WithOutputTensor<'a, i8>> { + if let Self::Int8(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, i16>` which derefs into an `ArrayView`. + pub fn int16_array(&self) -> Option<&WithOutputTensor<'a, i16>> { + if let Self::Int16(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, i32>` which derefs into an `ArrayView`. + pub fn int32_array(&self) -> Option<&WithOutputTensor<'a, i32>> { + if let Self::Int32(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, i64>` which derefs into an `ArrayView`. + pub fn int64_array(&self) -> Option<&WithOutputTensor<'a, i64>> { + if let Self::Int64(item) = self { + Some(item) + } else { + None + } + } + + /// Return `WithOutputTensor<'a, String>` which derefs into an `ArrayView`. + pub fn string_array(&self) -> Option<&WithOutputTensor<'a, String>> { + if let Self::String(item) = self { + Some(item) + } else { + None + } + } +} + +impl<'a> TryFrom for OrtOutput<'a> { + type Error = OrtError; + + fn try_from(value: OrtOutputTensor) -> Result> { + unsafe { + let mut shape_info = std::ptr::null_mut(); + + let status = ENV + .get() + .unwrap() + .lock() + .unwrap() + .api() + .GetTensorTypeAndShape + .unwrap()(value.tensor_ptr, &mut shape_info); + + status_to_result(status).map_err(OrtError::IsTensor)?; + + assert_ne!(shape_info, std::ptr::null_mut()); + + let mut element_type = + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; + + let status = ENV + .get() + .unwrap() + .lock() + .unwrap() + .api() + .GetTensorElementType + .unwrap()(shape_info, &mut element_type); + + status_to_result(status).map_err(OrtError::IsTensor)?; + + ENV.get() + .unwrap() + .lock() + .unwrap() + .api() + .ReleaseTensorTypeAndShapeInfo + .unwrap()(shape_info); + + match element_type { + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED => { + unimplemented!() + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT => { + WithOutputTensor::try_from(value).map(OrtOutput::Float) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8 => { + WithOutputTensor::try_from(value).map(OrtOutput::UInt8) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8 => { + WithOutputTensor::try_from(value).map(OrtOutput::Int8) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 => { + WithOutputTensor::try_from(value).map(OrtOutput::UInt16) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 => { + WithOutputTensor::try_from(value).map(OrtOutput::Int16) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 => { + WithOutputTensor::try_from(value).map(OrtOutput::Int32) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 => { + WithOutputTensor::try_from(value).map(OrtOutput::Int64) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING => { + WithOutputTensor::try_from(value).map(OrtOutput::String) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL => { + unimplemented!() + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 => { + unimplemented!() + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE => { + WithOutputTensor::try_from(value).map(OrtOutput::Double) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 => { + 
WithOutputTensor::try_from(value).map(OrtOutput::UInt32) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 => { + WithOutputTensor::try_from(value).map(OrtOutput::UInt64) + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 => { + unimplemented!() + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 => { + unimplemented!() + } + sys::ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 => { + unimplemented!() + } + } + } + } +} diff --git a/rust/onnxruntime/tests/data/mnist_5.jpg b/rust/onnxruntime/tests/data/mnist_5.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2216a276c4c0a687b9c437791b555e9a2132e596 GIT binary patch literal 555 zcmV+`0@VHg*#F=F5K2Z#MgRc;000310RRC1+Wgv=4-_35A08bV92_7dE+-%&EF&BoC^soAFflYVG#@89JvcHvE;BST|G)qX2ml-a z9036l0RO}Q9{>OW1pxs80RaI300000000010s{mE1_uZU3Jd?l0JRVR0s#X90t5pE z1q1{D00Dgg0s{a95d{(Xb($mz{*4NnC+Tr5k=I%|A#=SW> zC$F_=-S|txc2-lC@!p>`@>K-wq-16488Ciq tjz=RkOX6RMG_6+mO49UcrMj}wti_t%&G+sIJhp$k(;tm+)KNth|Jf6c;EwfyL)km;uI*fH~+(3YoC*s zBoA}uo4seU6Q!;yhYltI0{{SY1$k*r*z3-J7YGSeKm>ThFD(Mcd7oXZb-jCu8JnMEZFIf$COx~={stW9Y_}s@v&3!X4A{FI zBk2A?@H)oxI)>d5RQXdu&)wHEzy45^a~d%bso=bqm$*|ad}gZniK2al5d9gvCKz^+ zBiBFmjG z`HP!l72v>7zJ}?V^#=I1=>kdp!?~Qjjza_L~P)Bx3@Sx9lU4CE< zC5ib3^(i!M7W5@mDUbQtO&(8Ox`NsK=X^=pT`8PzfeOprKA5dXeDoy!@Ce88L(rSVdV)Pel?Ka{H$<|xb zXf5nJ`n4123r*k)HuQn0D+@s$i=v2`tcaS7JM?4tHH^#12ry{ehayG^C}t_g{Y$ui z>#HvOHjsqw{&5O~bSZ}icd{6Pn!ZJhmQf^HQBY_5hZ;Wu-PVp$ru$BNbqV3ilBN zbALkf!h5XeJ$S1-OfRe+)V;YeWbE-pWC4XQktvgW9hgbe5zm0kfrwE&QR&uuuAx>A zN@~4lQ2V&tAE4o-lP8n@CC`N_Se0KTC3r)L!Qw%)-j^w33vUub;r=&Gpf$@B04G!- z5bPvD_lKy6IK~Z4LO7^^JZOa?NGpCe5tHc$&0;Cd%NR6LF{IfJVOI*T3KL}j$L^}p z=T=I==87SCT*M4zH|*+n|AeO1^92_;)AFE+Hf$5+bFLaCBwtjL{geD1sQq5*U>?u( z7gpG&6y9BeIwM(;zKCRlU#!J5sC1l*xy4fZoS|YlLBoNpcsQTaFE}~8q6}hZ$7mhN zfx0y&>D|QHMTRDZ`fhYi`lQ+#!;3{NC1h(nxw)x+-5{f8@-r=JQ>s!~X2ja$Ajq9F zm@JG)=F>ZM@>^ixx*fsLc!Q%Q9uJNQ3d=5&3^cgIHJ0QHSyku(e3iDKbi*qsY)|R{ zj7dCzwfDyi%_9q<@KMT>5~B5#aYSPkL)$(-LW`&0^TMT6i}UX#VpK=8FKB7H&%XRS zPCrUNc3>vR%!MB!ex3`^G6F216`WxQmf+-xH`-2G$`5l%zej+AOx_nc4AAf#Si->U z5EYP4WpFZV{|hQzr}|=tw#S<8hLMe?Ff#?wHd6MSfT|(jn1$DldG}k}S;U2c)5Di|89Aml~OgD>--xofq zsypNqOy?G`=g6k2sYuOptH-kT+k7+^5F~QOD9aBa#zND5H&ZWo3HQc}`CTiL4wFHd zAG0&%vrV;^h)=R^5G0oB$iPz!c`HVg%9CG->!@&Du;P(jQej4NksG0qL=GC=;i`%3*ZPEq)sPvRX!|L({2a_LRfdf8=%R+KJfKEwX6>bgl!GXMef62xjfGKQ{LU z-+uz?R@Eam0%%4D(QZqG7EYWAI&O!aLk#8ehZx%00q3EXjNXiEgL)3YKFeOi*Vc zFE$pHX!jccET1tg3MM1v!!eXl$=j{_T2q}HROn(%Q@co0Q^n$CgZe{datbC$)+-imaV0yau008_%gnHFk8huqj3V zTY>u2IP%H_?dSUg#r#=&xjCI*h3cI%_4wzM#rppchF%yLdF979yK~9fy2-i%2-t^k zbiZHfcjH|IWDOBp?O-$+K}I@P+|gs4id) zm`e+%&bbbF6nrY4`J?RXYpMbahfVtb)M6i~`cOPHj*V`jIg zdCCD(?2;{D^+}Pc@Z_|xf|CA;5}F<9Tj0`;njOA~!fV%BR{cF^lHUzbsn%62$#GY- z5z##~l_TOgl)`08s=*gC?Cd>WC3cVIK7|xvgx>De)j=QS!44)!vMxVNY_z`=1zYfq zwaT&cr14r-5L!$Ie}d6;@%cxucPJ-(XG+)>B*x!irYPUZ?Xz8LOnLk1kD?EJY7k<( z4s|u`P|u|438vd`jCn=uIYr~S1|T<&2sx|fokVFO&zL%4V`ohN(d zrW`p%vbYi=vUJ|Dn!iG|M+Q)W3`J#H{DKV+@s{kPOLf%>f69m8a_@wqLAl^1#>KTq zzxXxgh1iEbMRRsjJNANv8T*_GcAH1PJUN#m^y(E}kAq(Do5S&WkUt^}atE8E^w)?A`4ROQEnu5SNwy!MB69S;u#Wsh_xT`A?NT9E2IO<5?HzA-M8TE>w+aNlo{}WRk55g^|81rbz$z@0{kG{2;Ybm(5n?*r=e7=T4fS4f#v5{eXHTum=#f-@(;jU z1=wOA@KmaQ!?`H<0r1`8uJOZ1@J~Jp1fVs%3E9ODXj|5eeGcYU)W6*P9(n^<08)`NsN>?OeGD?gG63*MY+AWeInAZeE+g~E2o^g$* z(tKr+8>^JZEn?MyxxT1yK~M)W&u@)dxoemo%-RAIs@gBX+=ddSm-}oM-dqknt}>X_ z&1%E6_uT*W>WOPsxZ<-)<9H1p?=0#g-5ZI|3#&cqVE(FLY59_lg< z`mmDttSw`6`>UBD^?OP5a+{(?c%6f7+&Sx%23wMuKJ0D9|FW4uzB*4^B(Ki+CgpH- 
zS|KanxFQiO@x7YBEESxw;duS+7Q7zbq1OsMfQoS_?7gW8oN2p=D8 zSt2SHF*It$HWf;5uylT5S4w`(f)a&1fl%Tp(Ok+F;5dtTZA)8O@R|Y_~C(C;(AOTR9Sg3rA|6H<(0suNBnERTT}KHF!*KG!UzI>~W~rn%DVN+`nk8GDrC6FHUV)YTW8nGJS(InlQm&^>>gpd7 zp#_rl6wXi03~>kpH4A;H6&aB5Hj%Z_kbu?aydSH1>8mTPUlLvee2yP5ez+w;Vi_;?CCY$ z^!IiYu{)px+iKc!kN)#$l-r}hmQU^o`ClVS4(tvmB$n=GV5 zVqP>L-z@RUZ0_P*?lR8ll>N!fd6cJ3=jl|{$#mt(^ckKRey&|^S;vL~OI_`jJM|h7ycBIvqf%N##Ss;w0mz*qCqpJ= zrf6ASE7VTgzJx@Vx%{4teP~vP<@D`(55Jzo9Jf*sv_$s9p2C6-xgj}*ivF)EQXF&8 z$kM_NDKW^H047UGXX!TC?BJpDY#Ac2q}KdWq8@OU7q7+}sK*e_q-)%%_nr;G5!v%6#raIW*-ZBnWx-|!vUSV&ZU#y?vUM7aOGv~rl_ZgQHO&bxk-R4M zIa(3OO7_SA5=(Tu! z?b!g+ndf2+J6SQrStcH5U|$DLWmZ9Y|D;Cnq^^p{(LU;aWvP2Dyd$P1`d8z)E4{kv z1WIVXfQSQgqtnHcIfw9MoQjq{n|tf06=3ZV-YQ1sxYrn?BptE_Q((Hg?X|&TJDQV> z0$ug8<;K4j)_%RI0$=1dC{Y)9NN<7x0A3MN6*tG?%#t*?1l(?I6oA?!O%M+4+ zouv_Io@z$F9H0|EbCu}Djv&uW*}sy__gF;AN)mNXh>n)FX#|laB0b@N!jO7-m;9?b z=vS6scZ#cQ*I>B;LnUmiE#-G%3)rVC}dSi*M%>30lFWp1F^XzB7K=2Wt>- zH-ZX_q!z0BAspVfL~$b7o#Z7>(Tmxgu{j+97bW)r=iKXCluqWMDu$gFTp=vUvL={I z0;Wvve?Ky-|7H$Nn>5qfN_i7cA*vEKzdWyQ4B-^=U}nJbeY0>Zq861&LLj161~f|r zT16;xft$fc#Nw)vSj!|_p%-ZE9t?c~BJsMynl~W6hO`L%38~6Mh(_pc{%)1v?QeTQ z*A}uTUjo07KHO(AX;EazPv3FGQTwE39a-!wjZkL-^L1=-&9_QdBb}#9l3;ebWBCWv zjcK7O)eV4@=OEn(H#9*SOYojhwGi^!8NUSU#5A2u{OF@p9068#Ly*9WSDYAT zLSp%-fY}>3yJw#y=|pa<65C~6bxn;$t;qZNrV4D&WH;~_ElqNx&$!vp2*cpS>`h&m zAdx8#XZz;((yx`_8{Km zQEE1l+TAD8FK;(x2yT#J43W_urP3tW`fGs!-3dfAqWX(tFSy(wY^65kY~@?=nnM9E z)sv7VMmJN!&#ybAlg~t5!x%j`hJ8>rv-@*;qzo!NPJQqR2f{rewXX;8Ei>@1vT_MO z{;ZO=bP)A{B?uAWeV(|Z$2FrQhi%`Dy=xZ^sd|4%y-J%cgWIxtRa?zxERRUs8V@ZTM@E} z7%5V-YJ}1{`WEGqnS+5OCcqA@&|SPjC*aoyI9gszjcKU@Hd|2r@@RpsTCuK@mfyjr z?kolE&I+I66SO&I!5Ja5qY?+ViBFPnd8`Bl@-p)7*o~T@ z)dYoSGPbfP;X2lAX@^+QR?{$C*vyXR^x-C zC&f;Qi@eEGo~BivF3DZFxhVR|X&lAFjf9Fa7#7-ZY-NsBj^m4q-5nMAOCVmOWJz7R ziKAc*7t0g+8vi@ugaj;o7u zf=aBYst$-)D27HGjTjJ2kfQNbJ`N#Ed-6{p$=$Ew={-pYKNX8(So4d>j9A|27wmfB z{Qdiy#?X$<*}aDg6N`P+_;4Gd2pck_Y$T*}B&pS0SZiOxiRc{D4;sNrvjoJ{ z2jigC0S4LT?jXU5)$mtHM72Pvz#GSXToOcrc8!uJuzDNL;;{205YKLd7@26KTprJ~ z`eAX38{SqBUmyTK-ZYF{+wR;TV!M{+WO43{cb0-j|KVI!z|alUvnv_dl0{Yp-ET<* z2nZ@!`?5DovePQhq?m3ar%4anvS~zehqDkDm&QUuVE-j=k1B8(S*?zFlO{U)Om1%*++`91ij9s8@Sak(Lx9*G0f$x zxB@l2uqjw6?lQJAxgIN8w1!Wo0~Q`t95t{L#$=o0h|0W(%44Pukr?a4!Zq3s zQu3fv6R-0qctQQC*XcX2U*r8}-y)Z^faf;U6G!bo+#e-}*HDSi332#%&W~|)XJ}dT zBO5B2>c<5-F=#Bj*erbMEZmta{2*qY>}K^x^14?26k>Yda^`_KdcI+4xr|qn)M?B# z%NkpOE~9YRkSZPsD7-(lZqhOqEUq31p1~Cc^yR;T`P?XM{8ryp)lQoR)sLZG;+PP+j3TY%iCG< z;P`NkXBGmaeUe5yl7ur0RV_G)Jy<5|1BTuZD$0o(H+lCWE&X{)9B!;NUtb6yvmr1Na!` zx5HGu#n3s~LCj`3fa$SCKS4Uk)VN>gkknFo|Qb7NV|7>R`LJ5M0vkZ>4!Ol=?&V6u83SuieR z>-CECfTDghkX3vatPTaMv(jJNDWX&=7N;`xaeKbo^i+VmH~OUemi3;ZMl?b~* zQ25C1frgZ#WIPgKTQM5=)dFBS%KmGZz}lzA!l#Bi@P?K%uPP(=TSoXAV@9=(G7eTT zRwP&8JL0!*l34Myp0eEQ@F+ETk-B=~RWt=_1X#^bO~h^_$8wC7Z-Xrkw7N9F+4-v3 z795@ykPZ!{7er)`y@3;YUzxs(XXbk_jMAU(Hz~)B__%IwZP5BJXK8Yms@nA!Z*zzx z^WoZN5_5y~}P7h9ieI>+HjXV7?jr>ijW%F|B&vO0(xs zi>_3z-XzY>G|uj1c_m+V3r~8pdIHb@;>d8;0E^bgkN@8Vumig%ZkoLdo!fEdD6~=tbun433LHLDS=LdIHR+f4&b86uj zzO5p0hvGT}Hgznp;F+fGG*#B;Fgtz+k5nZsH^EwreZktU{9*Bv5PaSa>CH*p*t22V zg#5aQ=Gz_sMKl>}TU5J>^s>9DYqwq(KH@QeMqg;XU1`OHUvczQ?)|8&Iuh0>?38Do$%YZaB$yy&7 z;p^QG6W2`e-sPhd|0GNZ*QDyZoUxJh>hObOj$S;_SMnX&o+WMC-{$+K2!)=z`Gsn? 
[GIT binary patch payload (base85-encoded binary data) omitted]
z#g9*U@%>FUt`|^K?a0PjGDAbbbaZRkS#L!bXp2&Do{5(iKu8D60rbH zpql*5EN-lAP@0v&L|+*(p{6WNXYkFVL8ix3sjk)%pX!f?*b-@wJwk;wB84>)g*Co@ zHow612G`r6LH#(P=!j}3@Z&Z;Vp^p5Fu=Gn5)QOp1siC1O zgu-eUDjGbQx)#IYdJ-d(A@q&vNXav&s>Orq79Z+5q|`QhQr{{gG0hDR@3S~KeMoDc zjP79>GfUx&FGv}jc4O+ggq|5CJ11>)k88;&u|cT#m~%$&QCcgfxJE*0qlos25N39& znBHq(U?z)-MkNt3R!AgYAn^Hws2EdX;~cQI`4gXg`YS$s|6ftc91#mI6B25RHrST< zSP4>z35f|pR2oYXQaz~e4rTMChok#ToIKv)#;r;ECvxcQOQ5Pe6usJ!7`+RV&EYf^ z3Q5g!BsR;6*v!i$eP&v55q5hr&Gxc8)nhtC#y@a+~y zcSmSy4I(Ygj2j1e+&Zb?@Mw~*_B>MK-O&WtlbNsM^zm(;et*i{mt8b=c~ev*CP4Zz zk(#gQ=?Gu3e-5FC8(GQm#D{B0iV7u8uctUCo8)L63Xv;1wGU~DI>NLXk`l8>Ovp#C zi$Nig&{|VWMScwBxjG6nWpp;HSf0gVfIov7aR(g%u*XH6n%8TS&i* z_20z`k;3wAetxdh-!UTcr zGF}p6JVeI035;;{zKDyb5l$W#adN+allw)S+>LN{H^SM&7}wX=gNAECOFYF^c>7xO zPDz~zZS?09RN2tj?aJU}7-bDth)z34a=tk-?Vk|(ze{zy8^sN_gvNeFc)}%gaTiF* zwV|-mo!GQXH1`VV8CBBQC8DZHOkKMl#Wh}3G`XzQQpHf(>pmQ>u=HV!6dqe4( zjG}EoM{d3kp~2?FL^zV1T5$;SSV!wpp{3@=6LdKkw;G!c=ULN2M@2-m7 zTMANLS)M3lw!fIM_HdFzt;kNXq-Qu3S%4L)AWw4hlSzn+LLaQgUn)T0Y>%JJhL{+8 z6uus){Jj_-&0+d#KHcrH)KzHdY|$}49nJbuBHQcnY^`N7J6=dZRw%KNYSNM;$xhZ& zoDoe|LoOK!3NjMqbhX5jm+8jUkr=LyCZP|rMHk_KHp~%`(gK0Z645`wB~n-#aQ-5O zbA)okZGtxlC4SaOf7XKfg*u`Lwnw40G2l~K;v4uAK53xI>!az<@EPMHzl^u9F`nX| z0J?Zyz}e&cYkU`QbUnuvw+mcxGhlS_y!>`S#?8kZ50NEaQY+qxNcx(FE-wnJZ7Hg? zp`ytVRrrS_<)0-X_bjnlXHZ3cOi4X0;HPg|d!(qa|u zwUP8S>!~XACnDIKm@o@kYJzC4_Mpw)Svx3p}Z4QPb8RMO$wqxg`<;Lyd_^ zaiXC;l_PgSd&^a5wHOGI>z zr*rURlEbG{-23JlNB24zoC#%iGm5RFW;TzSncU80Y&w@)JN?`~nd93Z?(^{B8tZEn zT;J{F-lMDBIvwTUsF$7nIyN^;+1)GQ;HZ$3+XdXbQ_S(*O4he>sHzeZ9c50G-k9pj z0KWh3F8ljyKpAwJ3kq3e5u`d=U> z)P%-L5k;jA49zCv=Wm0l#aS#Jj1hRb;p%t=ZIF=SY#)lV#rTJuMWqyz9P3VReF)9< zVtP8mXsZvQB-@dNX*~@ULY8Ne>F@C8`f3I@cQX0gvstc<6;NLmMPqRuQ@wS}4UaO} z-^2QR2a!4ksE*tWfq8(mWKZgutBc1M-k%q_PS55vP2eSfp4Gz+nf9Z z(e!8S=Ve3nlN#e8x`ey%($4@I@Lh2`|BE=eU%=VpBF>(da1mU_O=ya{$O2D^CGVu= z8c|kjOKXpaswOwW6TYCR#)P=cFNw|kg6PyQ2v7PFRoF))q#KivW zqY68RfA9y$0{?_G@O{*gpP`QUj3E72czAz-=$($lP_vr$DyQ!veqp`;|a5EW&CPH#$Vye+wf z0*b3-6xGSes}Ylu?@H%L64y_Lxp9AuJI`)#aBqX7lSLLM8X4~^pdc@h)}CkvW+Pd= zk-^%{QkHJiu>Wv`!)LP`Js)HLem^5~sf;YdGITA5zUfF>`vaL;D`s-Og3M$wBLn%& zOck@VP|D)9a)!oIn4C*zawdbxnKb5?Qd!wZVRa*&jm>PfcM90OQNaFTA)A}ojE+V! zc{QHs@QXx7nzFv$#nY#UJb!t>;mH^;zq!jdukNtA*2lfm+uS)_!;3YuCC-O^}lu??Xzw8x2)T$_rg+s|%#PE}qm_PvXO@NYr0pcA$pg zwqn+%8)>OXroASXuG&O;+vCVc7NAmGp{^#7z1U|kEA5+|ghNouUY3jH_Lz^X) zb=D+ioh32%OJdW%A~@<}qEo&kQ1=n?pbzj5{(#WfPf5-*AuZRPuvk0d(w)gD5D^+< zO+cs-fm$PE+Rv%&)sR&tLK^e|Za(kh>iq}oJ%5L_>u+%Nc@Mwf^F$@sqmQ;CD%Kjc z+6o`hN7&l`1`EsIU}5!Z+`ayUjrDt2SiFnFmG>DK%ciC+l&o?Qg|&X96$;3z6i`$x zCpN}~c)bhN1%A}!D=0|xpfK5ksys2Z#lAFD1W;8HKv|KJ!dx+l@lHfWSQDZ*Cp^iX z!bU%$vm6Q18XQ?Ff&)l#(o8>hdJDMT*dK= zPWB)7a`0rFz5C&9Nekm_*p%hhowv|<*;+JovEo}1Rm$n_}h^h z?@n5(h}?W1n%h;hb_UVfsbc0@5{qkvtZx-^eLIiM?Ob;E@;NvtW_z=MYjf#zw`(XV z@E|VsGHq?4tgf~3@X0E-@AtF1(aOeF5Bo=59GncXf84_QZYiga#(3~_mj^EoSl^lD z+G+7A4k%X#8egskelX{m)I0`qIpZQ7{UTCxjR}tW5`FwvL?)iWL-;$qMDL?ieN2GvGvwM&@eTSC zZ|Ubadwxz(usx+UI>zS;*xoHkmf zZGjZl`H@~KBqG(3w0v(u!fep0Eyzk2k`&{GN^F8wVn%F;1Cc6Aa*}*#tBYo|Ka2j3 zWEyM3C@u0MKEZ{=3{MKG0}%V2NA7=_xFi9c{b?*8407|qI=3DkaqH0ux9;w-wAjVc zausVkMXcT^;>N>fZanGY_~kI$r`=rJsHVOqnCcQgigVm(ZjdlI6vDtv1e?cAtQ}TU z(V}8*xq{~QA|@w?i3}GYml>lCwjwjzgRDF^n%jffKB(g0w4UR;^&A{mv$Iph{(cF! 
[... binary patch data omitted ...]
literal 0
HcmV?d00001

diff --git a/rust/onnxruntime/tests/data/upsample.onnx b/rust/onnxruntime/tests/data/upsample.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..43b2596edcbbd495ee0a8bfb0eb6cd1d049b394b
GIT binary patch
literal 1861
[... binary patch data omitted ...]
literal 0
HcmV?d00001

diff --git a/rust/onnxruntime/tests/integration_tests.rs b/rust/onnxruntime/tests/integration_tests.rs
new file mode 100644
index 0000000000000..7843fe269e5e4
--- /dev/null
+++ b/rust/onnxruntime/tests/integration_tests.rs
@@ -0,0 +1,555 @@
+use onnxruntime::{error::OrtDownloadError, tensor::ndarray_tensor::NdArrayTensor};
+use std::{
+    fs,
+    io::{self, BufRead, BufReader},
+    path::Path,
+    sync::Arc,
+    time::Duration,
+};
+
+mod download {
+    use std::env::var;
+
+    use super::*;
+    const RUST_ONNXRUNTIME_LIBRARY_PATH: &str = "RUST_ONNXRUNTIME_LIBRARY_PATH";
+
+    use image::{imageops::FilterType, ImageBuffer, Luma, Pixel, Rgb};
+    use ndarray::s;
+    use test_log::test;
+
+    use onnxruntime::{
+        download::vision::{DomainBasedImageClassification, ImageClassification},
+        environment::Environment,
+        GraphOptimizationLevel, LoggingLevel,
+    };
+
+    #[test]
+    fn squeezenet_mushroom() {
+        const IMAGE_TO_LOAD: &str = "mushroom.png";
+
+        let path = var(RUST_ONNXRUNTIME_LIBRARY_PATH).ok();
+
+        let environment
= { + let builder = Environment::builder() + .with_name("integration_test") + .with_log_level(LoggingLevel::Warning); + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + builder.build().unwrap() + }; + let session = environment + .new_session_builder() + .unwrap() + .with_graph_optimization_level(GraphOptimizationLevel::Basic) + .unwrap() + .with_intra_op_num_threads(1) + .unwrap() + .with_model_downloaded(ImageClassification::SqueezeNet) + .expect("Could not download model from file"); + + let class_labels = get_imagenet_labels().unwrap(); + + let input0_shape: Vec = session.inputs[0].dimensions().map(|d| d.unwrap()).collect(); + let output0_shape: Vec = session.outputs[0] + .dimensions() + .map(|d| d.unwrap()) + .collect(); + + assert_eq!(input0_shape, [1, 3, 224, 224]); + assert_eq!(output0_shape, [1, 1000]); + + // Load image and resize to model's shape, converting to RGB format + let image_buffer: ImageBuffer, Vec> = image::open( + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join(IMAGE_TO_LOAD), + ) + .unwrap() + .resize( + input0_shape[2] as u32, + input0_shape[3] as u32, + FilterType::Nearest, + ) + .to_rgb8(); + + // Python: + // # image[y, x, RGB] + // # x==0 --> left + // # y==0 --> top + + // See https://github.com/onnx/models/blob/main/vision/classification/imagenet_inference.ipynb + // for pre-processing image. + // WARNING: Note order of declaration of arguments: (_,c,j,i) + let mut array = ndarray::Array::from_shape_fn((1, 3, 224, 224), |(_, c, j, i)| { + let pixel = image_buffer.get_pixel(i as u32, j as u32); + let channels = pixel.channels(); + + // range [0, 255] -> range [0, 1] + (channels[c] as f32) / 255.0 + }); + + // Normalize channels to mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225] + let mean = [0.485, 0.456, 0.406]; + let std = [0.229, 0.224, 0.225]; + for c in 0..3 { + let mut channel_array = array.slice_mut(s![0, c, .., ..]); + channel_array -= mean[c]; + channel_array /= std[c]; + } + + // Batch of 1 + let input_tensor_values = vec![array.into()]; + + // Perform the inference + let outputs = session.run(input_tensor_values).unwrap(); + + // Downloaded model does not have a softmax as final layer; call softmax on second axis + // and iterate on resulting probabilities, creating an index to later access labels. + let output = outputs[0].float_array().unwrap(); + let mut probabilities: Vec<(usize, f32)> = output + .softmax(ndarray::Axis(1)) + .iter() + .copied() + .enumerate() + .collect::>(); + // Sort probabilities so highest is at beginning of vector. 
+ probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + assert_eq!( + class_labels[probabilities[0].0], "n07734744 mushroom", + "Expecting class for {} to be a mushroom", + IMAGE_TO_LOAD + ); + + assert_eq!( + probabilities[0].0, 947, + "Expecting class for {} to be a mushroom (index 947 in labels file)", + IMAGE_TO_LOAD + ); + + // for i in 0..5 { + // println!( + // "class={} ({}); probability={}", + // labels[probabilities[i].0], probabilities[i].0, probabilities[i].1 + // ); + // } + } + + #[test] + fn mnist_5() { + const IMAGE_TO_LOAD: &str = "mnist_5.jpg"; + + let path = var(RUST_ONNXRUNTIME_LIBRARY_PATH).ok(); + + let environment = { + let builder = Environment::builder() + .with_name("integration_test") + .with_log_level(LoggingLevel::Warning); + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + builder.build().unwrap() + }; + + let session = environment + .new_session_builder() + .unwrap() + .with_graph_optimization_level(GraphOptimizationLevel::Basic) + .unwrap() + .with_intra_op_num_threads(1) + .unwrap() + .with_model_downloaded(DomainBasedImageClassification::Mnist) + .expect("Could not download model from file"); + + let input0_shape: Vec = session.inputs[0].dimensions().map(|d| d.unwrap()).collect(); + let output0_shape: Vec = session.outputs[0] + .dimensions() + .map(|d| d.unwrap()) + .collect(); + + assert_eq!(input0_shape, [1, 1, 28, 28]); + assert_eq!(output0_shape, [1, 10]); + + // Load image and resize to model's shape, converting to RGB format + let image_buffer: ImageBuffer, Vec> = image::open( + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join(IMAGE_TO_LOAD), + ) + .unwrap() + .resize( + input0_shape[2] as u32, + input0_shape[3] as u32, + FilterType::Nearest, + ) + .to_luma8(); + + let array = ndarray::Array::from_shape_fn((1, 1, 28, 28), |(_, c, j, i)| { + let pixel = image_buffer.get_pixel(i as u32, j as u32); + let channels = pixel.channels(); + + // range [0, 255] -> range [0, 1] + (channels[c] as f32) / 255.0 + }); + + // Batch of 1 + let input_tensor_values = vec![array.into()]; + + // Perform the inference + let outputs = session.run(input_tensor_values).unwrap(); + + let output = outputs[0].float_array().unwrap(); + let mut probabilities: Vec<(usize, f32)> = output + .softmax(ndarray::Axis(1)) + .iter() + .copied() + .enumerate() + .collect::>(); + + // Sort probabilities so highest is at beginning of vector. 
+ probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + assert_eq!( + probabilities[0].0, 5, + "Expecting class for {} is '5' (not {})", + IMAGE_TO_LOAD, probabilities[0].0 + ); + } + + #[test] + fn mnist_5_concurrent_session() { + const IMAGE_TO_LOAD: &str = "mnist_5.jpg"; + + let path = var(RUST_ONNXRUNTIME_LIBRARY_PATH).ok(); + + let environment = { + let builder = Environment::builder() + .with_name("integration_test") + .with_log_level(LoggingLevel::Warning); + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + builder.build().unwrap() + }; + + let session = Arc::new( + environment + .new_session_builder() + .unwrap() + .with_graph_optimization_level(GraphOptimizationLevel::Basic) + .unwrap() + .with_intra_op_num_threads(1) + .unwrap() + .with_model_downloaded(DomainBasedImageClassification::Mnist) + .expect("Could not download model from file"), + ); + + let children: Vec> = (0..20) + .map(move |_| { + let session = session.clone(); + std::thread::spawn(move || { + let input0_shape: Vec = + session.inputs[0].dimensions().map(|d| d.unwrap()).collect(); + let output0_shape: Vec = session.outputs[0] + .dimensions() + .map(|d| d.unwrap()) + .collect(); + + assert_eq!(input0_shape, [1, 1, 28, 28]); + assert_eq!(output0_shape, [1, 10]); + + // Load image and resize to model's shape, converting to RGB format + let image_buffer: ImageBuffer, Vec> = image::open( + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join(IMAGE_TO_LOAD), + ) + .unwrap() + .resize( + input0_shape[2] as u32, + input0_shape[3] as u32, + FilterType::Nearest, + ) + .to_luma8(); + + let array = ndarray::Array::from_shape_fn((1, 1, 28, 28), |(_, c, j, i)| { + let pixel = image_buffer.get_pixel(i as u32, j as u32); + let channels = pixel.channels(); + + // range [0, 255] -> range [0, 1] + (channels[c] as f32) / 255.0 + }); + + // Batch of 1 + let input_tensor_values = vec![array.into()]; + + // Perform the inference + let outputs = session.run(input_tensor_values).unwrap(); + + let output = &outputs[0].float_array().unwrap(); + let mut probabilities: Vec<(usize, f32)> = output + .softmax(ndarray::Axis(1)) + .iter() + .copied() + .enumerate() + .collect::>(); + + // Sort probabilities so highest is at beginning of vector. 
+ probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + assert_eq!( + probabilities[0].0, 5, + "Expecting class for {} is '5' (not {})", + IMAGE_TO_LOAD, probabilities[0].0 + ); + }) + }) + .collect(); + + assert!(children + .into_iter() + .map(std::thread::JoinHandle::join) + .collect::, _>>() + .is_ok()); + } + + #[test] + fn mnist_5_send_session() { + const IMAGE_TO_LOAD: &str = "mnist_5.jpg"; + + let path = var(RUST_ONNXRUNTIME_LIBRARY_PATH).ok(); + + let environment = { + let builder = Environment::builder() + .with_name("integration_test") + .with_log_level(LoggingLevel::Warning); + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + builder.build().unwrap() + }; + + let children: Vec> = (0..20) + .map(|_| { + let session = environment + .new_session_builder() + .unwrap() + .with_graph_optimization_level(GraphOptimizationLevel::Basic) + .unwrap() + .with_intra_op_num_threads(1) + .unwrap() + .with_model_downloaded(DomainBasedImageClassification::Mnist) + .expect("Could not download model from file"); + std::thread::spawn(move || { + let input0_shape: Vec = + session.inputs[0].dimensions().map(|d| d.unwrap()).collect(); + let output0_shape: Vec = session.outputs[0] + .dimensions() + .map(|d| d.unwrap()) + .collect(); + + assert_eq!(input0_shape, [1, 1, 28, 28]); + assert_eq!(output0_shape, [1, 10]); + + // Load image and resize to model's shape, converting to RGB format + let image_buffer: ImageBuffer, Vec> = image::open( + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join(IMAGE_TO_LOAD), + ) + .unwrap() + .resize( + input0_shape[2] as u32, + input0_shape[3] as u32, + FilterType::Nearest, + ) + .to_luma8(); + + let array = ndarray::Array::from_shape_fn((1, 1, 28, 28), |(_, c, j, i)| { + let pixel = image_buffer.get_pixel(i as u32, j as u32); + let channels = pixel.channels(); + + // range [0, 255] -> range [0, 1] + (channels[c] as f32) / 255.0 + }); + + // Batch of 1 + let input_tensor_values = vec![array.into()]; + + // Perform the inference + let outputs = session.run(input_tensor_values).unwrap(); + + let output = &outputs[0].float_array().unwrap(); + let mut probabilities: Vec<(usize, f32)> = output + .softmax(ndarray::Axis(1)) + .iter() + .copied() + .enumerate() + .collect::>(); + + // Sort probabilities so highest is at beginning of vector. + probabilities.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + assert_eq!( + probabilities[0].0, 5, + "Expecting class for {} is '5' (not {})", + IMAGE_TO_LOAD, probabilities[0].0 + ); + }) + }) + .collect(); + + assert!(children + .into_iter() + .map(std::thread::JoinHandle::join) + .collect::, _>>() + .is_ok()); + } + + // This test verifies that dynamically sized inputs and outputs work. 
It loads and runs + // upsample.onnx, which was produced via: + // + // ``` + // import subprocess + // from tensorflow import keras + // + // m = keras.Sequential([ + // keras.layers.UpSampling2D(size=2) + // ]) + // m.build(input_shape=(None, None, None, 3)) + // m.summary() + // m.save('saved_model') + // + // subprocess.check_call([ + // 'python', '-m', 'tf2onnx.convert', + // '--saved-model', 'saved_model', + // '--opset', '12', + // '--output', 'upsample.onnx', + // ]) + // ``` + #[test] + fn upsample() { + const IMAGE_TO_LOAD: &str = "mushroom.png"; + + let path = var(RUST_ONNXRUNTIME_LIBRARY_PATH).ok(); + + let environment = { + let builder = Environment::builder() + .with_name("integration_test") + .with_log_level(LoggingLevel::Warning); + let builder = if let Some(path) = path { + builder.with_library_path(path) + } else { + builder + }; + + builder.build().unwrap() + }; + + let session = environment + .new_session_builder() + .unwrap() + .with_graph_optimization_level(GraphOptimizationLevel::Basic) + .unwrap() + .with_intra_op_num_threads(1) + .unwrap() + .with_model_from_file( + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join("upsample.onnx"), + ) + .expect("Could not open model from file"); + + assert_eq!( + session.inputs[0].dimensions().collect::>(), + [None, None, None, Some(3)] + ); + assert_eq!( + session.outputs[0].dimensions().collect::>(), + [None, None, None, Some(3)] + ); + + // Load image, converting to RGB format + let image_buffer: ImageBuffer, Vec> = image::open( + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("data") + .join(IMAGE_TO_LOAD), + ) + .unwrap() + .to_rgb8(); + + let array = ndarray::Array::from_shape_fn((1, 224, 224, 3), |(_, j, i, c)| { + let pixel = image_buffer.get_pixel(i as u32, j as u32); + let channels = pixel.channels(); + + // range [0, 255] -> range [0, 1] + (channels[c] as f32) / 255.0 + }); + + // Just one input + let input_tensor_values = vec![array.into()]; + + // Perform the inference + let outputs = session.run(input_tensor_values).unwrap(); + + assert_eq!(outputs.len(), 1); + let output = outputs[0].float_array().unwrap(); + + // The image should have doubled in size + assert_eq!(output.shape(), [1, 448, 448, 3]); + } +} + +fn get_imagenet_labels() -> Result, OrtDownloadError> { + // Download the ImageNet class labels, matching SqueezeNet's classes. 
+    let labels_path = Path::new(env!("CARGO_MANIFEST_DIR")).join("synset.txt");
+    if !labels_path.exists() {
+        let url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt";
+        println!("Downloading {:?} to {:?}...", url, labels_path);
+        let resp = ureq::get(url)
+            .timeout(Duration::from_secs(180)) // 3 minutes
+            .call()
+            .map_err(Box::new)
+            .map_err(OrtDownloadError::UreqError)?;
+
+        assert!(resp.has("Content-Length"));
+        let len = resp
+            .header("Content-Length")
+            .and_then(|s| s.parse::<usize>().ok())
+            .unwrap();
+        println!("Downloading {} bytes...", len);
+
+        let mut reader = resp.into_reader();
+
+        let f = fs::File::create(&labels_path).unwrap();
+        let mut writer = io::BufWriter::new(f);
+
+        let bytes_io_count = io::copy(&mut reader, &mut writer).unwrap();
+
+        assert_eq!(bytes_io_count, len as u64);
+    }
+    let file = BufReader::new(fs::File::open(labels_path).unwrap());
+
+    file.lines()
+        .map(|line| line.map_err(|io_err| OrtDownloadError::IoError(io_err)))
+        .collect()
+}
diff --git a/rust/rustfmt.toml b/rust/rustfmt.toml
new file mode 100644
index 0000000000000..267219dda5f37
--- /dev/null
+++ b/rust/rustfmt.toml
@@ -0,0 +1,2 @@
+format_code_in_doc_comments = true
+imports_granularity = "Crate"

From 875a7791bf610f9d5cc670ad20dfb58627406aba Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin
Date: Wed, 8 Feb 2023 19:54:06 -0800
Subject: [PATCH 37/68] [DORT] Update import path (#14605)

Follow-up to https://github.com/pytorch/pytorch/pull/93409/files, which moved `aot_autograd`; this fixes the DORT CI failures.

---
 .../orttraining/python/training/torchdynamo/register_backend.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/orttraining/orttraining/python/training/torchdynamo/register_backend.py b/orttraining/orttraining/python/training/torchdynamo/register_backend.py
index a9450e119c7a5..2830a10b4feb7 100644
--- a/orttraining/orttraining/python/training/torchdynamo/register_backend.py
+++ b/orttraining/orttraining/python/training/torchdynamo/register_backend.py
@@ -4,7 +4,7 @@
 # --------------------------------------------------------------------------
 from functorch.compile import min_cut_rematerialization_partition
-from torch._dynamo.optimizations.training import aot_autograd
+from torch._dynamo.backends.common import aot_autograd
 from .ort_backend import OrtBackend

From b53038b6a097a5b7168a8783c9659cedf85c6e00 Mon Sep 17 00:00:00 2001
From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com>
Date: Thu, 9 Feb 2023 13:55:21 +0800
Subject: [PATCH 38/68] Fix softmax block forward with small element size (#14475)

### Description
1. ALIGN_BYTES was previously hard-coded to 16 because float4 is used for vectorization by default. This PR computes ALIGN_BYTES from the actual vectorization size.
2. Fix wrong data access when the element size is small (e.g., 1, 33). Small sizes may be used by SoftmaxTunableOp.
3. Fix a bug in the BlockReduce function on the ROCm EP where data could be written before it was read. There is also a slight performance improvement because all threads in warp-0 now do work.

BlockReduce method before this PR:
One block has N (warps_per_block) warps; one warp has M (WARP_SIZE) threads.
step1. All the threads in one block load their data into shared memory.
step2. Reduce all data to the first warp. Only the first N threads of warp-0 are used: thread-0 reduces warp-0's data and writes the result into the location of data0, thread-1 reduces warp-1's data and writes the result into the location of data1, and so on. __syncwarp(mask) is necessary here to make sure threads 1..N delay writing into warp-0's region until thread-0 has finished reading it.
step3. Thread-0 reduces all valid data (only the first N entries) in warp-0, writes the result into the location of data0, and returns data0.

Issue: ROCm does not support __syncwarp() today, so another implementation is needed to guarantee the read happens before the write inside warp-0.

BlockReduce function in this PR:
step2. Reduce all data to the first warp. All threads of warp-0 are used: each thread of warp-0 reads the same lane of every warp and reduces them. For example, thread-0 reduces the first entry of every warp and writes the result into the location of data0.
step3. Thread-0 reduces all data in warp-0, writes the result into the location of data0, and returns data0.

Shared memory
![image](https://user-images.githubusercontent.com/94887879/216281207-8b332af5-bb9f-443a-8e2d-5d40c2231629.png)

Test: kernel explorer will use small element sizes to test this (https://github.com/microsoft/onnxruntime/pull/14541).
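For readers who want the new reduction order without the template machinery, here is a minimal standalone sketch written as a plain block-wide sum. `block_reduce_sum`, `WARPS_PER_BLOCK`, and the hard-coded 32-lane warp are illustrative assumptions only; the real `blockReduce` in the diff below is templated over a reduction functor and uses the WARP_SIZE described above.

```
// Sketch only, assuming a fixed 4-warp, 32-lane thread block.
#include <cuda_runtime.h>

constexpr int WARP_SIZE = 32;        // "M" in the description above
constexpr int WARPS_PER_BLOCK = 4;   // "N" in the description above
constexpr int THREADS_PER_BLOCK = WARP_SIZE * WARPS_PER_BLOCK;

__device__ float block_reduce_sum(float* smem, float val) {
  // step1: every thread stores its partial value into shared memory.
  smem[threadIdx.x] = val;
  __syncthreads();

  // step2: each thread of warp-0 reads the same lane of every warp and accumulates,
  // then overwrites only its own slot. No warp-0 thread reads a slot that another
  // warp-0 thread writes in this step, so no __syncwarp() is needed here.
  if (threadIdx.x < WARP_SIZE) {
    float acc = smem[threadIdx.x];
    for (int w = 1; w < WARPS_PER_BLOCK; ++w) {
      acc += smem[threadIdx.x + w * WARP_SIZE];
    }
    smem[threadIdx.x] = acc;
  }
  __syncthreads();

  // step3: thread-0 reduces the WARP_SIZE partials left by warp-0 and broadcasts via smem[0].
  if (threadIdx.x == 0) {
    float total = 0.0f;
    for (int i = 1; i < WARP_SIZE; ++i) {
      total += smem[i];
    }
    smem[0] += total;
  }
  __syncthreads();
  return smem[0];
}

// launch as: row_sum<<<1, THREADS_PER_BLOCK>>>(d_x, d_out, n);
__global__ void row_sum(const float* x, float* out, int n) {
  __shared__ float smem[THREADS_PER_BLOCK];
  float v = 0.0f;
  for (int i = threadIdx.x; i < n; i += blockDim.x) {  // also handles n smaller than the block
    v += x[i];
  }
  float total = block_reduce_sum(smem, v);
  if (threadIdx.x == 0) {
    out[blockIdx.x] = total;
  }
}
```

The key difference from the old step2 is that each warp-0 thread now overwrites only its own slot after reading the other warps' slots, instead of writing into a warp-0 slot that thread-0 still had to read, which is what made the old scheme depend on __syncwarp().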
---
 .../cuda/math/softmax_blockwise_impl.cuh | 171 +++++++++---------
 1 file changed, 83 insertions(+), 88 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh b/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh
index bb26f5fdccad6..6cb65ea8e739c 100644
--- a/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/softmax_blockwise_impl.cuh
@@ -1,19 +1,19 @@
 /**
-* Copyright (c) 2016-present, Facebook, Inc.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ // The code below is mostly copied from Pytorch SoftMax.cuh @@ -23,7 +23,6 @@ namespace onnxruntime { namespace cuda { -constexpr int ALIGN_BYTES = 16; const int max_threads = 1024; dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { @@ -45,33 +44,28 @@ dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { return dim3(static_cast(block_size)); } - //////////////////////////////////////////////////////////////////////////////// // Regular kernel (fast when dim_size is large; requires inner_size == 1) //////////////////////////////////////////////////////////////////////////////// - template -struct MaxFloat -{ +struct MaxFloat { __device__ __forceinline__ AccumT operator()(AccumT max, T v) const { return ::max(max, (AccumT)v); } }; -template -struct AddFloat -{ +template +struct AddFloat { __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { return sum + (AccumT)v; } }; -template -struct SumExpFloat -{ +template +struct SumExpFloat { __device__ __forceinline__ SumExpFloat(AccumT v) - : max_k(v) {} + : max_k(v) {} __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const { return sum + std::exp((AccumT)v - max_k); @@ -80,12 +74,23 @@ struct SumExpFloat const AccumT max_k; }; -template class Reduction, typename AccumT> -__device__ __forceinline__ AccumT -blockReduce(AccumT* smem, AccumT val, - const Reduction& r, - AccumT defaultVal) -{ +// One block has N(warps_per_block) warps, one warp has M(WARP_SIZE) threads. +// 1. All the threads in one block read data into shared memory. +// 2. Reduce all data to the first warp. Only the threads of warp-0 are used. Each thread in warp-0 reads data from the +// same location of every warp and computes result. For example, thread-0 computes the first data of every warp and +// writes the result into the location of data0. +// Shared memory +// ----------------------------------------------------------------------------------------------------------------------- +// | data0 | data1 | data2 | .... | dataM | ... | dataM*2 | ... | +// ----------------------------------------------------------------------------------------------------------------------- +// | | | | +// -------------------warp-0----------------------------------warp-1----------------------------------warp-2-------------- +// 3. Thread-0 reduces all data in warp-0 and writes the results into the location of data0, then return data0. + +template