From 72d7ede6816d5e8155283f6a96bf73967a3b7d72 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 03:54:55 +0000 Subject: [PATCH 01/25] enable fp16kernels --- onnxruntime/core/providers/xnnpack/nn/conv.cc | 23 +++++-- .../xnnpack/xnnpack_execution_provider.cc | 68 ++++++++++++++++--- .../core/providers/xnnpack/xnnpack_init.h | 17 +++++ 3 files changed, 93 insertions(+), 15 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index b815cc1570c96..86d53ed9012e1 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -4,7 +4,7 @@ #include "conv.h" #include - +#include //temp #include #include "core/common/inlined_containers_fwd.h" #include "core/framework/tensorprotoutils.h" @@ -21,9 +21,10 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) { is_packed = false; + throw std::invalid_argument("I'm here now"); // only layout of weight input is adjusted via PrePack - if ((conv_type_ == OpComputeType::op_compute_type_fp32 && input_idx == 1) || - (conv_type_ != OpComputeType::op_compute_type_fp32 && input_idx == 3)) { // InputTensors::IN_W + if (((conv_type_ == OpComputeType::op_compute_type_fp32 || conv_type_ == OpComputeType::op_compute_type_fp16) && input_idx == 1) || + ((conv_type_ != OpComputeType::op_compute_type_fp32 || conv_type_ != OpComputeType::op_compute_type_fp16) && input_idx == 3)) { // InputTensors::IN_W // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} auto orig_shape = tensor.Shape(); const auto rank = orig_shape.NumDimensions(); @@ -70,6 +71,7 @@ Status Conv::Compute(OpKernelContext* context) const { // we support 1D or 2D. if 1D we convert to 2D by setting H to 1 const int64_t H = is_1D ? 1 : X_shape[1]; const int64_t W = X_shape[rank - 2]; + throw std::invalid_argument("I'm here now"); // We don't need to call ValidateInputShape as we checked validity in ConvChecker. // We also can't use ValidateInputShape as-is as the weight tensor was pre-packed and the layout was changed there. @@ -142,14 +144,24 @@ Status Conv::Compute(OpKernelContext* context) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, float, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); -ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvider, +ONNX_OPERATOR_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, float, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); +#ifdef XNNPACK_FP16_SUPPORTED +ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, MLFloat16, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); + +ONNX_OPERATOR_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, MLFloat16, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); +#endif + ONNX_OPERATOR_TYPED_KERNEL_EX( QLinearConv, kMSInternalNHWCDomain, @@ -175,5 +187,6 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( .TypeConstraint("T3", DataTypeImpl::GetTensorType()) .TypeConstraint("T4", DataTypeImpl::GetTensorType()), Conv); + } // namespace xnnpack } // namespace onnxruntime diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 12e567e7080b3..4d87b70fc0609 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -18,6 +18,13 @@ #include "core/providers/xnnpack/detail/node_support_checker.h" #include "core/providers/xnnpack/xnnpack_init.h" +namespace { +struct KernelRegistryAndStatus { + std::shared_ptr kernel_registry = std::make_shared(); + onnxruntime::Status st; +}; +} // namespace + namespace onnxruntime { namespace xnnpack { @@ -31,6 +38,10 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> +#define KERNEL_CREATE_INFO_TYPED_VERSIONED(Start, End, Type, Op, Domain) \ + BuildKernelCreateInfo + #define KERNEL_CREATE_INFO(Start, Op, Domain) \ BuildKernelCreateInfo< \ ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)> @@ -45,8 +56,12 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSIn class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 18, AveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 19, AveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, Conv); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, Conv); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); +#ifdef XNNPACK_FP16_SUPPORTED +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, Conv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, Conv); +#endif class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, ConvTranspose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); @@ -86,9 +101,26 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13 // Internal domain class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax); -std::unique_ptr RegisterKernels() { - auto kernel_registry = std::make_unique(); +#ifdef XNNPACK_FP16_SUPPORTED +Status RegisterFp16Kernels(KernelRegistry& kernel_registry) { + static const BuildKernelCreateInfoFn function_table[] = { + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + KERNEL_CREATE_INFO_TYPED_VERSIONED(1, 10, MLFloat16,Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED(11, MLFloat16, Conv, kMSInternalNHWCDomain), + }; + + for (auto& function_table_entry : function_table) { + KernelCreateInfo info = function_table_entry(); + if (info.kernel_def != nullptr) { // filter disabled entries where type is void + ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(info))); + } + } + + return Status::OK(); +} +#endif +Status RegisterKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing @@ -98,8 +130,8 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_VERSIONED(11, 18, AveragePool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(19, AveragePool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED(1, 10, Conv, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO(11, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED_VERSIONED(1, 10, float, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED(11, float, Conv, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(11, ConvTranspose, kMSInternalNHWCDomain), @@ -143,11 +175,11 @@ std::unique_ptr RegisterKernels() { for (auto& function_table_entry : function_table) { KernelCreateInfo info = function_table_entry(); if (info.kernel_def != nullptr) { // filter disabled entries where type is void - ORT_THROW_IF_ERROR(kernel_registry->Register(std::move(info))); + ORT_THROW_IF_ERROR(kernel_registry.Register(std::move(info))); } } - return kernel_registry; + return Status::OK(); } } // namespace xnnpack @@ -348,9 +380,25 @@ std::vector> XnnpackExecutionProvider::GetCap return capabilities; } +Status RegisterXnnPackKernels(KernelRegistry& kernel_registry) { + ORT_RETURN_IF_ERROR(RegisterKernels(kernel_registry)); +#ifdef XNNPACK_FP16_SUPPORTED + ORT_RETURN_IF_ERROR(RegisterFp16Kernels(kernel_registry)); +#endif + return Status::OK(); +} + +KernelRegistryAndStatus GetXnnPackKernelRegistry() { + KernelRegistryAndStatus ret; + ret.st = RegisterXnnPackKernels(*ret.kernel_registry); + return ret; +} + std::shared_ptr XnnpackExecutionProvider::GetKernelRegistry() const { - static std::shared_ptr registry = xnnpack::RegisterKernels(); - return registry; + static KernelRegistryAndStatus k = GetXnnPackKernelRegistry(); + // throw if the registry failed to initialize + ORT_THROW_IF_ERROR(k.st); + return k.kernel_registry; } XnnpackExecutionProvider::~XnnpackExecutionProvider() { diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index a1e64bf6046b2..c4484273a790e 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -46,6 +46,23 @@ namespace xnnpack { #define XNN_ALLOCATION_ALIGNMENT 16 #endif + +#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC) +#if !defined(__APPLE__) +// Had to temporary disable fp16 under APPLE ARM64, as compiling +// the source files require a hardware specific compilation flag. +// When building an universial binary for APPLE, this flag would +// cause trouble for x64 target. +// reference from MLAS + +#if (!defined(_MSC_VER)) +// KleidiAI doesn't support MSVC +#define XNNPACK_FP16_SUPPORTED +#endif + +#endif // +#endif // ARM64 + std::pair GetStoredAllocator(); } // namespace xnnpack From a03471a7e91bc7a40edc529f6d30d66702d39edc Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 13:18:45 +0000 Subject: [PATCH 02/25] maxpool --- .../core/providers/xnnpack/nn/max_pool.cc | 47 +++++++++++++++++-- .../xnnpack/xnnpack_execution_provider.cc | 20 ++++++-- .../core/providers/xnnpack/xnnpack_init.h | 2 +- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index 0f0b827974f66..9fabb9a0c3e73 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -2,9 +2,10 @@ // Licensed under the MIT License. #include "max_pool.h" - +#include #include "core/graph/graph.h" #include "core/providers/utils.h" +#include "core/providers/xnnpack/xnnpack_init.h" #include "core/framework/tensorprotoutils.h" // to sanity check output shape @@ -54,6 +55,9 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, // input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && +#ifdef XNNPACK_FP16_SUPPORTED + x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && +#endif x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { break; @@ -193,9 +197,19 @@ MaxPool::MaxPool(const OpKernelInfo& info) stride_height, stride_width, dilation_height, dilation_width, output_min, output_max, flags, &p); + } else if(input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16){ + maxpool_type_ = OpComputeType::op_compute_type_fp16; + const float output_min = -65504.0; + const float output_max = 65504.0; + status = xnn_create_max_pooling2d_nhwc_f16(input_padding_top, input_padding_right, + input_padding_bottom, input_padding_left, + pooling_height, pooling_width, + stride_height, stride_width, + dilation_height, dilation_width, + output_min, output_max, flags, &p); } else { auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto())); - ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8, but got ", stype); + ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8|FP16, but got ", stype); } ORT_ENFORCE(status == xnn_status_success, "xnn_create_max_pooling2d_nhwc_", OpTypeToString(maxpool_type_), "failed. Status:", status); @@ -225,10 +239,12 @@ Status MaxPool::Compute(OpKernelContext* context) const { pthreadpool_t threadpool = GetThreadPool(); auto reshape_fn = xnn_reshape_max_pooling2d_nhwc_f32; - if (maxpool_type_ == OpComputeType::op_compute_type_qu8) + if (maxpool_type_ == OpComputeType::op_compute_type_qu8) { reshape_fn = xnn_reshape_max_pooling2d_nhwc_u8; - else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) { + } else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) { reshape_fn = xnn_reshape_max_pooling2d_nhwc_s8; + } else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) { + reshape_fn = xnn_reshape_max_pooling2d_nhwc_f16; } auto status = reshape_fn(op0_.get(), N, H, W, @@ -244,8 +260,10 @@ Status MaxPool::Compute(OpKernelContext* context) const { status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), X.Data(), Y->MutableData()); } else if (maxpool_type_ == OpComputeType::op_compute_type_qu8) { status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), X.Data(), Y->MutableData()); - } else { + } else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) { status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), X.Data(), Y->MutableData()); + } else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) { + status = xnn_setup_max_pooling2d_nhwc_f16(op0_.get(), X.Data(), Y->MutableData()); } if (status != xnn_status_success) { @@ -285,5 +303,24 @@ ONNX_OPERATOR_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, kXnnpackExecutionPro DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), MaxPool); + +#ifdef XNNPACK_FP16_SUPPORTED +ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 8, 9, MLFloat16, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); + +ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 10, 10, MLFloat16, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); + +ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 11, 11, MLFloat16, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); + +ONNX_OPERATOR_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, MLFloat16, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); +#endif + } // namespace xnnpack } // namespace onnxruntime diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 4d87b70fc0609..1376d4b64ab20 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -38,7 +38,7 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> -#define KERNEL_CREATE_INFO_TYPED_VERSIONED(Start, End, Type, Op, Domain) \ +#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \ BuildKernelCreateInfo @@ -83,6 +83,12 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSIn class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); +#ifdef XNNPACK_FP16_SUPPORTED +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, MaxPool); +class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MLFloat16, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); +#endif // ONNX operators class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); @@ -105,8 +111,15 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainB Status RegisterFp16Kernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - KERNEL_CREATE_INFO_TYPED_VERSIONED(1, 10, MLFloat16,Conv, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED_TYPED(1, 10, MLFloat16,Conv, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_TYPED(11, MLFloat16, Conv, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED_TYPED(8, 9, MLFloat16, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_TYPED(10, 10, MLFloat16, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_TYPED(11, 11, MLFloat16, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED(12, MLFloat16, MaxPool, kMSInternalNHWCDomain), + }; for (auto& function_table_entry : function_table) { @@ -130,7 +143,7 @@ Status RegisterKernels(KernelRegistry& kernel_registry) { KERNEL_CREATE_INFO_VERSIONED(11, 18, AveragePool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(19, AveragePool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_TYPED_VERSIONED(1, 10, float, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_TYPED(1, 10, float, Conv, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_TYPED(11, float, Conv, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose, kMSInternalNHWCDomain), @@ -291,7 +304,6 @@ std::vector> XnnpackExecutionProvider::GetCap const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/) const { std::vector> capabilities; - std::shared_ptr registry = GetKernelRegistry(); std::unordered_map supported_node_unit_map; NodeSupportChecker checker{graph, supported_node_unit_map}; diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index c4484273a790e..3b8836b9ad497 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -1,5 +1,5 @@ #include "core/framework/allocator.h" - +#include //temp struct xnn_allocator; namespace onnxruntime { From 77c5643ff3759c0cded94cd58981daca42043ee6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 13:39:42 +0000 Subject: [PATCH 03/25] rm changes in conv --- onnxruntime/core/providers/xnnpack/nn/conv.cc | 16 ++-------------- .../xnnpack/xnnpack_execution_provider.cc | 18 +++++------------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index 86d53ed9012e1..d6cffcd08a39f 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -21,7 +21,6 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, /*out*/ bool& is_packed, /*out*/ PrePackedWeights* /*prepacked_weights*/) { is_packed = false; - throw std::invalid_argument("I'm here now"); // only layout of weight input is adjusted via PrePack if (((conv_type_ == OpComputeType::op_compute_type_fp32 || conv_type_ == OpComputeType::op_compute_type_fp16) && input_idx == 1) || ((conv_type_ != OpComputeType::op_compute_type_fp32 || conv_type_ != OpComputeType::op_compute_type_fp16) && input_idx == 3)) { // InputTensors::IN_W @@ -144,24 +143,13 @@ Status Conv::Compute(OpKernelContext* context) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, float, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); - -ONNX_OPERATOR_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, float, kXnnpackExecutionProvider, +ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); -#ifdef XNNPACK_FP16_SUPPORTED -ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, MLFloat16, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Conv); - -ONNX_OPERATOR_TYPED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, MLFloat16, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Conv); -#endif - ONNX_OPERATOR_TYPED_KERNEL_EX( QLinearConv, kMSInternalNHWCDomain, diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 1376d4b64ab20..21b9f873d51c6 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -56,12 +56,8 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSIn class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 18, AveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 19, AveragePool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, float, Conv); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, float, Conv); -#ifdef XNNPACK_FP16_SUPPORTED -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, MLFloat16, Conv); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, MLFloat16, Conv); -#endif +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, Conv); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, Conv); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, ConvTranspose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); @@ -81,7 +77,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWC class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLAS_SNAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); #ifdef XNNPACK_FP16_SUPPORTED class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, MaxPool); @@ -112,14 +108,10 @@ Status RegisterFp16Kernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - KERNEL_CREATE_INFO_VERSIONED_TYPED(1, 10, MLFloat16,Conv, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_TYPED(11, MLFloat16, Conv, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_TYPED(8, 9, MLFloat16, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED_TYPED(10, 10, MLFloat16, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED_TYPED(11, 11, MLFloat16, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_TYPED(12, MLFloat16, MaxPool, kMSInternalNHWCDomain), - }; for (auto& function_table_entry : function_table) { @@ -143,8 +135,8 @@ Status RegisterKernels(KernelRegistry& kernel_registry) { KERNEL_CREATE_INFO_VERSIONED(11, 18, AveragePool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(19, AveragePool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_TYPED(1, 10, float, Conv, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_TYPED(11, float, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(1, 10, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(11, Conv, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(11, ConvTranspose, kMSInternalNHWCDomain), From 944519ebf5a979a872b288493a00fee7a3925d29 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 13:51:55 +0000 Subject: [PATCH 04/25] revert conv --- onnxruntime/core/providers/xnnpack/nn/conv.cc | 179 ++++++++++++++++++ 1 file changed, 179 insertions(+) diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index d6cffcd08a39f..7e5324ed7cf9c 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -150,6 +150,162 @@ ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvid KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); +ONNX_OPERATOR_TYPED_KERNEL_EX( + QLinearConv, + kMSInternalNHWCDomain, + 10, + uint8_t,// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "conv.h" + +#include + +#include +#include "core/common/inlined_containers_fwd.h" +#include "core/framework/tensorprotoutils.h" +#include "core/framework/transpose_helper.h" +#include "core/providers/utils.h" +#include "core/providers/xnnpack/xnnpack_init.h" +#include "core/providers/xnnpack/detail/utils.h" + +namespace onnxruntime { +namespace xnnpack { + +// use PrePack to handle the weight layout change as that's not a simple NCHW -> NHWC transpose +Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, + /*out*/ bool& is_packed, + /*out*/ PrePackedWeights* /*prepacked_weights*/) { + is_packed = false; + // only layout of weight input is adjusted via PrePack + if ((conv_type_ == OpComputeType::op_compute_type_fp32 && input_idx == 1) || + (conv_type_ != OpComputeType::op_compute_type_fp32 && input_idx == 3)) { // InputTensors::IN_W + // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} + auto orig_shape = tensor.Shape(); + const auto rank = orig_shape.NumDimensions(); + + if (rank == 4) { + InlinedVector perm{0, 2, 3, 1}; + TensorShapeVector new_dims{orig_shape[0], + orig_shape[2], + orig_shape[3], + orig_shape[1]}; + + packed_w_ = Tensor(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + + SingleAxisTranspose(perm, tensor, packed_w_, /*from*/ 1, /*to*/ 3); + } else { + assert(rank == 3); // ConvBase::IsOnnxNodeSupported validates this + + InlinedVector perm{0, 2, 1}; + TensorShapeVector new_dims{orig_shape[0], + orig_shape[2], + orig_shape[1]}; + + packed_w_ = Tensor(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); + + SingleAxisTranspose(perm, tensor, packed_w_, /*from*/ 1, /*to*/ 2); + } + + is_packed = true; + + // we can create the kernel now + ORT_RETURN_IF_ERROR(CreateKernel()); + } + + return Status::OK(); +} + +Status Conv::Compute(OpKernelContext* context) const { + const Tensor& X = *context->Input(0); // this is in NHWC format + const auto& X_shape = X.Shape(); + const auto rank = X_shape.NumDimensions(); + const auto is_1D = rank == 3; + const int64_t N = X_shape[0]; // input is NHWC or NWC + + // we support 1D or 2D. if 1D we convert to 2D by setting H to 1 + const int64_t H = is_1D ? 1 : X_shape[1]; + const int64_t W = X_shape[rank - 2]; + + // We don't need to call ValidateInputShape as we checked validity in ConvChecker. + // We also can't use ValidateInputShape as-is as the weight tensor was pre-packed and the layout was changed there. + // ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(&X, &W)); + + // CPU Conv starts with TensorShapeVector Y_dims({N, M}); and passes in X->Shape().Slice(2); + // We know this is 2D in NHWC format so we need to start with 'N', pass in the H, W, and append M last + TensorShapeVector Y_dims(output_shape_); + Y_dims[0] = N; + Tensor* Y = context->Output(0, TensorShape(Y_dims)); + + // Bail out early if one of the dimensions is zero. + if (Y->Shape().Size() == 0) { + return Status::OK(); + } + + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = xnn_reshape_convolution2d_nhwc_f32; + if (conv_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qs8; + } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qu8; + } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qs8_qc8w; + } + + auto status = reshape_fn(op0_.get(), N, H, W, + &workspace_size, &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_convolution2d_nhwc_", OpTypeToString(conv_type_), + "returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + + if (conv_type_ == OpComputeType::op_compute_type_fp32) { + status = xnn_setup_convolution2d_nhwc_f32(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); + } else if (conv_type_ == OpComputeType::op_compute_type_qs8) { + status = xnn_setup_convolution2d_nhwc_qs8(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); + } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { + status = xnn_setup_convolution2d_nhwc_qu8(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); + } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { + status = xnn_setup_convolution2d_nhwc_qs8_qc8w(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); + } + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_convolution2d_nhwc_", + OpTypeToString(conv_type_), "returned ", status); + } + + status = xnn_run_operator(op0_.get(), threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); + } + + return Status::OK(); +} + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); + +ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); + ONNX_OPERATOR_TYPED_KERNEL_EX( QLinearConv, kMSInternalNHWCDomain, @@ -163,6 +319,29 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( .TypeConstraint("T4", DataTypeImpl::GetTensorType()), Conv); +ONNX_OPERATOR_TYPED_KERNEL_EX( + QLinearConv, + kMSInternalNHWCDomain, + 10, + int8_t, + kXnnpackExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()) + .TypeConstraint("T3", DataTypeImpl::GetTensorType()) + .TypeConstraint("T4", DataTypeImpl::GetTensorType()), + Conv); +} // namespace xnnpack +} // namespace onnxruntime + + kXnnpackExecutionProvider, + KernelDefBuilder() + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) + .TypeConstraint("T2", DataTypeImpl::GetTensorType()) + .TypeConstraint("T3", DataTypeImpl::GetTensorType()) + .TypeConstraint("T4", DataTypeImpl::GetTensorType()), + Conv); + ONNX_OPERATOR_TYPED_KERNEL_EX( QLinearConv, kMSInternalNHWCDomain, From 34abf574f8f932de7bd3f7e8a8d2c1999a291b41 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 14:00:16 +0000 Subject: [PATCH 05/25] revert conv --- onnxruntime/core/providers/xnnpack/nn/conv.cc | 180 ------------------ 1 file changed, 180 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index 7e5324ed7cf9c..b815cc1570c96 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -3,162 +3,6 @@ #include "conv.h" -#include -#include //temp -#include -#include "core/common/inlined_containers_fwd.h" -#include "core/framework/tensorprotoutils.h" -#include "core/framework/transpose_helper.h" -#include "core/providers/utils.h" -#include "core/providers/xnnpack/xnnpack_init.h" -#include "core/providers/xnnpack/detail/utils.h" - -namespace onnxruntime { -namespace xnnpack { - -// use PrePack to handle the weight layout change as that's not a simple NCHW -> NHWC transpose -Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, - /*out*/ bool& is_packed, - /*out*/ PrePackedWeights* /*prepacked_weights*/) { - is_packed = false; - // only layout of weight input is adjusted via PrePack - if (((conv_type_ == OpComputeType::op_compute_type_fp32 || conv_type_ == OpComputeType::op_compute_type_fp16) && input_idx == 1) || - ((conv_type_ != OpComputeType::op_compute_type_fp32 || conv_type_ != OpComputeType::op_compute_type_fp16) && input_idx == 3)) { // InputTensors::IN_W - // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} - auto orig_shape = tensor.Shape(); - const auto rank = orig_shape.NumDimensions(); - - if (rank == 4) { - InlinedVector perm{0, 2, 3, 1}; - TensorShapeVector new_dims{orig_shape[0], - orig_shape[2], - orig_shape[3], - orig_shape[1]}; - - packed_w_ = Tensor(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); - - SingleAxisTranspose(perm, tensor, packed_w_, /*from*/ 1, /*to*/ 3); - } else { - assert(rank == 3); // ConvBase::IsOnnxNodeSupported validates this - - InlinedVector perm{0, 2, 1}; - TensorShapeVector new_dims{orig_shape[0], - orig_shape[2], - orig_shape[1]}; - - packed_w_ = Tensor(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); - - SingleAxisTranspose(perm, tensor, packed_w_, /*from*/ 1, /*to*/ 2); - } - - is_packed = true; - - // we can create the kernel now - ORT_RETURN_IF_ERROR(CreateKernel()); - } - - return Status::OK(); -} - -Status Conv::Compute(OpKernelContext* context) const { - const Tensor& X = *context->Input(0); // this is in NHWC format - const auto& X_shape = X.Shape(); - const auto rank = X_shape.NumDimensions(); - const auto is_1D = rank == 3; - const int64_t N = X_shape[0]; // input is NHWC or NWC - - // we support 1D or 2D. if 1D we convert to 2D by setting H to 1 - const int64_t H = is_1D ? 1 : X_shape[1]; - const int64_t W = X_shape[rank - 2]; - throw std::invalid_argument("I'm here now"); - - // We don't need to call ValidateInputShape as we checked validity in ConvChecker. - // We also can't use ValidateInputShape as-is as the weight tensor was pre-packed and the layout was changed there. - // ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(&X, &W)); - - // CPU Conv starts with TensorShapeVector Y_dims({N, M}); and passes in X->Shape().Slice(2); - // We know this is 2D in NHWC format so we need to start with 'N', pass in the H, W, and append M last - TensorShapeVector Y_dims(output_shape_); - Y_dims[0] = N; - Tensor* Y = context->Output(0, TensorShape(Y_dims)); - - // Bail out early if one of the dimensions is zero. - if (Y->Shape().Size() == 0) { - return Status::OK(); - } - - pthreadpool_t threadpool = GetThreadPool(); - - // setup allocator/automated dellocate for workspace - size_t workspace_size = 0; - size_t workspace_alignment = 0; - xnn_allocator* allocator = GetStoredAllocator().second; - auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; - std::unique_ptr workspace(nullptr, deallocator); - - auto reshape_fn = xnn_reshape_convolution2d_nhwc_f32; - if (conv_type_ == OpComputeType::op_compute_type_qs8) { - reshape_fn = xnn_reshape_convolution2d_nhwc_qs8; - } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { - reshape_fn = xnn_reshape_convolution2d_nhwc_qu8; - } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { - reshape_fn = xnn_reshape_convolution2d_nhwc_qs8_qc8w; - } - - auto status = reshape_fn(op0_.get(), N, H, W, - &workspace_size, &workspace_alignment, - /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, - threadpool); - if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_convolution2d_nhwc_", OpTypeToString(conv_type_), - "returned ", status); - } - - workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); - - if (conv_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_convolution2d_nhwc_f32(op0_.get(), workspace.get(), X.Data(), - Y->MutableData()); - } else if (conv_type_ == OpComputeType::op_compute_type_qs8) { - status = xnn_setup_convolution2d_nhwc_qs8(op0_.get(), workspace.get(), X.Data(), - Y->MutableData()); - } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_convolution2d_nhwc_qu8(op0_.get(), workspace.get(), X.Data(), - Y->MutableData()); - } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { - status = xnn_setup_convolution2d_nhwc_qs8_qc8w(op0_.get(), workspace.get(), X.Data(), - Y->MutableData()); - } - - if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_convolution2d_nhwc_", - OpTypeToString(conv_type_), "returned ", status); - } - - status = xnn_run_operator(op0_.get(), threadpool); - if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); - } - - return Status::OK(); -} - -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Conv); -ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Conv); - -ONNX_OPERATOR_TYPED_KERNEL_EX( - QLinearConv, - kMSInternalNHWCDomain, - 10, - uint8_t,// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "conv.h" - #include #include @@ -332,28 +176,4 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( .TypeConstraint("T4", DataTypeImpl::GetTensorType()), Conv); } // namespace xnnpack -} // namespace onnxruntime - - kXnnpackExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) - .TypeConstraint("T2", DataTypeImpl::GetTensorType()) - .TypeConstraint("T3", DataTypeImpl::GetTensorType()) - .TypeConstraint("T4", DataTypeImpl::GetTensorType()), - Conv); - -ONNX_OPERATOR_TYPED_KERNEL_EX( - QLinearConv, - kMSInternalNHWCDomain, - 10, - int8_t, - kXnnpackExecutionProvider, - KernelDefBuilder() - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) - .TypeConstraint("T2", DataTypeImpl::GetTensorType()) - .TypeConstraint("T3", DataTypeImpl::GetTensorType()) - .TypeConstraint("T4", DataTypeImpl::GetTensorType()), - Conv); - -} // namespace xnnpack } // namespace onnxruntime From ccbb7df83ab1b32852202dc680cf7fb4eebd5686 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 28 Sep 2024 14:29:32 +0000 Subject: [PATCH 06/25] update --- onnxruntime/core/providers/xnnpack/nn/max_pool.cc | 2 +- .../providers/xnnpack/xnnpack_execution_provider.cc | 2 +- onnxruntime/core/providers/xnnpack/xnnpack_init.h | 10 +++------- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index 9fabb9a0c3e73..2e70574be213d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. #include "max_pool.h" -#include + #include "core/graph/graph.h" #include "core/providers/utils.h" #include "core/providers/xnnpack/xnnpack_init.h" diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 21b9f873d51c6..f1062a2ab06e6 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -77,7 +77,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWC class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLAS_SNAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); #ifdef XNNPACK_FP16_SUPPORTED class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, MaxPool); diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index 3b8836b9ad497..315f47cfe99e3 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -1,5 +1,5 @@ #include "core/framework/allocator.h" -#include //temp + struct xnn_allocator; namespace onnxruntime { @@ -46,7 +46,7 @@ namespace xnnpack { #define XNN_ALLOCATION_ALIGNMENT 16 #endif - +#if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) #if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC) #if !defined(__APPLE__) // Had to temporary disable fp16 under APPLE ARM64, as compiling @@ -54,14 +54,10 @@ namespace xnnpack { // When building an universial binary for APPLE, this flag would // cause trouble for x64 target. // reference from MLAS - -#if (!defined(_MSC_VER)) -// KleidiAI doesn't support MSVC #define XNNPACK_FP16_SUPPORTED -#endif - #endif // #endif // ARM64 +#endif // Visual Studio 16 or earlier does not support fp16 intrinsic std::pair GetStoredAllocator(); From 9f16ab2f986fb2afab79c0d8be5266c32e67ac02 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 29 Sep 2024 01:47:33 +0000 Subject: [PATCH 07/25] fix lint --- .../core/providers/xnnpack/nn/max_pool.cc | 28 +++++++++---------- .../xnnpack/xnnpack_execution_provider.cc | 4 +-- .../core/providers/xnnpack/xnnpack_init.h | 8 +++--- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index 2e70574be213d..64639a9cc9747 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -197,16 +197,16 @@ MaxPool::MaxPool(const OpKernelInfo& info) stride_height, stride_width, dilation_height, dilation_width, output_min, output_max, flags, &p); - } else if(input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16){ + } else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) { maxpool_type_ = OpComputeType::op_compute_type_fp16; const float output_min = -65504.0; const float output_max = 65504.0; status = xnn_create_max_pooling2d_nhwc_f16(input_padding_top, input_padding_right, - input_padding_bottom, input_padding_left, - pooling_height, pooling_width, - stride_height, stride_width, - dilation_height, dilation_width, - output_min, output_max, flags, &p); + input_padding_bottom, input_padding_left, + pooling_height, pooling_width, + stride_height, stride_width, + dilation_height, dilation_width, + output_min, output_max, flags, &p); } else { auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto())); ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8|FP16, but got ", stype); @@ -306,20 +306,20 @@ ONNX_OPERATOR_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, kXnnpackExecutionPro #ifdef XNNPACK_FP16_SUPPORTED ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 8, 9, MLFloat16, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - MaxPool); + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 10, 10, MLFloat16, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - MaxPool); + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 11, 11, MLFloat16, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - MaxPool); + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); ONNX_OPERATOR_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, MLFloat16, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - MaxPool); + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MaxPool); #endif } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index f1062a2ab06e6..18a2d4803833f 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -38,9 +38,9 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> -#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \ +#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \ BuildKernelCreateInfo + Type, Op)> #define KERNEL_CREATE_INFO(Start, Op, Domain) \ BuildKernelCreateInfo< \ diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index 315f47cfe99e3..04722fe01e027 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -48,16 +48,16 @@ namespace xnnpack { #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) #if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC) -#if !defined(__APPLE__) +#if !defined(__APPLE__) // Had to temporary disable fp16 under APPLE ARM64, as compiling // the source files require a hardware specific compilation flag. // When building an universial binary for APPLE, this flag would // cause trouble for x64 target. // reference from MLAS #define XNNPACK_FP16_SUPPORTED -#endif // -#endif // ARM64 -#endif // Visual Studio 16 or earlier does not support fp16 intrinsic +#endif // +#endif // ARM64 +#endif // Visual Studio 16 or earlier does not support fp16 intrinsic std::pair GetStoredAllocator(); From b8fcd59a24a9e2fa359acaa3671fa28a13dfd098 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 30 Sep 2024 03:21:30 +0000 Subject: [PATCH 08/25] add macros to register f16 kernels --- .../core/providers/xnnpack/nn/max_pool.cc | 2 +- .../xnnpack/xnnpack_execution_provider.cc | 96 +++++++------------ 2 files changed, 38 insertions(+), 60 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index 64639a9cc9747..c8d44cdb37c29 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -209,7 +209,7 @@ MaxPool::MaxPool(const OpKernelInfo& info) output_min, output_max, flags, &p); } else { auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto())); - ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8|FP16, but got ", stype); + ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8|FLOAT16, but got ", stype); } ORT_ENFORCE(status == xnn_status_success, "xnn_create_max_pooling2d_nhwc_", OpTypeToString(maxpool_type_), "failed. Status:", status); diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 18a2d4803833f..a22fadc9a5412 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -18,13 +18,6 @@ #include "core/providers/xnnpack/detail/node_support_checker.h" #include "core/providers/xnnpack/xnnpack_init.h" -namespace { -struct KernelRegistryAndStatus { - std::shared_ptr kernel_registry = std::make_shared(); - onnxruntime::Status st; -}; -} // namespace - namespace onnxruntime { namespace xnnpack { @@ -38,10 +31,6 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> -#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \ - BuildKernelCreateInfo - #define KERNEL_CREATE_INFO(Start, Op, Domain) \ BuildKernelCreateInfo< \ ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)> @@ -50,6 +39,27 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Type, Op)> +#ifdef XNNPACK_FP16_SUPPORTED + #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)> + + #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)> + + #define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) + + #define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name) +#else + #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) + #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) + #define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) + #define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) +#endif + // Layout sensitive operators in NHWC domain class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 7, 9, AveragePool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, AveragePool); @@ -79,12 +89,10 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSIn class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); -#ifdef XNNPACK_FP16_SUPPORTED -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MLFloat16, MaxPool); -class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MLFloat16, MaxPool); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); -#endif +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); // ONNX operators class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); @@ -103,29 +111,9 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13 // Internal domain class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax); -#ifdef XNNPACK_FP16_SUPPORTED -Status RegisterFp16Kernels(KernelRegistry& kernel_registry) { - static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - - KERNEL_CREATE_INFO_VERSIONED_TYPED(8, 9, MLFloat16, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_TYPED(10, 10, MLFloat16, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_TYPED(11, 11, MLFloat16, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_TYPED(12, MLFloat16, MaxPool, kMSInternalNHWCDomain), - }; +std::unique_ptr RegisterKernels() { + auto kernel_registry = std::make_unique(); - for (auto& function_table_entry : function_table) { - KernelCreateInfo info = function_table_entry(); - if (info.kernel_def != nullptr) { // filter disabled entries where type is void - ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(info))); - } - } - - return Status::OK(); -} -#endif - -Status RegisterKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing @@ -145,7 +133,12 @@ Status RegisterKernels(KernelRegistry& kernel_registry) { KERNEL_CREATE_INFO_VERSIONED(10, 10, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(10, 10, Resize, kMSInternalNHWCDomain), @@ -180,11 +173,11 @@ Status RegisterKernels(KernelRegistry& kernel_registry) { for (auto& function_table_entry : function_table) { KernelCreateInfo info = function_table_entry(); if (info.kernel_def != nullptr) { // filter disabled entries where type is void - ORT_THROW_IF_ERROR(kernel_registry.Register(std::move(info))); + ORT_THROW_IF_ERROR(kernel_registry->Register(std::move(info))); } } - return Status::OK(); + return kernel_registry; } } // namespace xnnpack @@ -296,6 +289,7 @@ std::vector> XnnpackExecutionProvider::GetCap const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/) const { std::vector> capabilities; + std::shared_ptr registry = GetKernelRegistry(); std::unordered_map supported_node_unit_map; NodeSupportChecker checker{graph, supported_node_unit_map}; @@ -384,25 +378,9 @@ std::vector> XnnpackExecutionProvider::GetCap return capabilities; } -Status RegisterXnnPackKernels(KernelRegistry& kernel_registry) { - ORT_RETURN_IF_ERROR(RegisterKernels(kernel_registry)); -#ifdef XNNPACK_FP16_SUPPORTED - ORT_RETURN_IF_ERROR(RegisterFp16Kernels(kernel_registry)); -#endif - return Status::OK(); -} - -KernelRegistryAndStatus GetXnnPackKernelRegistry() { - KernelRegistryAndStatus ret; - ret.st = RegisterXnnPackKernels(*ret.kernel_registry); - return ret; -} - std::shared_ptr XnnpackExecutionProvider::GetKernelRegistry() const { - static KernelRegistryAndStatus k = GetXnnPackKernelRegistry(); - // throw if the registry failed to initialize - ORT_THROW_IF_ERROR(k.st); - return k.kernel_registry; + static std::shared_ptr registry = xnnpack::RegisterKernels(); + return registry; } XnnpackExecutionProvider::~XnnpackExecutionProvider() { From 65f65ca199a4eddbcbe869bf640f7cae50d0d86d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 30 Sep 2024 14:14:06 +0000 Subject: [PATCH 09/25] update --- onnxruntime/core/providers/xnnpack/nn/max_pool.cc | 2 -- onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index c8d44cdb37c29..a7c127b3f6b34 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -55,9 +55,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, // input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && -#ifdef XNNPACK_FP16_SUPPORTED x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && -#endif x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { break; diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index b033ddbca23d6..680485467343e 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(USE_XNNPACK) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" From eb55ead4b1f70232736ebc7647c05f54b7b51183 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 30 Sep 2024 14:27:39 +0000 Subject: [PATCH 10/25] only for arm64 --- onnxruntime/core/providers/xnnpack/xnnpack_init.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index 04722fe01e027..d6a3b10bb57d5 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -47,7 +47,7 @@ namespace xnnpack { #endif #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) -#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC) +#if (defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC)) && !XNN_ARCH_X86_64 #if !defined(__APPLE__) // Had to temporary disable fp16 under APPLE ARM64, as compiling // the source files require a hardware specific compilation flag. From 96070d7a1f10ee2e74761448528815276288790d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 30 Sep 2024 14:38:41 +0000 Subject: [PATCH 11/25] update --- .../xnnpack/xnnpack_execution_provider.cc | 29 +++++++++---------- .../core/providers/xnnpack/xnnpack_init.h | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index a22fadc9a5412..cb31533e0bcf5 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -40,24 +40,24 @@ KernelCreateInfo BuildKernelCreateInfo() { ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Type, Op)> #ifdef XNNPACK_FP16_SUPPORTED - #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)> +#define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)> - #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)> +#define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)> - #define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) +#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) - #define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name) +#define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name) #else - #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) - #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) - #define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) - #define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) +#define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) +#define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) +#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) +#define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) #endif // Layout sensitive operators in NHWC domain @@ -138,7 +138,6 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(10, 10, Resize, kMSInternalNHWCDomain), diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index d6a3b10bb57d5..16031ee8f58ef 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -53,7 +53,7 @@ namespace xnnpack { // the source files require a hardware specific compilation flag. // When building an universial binary for APPLE, this flag would // cause trouble for x64 target. -// reference from MLAS +// referenced from MLAS #define XNNPACK_FP16_SUPPORTED #endif // #endif // ARM64 From 49d751650ee26466d3cbdc5a35eddb5b7688aaef Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 1 Oct 2024 03:10:41 +0000 Subject: [PATCH 12/25] update1 --- .../xnnpack/xnnpack_execution_provider.cc | 20 +++++++++---------- .../providers/cpu/nn/pool_fp16_op_test.cc | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index cb31533e0bcf5..d8c613f3547a3 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -48,16 +48,16 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)> -#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) +#define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ + class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) -#define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name) +#define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ + class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name) #else #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) -#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) -#define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) +#define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) +#define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) #endif // Layout sensitive operators in NHWC domain @@ -89,10 +89,10 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSIn class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); +CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); +CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); +CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); +CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); // ONNX operators class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index d4e0af5011525..4060abe366e35 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(XNNPACK_F16_SUPPORTED) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" From e66d1eb773cf2d385e6e2d1d70006d12a2a521f6 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 1 Oct 2024 04:03:36 +0000 Subject: [PATCH 13/25] update --- .../providers/xnnpack/xnnpack_execution_provider.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index d8c613f3547a3..24cdd91610850 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -42,11 +42,11 @@ KernelCreateInfo BuildKernelCreateInfo() { #ifdef XNNPACK_FP16_SUPPORTED #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \ BuildKernelCreateInfo< \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)> + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)>, #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \ BuildKernelCreateInfo< \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)> + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)>, #define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) @@ -133,10 +133,11 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_VERSIONED(10, 10, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain), - KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain), + // because the F16 macro may be expanded to empty, so don't add "," + KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain) + KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain) + KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain) + KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain) KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), From 608f9d2295b038b293f27d9b888830c023ee2090 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 1 Oct 2024 09:42:06 +0000 Subject: [PATCH 14/25] minor fix --- .../core/providers/xnnpack/xnnpack_execution_provider.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 24cdd91610850..38e5a37ae0d37 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -57,7 +57,7 @@ KernelCreateInfo BuildKernelCreateInfo() { #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) #define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) -#define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) +#define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) #endif // Layout sensitive operators in NHWC domain @@ -94,6 +94,7 @@ CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); + // ONNX operators class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 9, 10, Gemm); From 53bcfd3754b40d0ad0dddadef1059e4836f72a7e Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 12:00:12 +0000 Subject: [PATCH 15/25] typo --- onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index 4060abe366e35..46eb1180f4e7e 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(XNNPACK_F16_SUPPORTED) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(XNNPACK_FP16_SUPPORTED) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" From 2ab8233b87f56ff700a5a4d019eb493a4af6523b Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 12:53:29 +0000 Subject: [PATCH 16/25] lint --- .../core/providers/xnnpack/xnnpack_execution_provider.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 38e5a37ae0d37..3223880c6efe1 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -94,7 +94,6 @@ CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); - // ONNX operators class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 9, 10, Gemm); @@ -136,11 +135,11 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain), // because the F16 macro may be expanded to empty, so don't add "," KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain) + KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain) + KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain) + KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(10, 10, Resize, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(11, 12, Resize, kMSInternalNHWCDomain), From b490108f2b718ac0616c5654c54dc9d9b48349d5 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 13:20:37 +0000 Subject: [PATCH 17/25] temp change --- onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index 46eb1180f4e7e..d8cf873256559 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,7 @@ #include "core/mlas/inc/mlas.h" -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(XNNPACK_FP16_SUPPORTED) +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(XNNPACK_FP16_SUPPORTED) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" From b39e962b980a48ef5d1bb682b2218e587cff0520 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 14:32:14 +0000 Subject: [PATCH 18/25] onnx node supported --- onnxruntime/core/providers/xnnpack/nn/max_pool.cc | 3 +++ onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index a7c127b3f6b34..4f0f8fd2ddc61 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -55,7 +55,10 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, // input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && +// because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM +if XNNPACK_FP16_SUPPORTED x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && +#endif x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { break; diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index d8cf873256559..655821817d487 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,7 +3,11 @@ #include "core/mlas/inc/mlas.h" +<<<<<<< HEAD #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(XNNPACK_FP16_SUPPORTED) +======= +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(XNNPACK_FP16_SUPPORTED) +>>>>>>> 6527ac2a24 (onnx node supported) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" From b3dac3767d76c8f5ca4fd81efd0b581873ec88ed Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 14:38:19 +0000 Subject: [PATCH 19/25] update --- onnxruntime/core/providers/xnnpack/xnnpack_init.h | 8 +++----- onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc | 4 ---- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index 16031ee8f58ef..e3f71b9a90367 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -49,11 +49,9 @@ namespace xnnpack { #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) #if (defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC)) && !XNN_ARCH_X86_64 #if !defined(__APPLE__) -// Had to temporary disable fp16 under APPLE ARM64, as compiling -// the source files require a hardware specific compilation flag. -// When building an universial binary for APPLE, this flag would -// cause trouble for x64 target. -// referenced from MLAS +// Had to temporary disable fp16 under APPLE ARM64, as compiling the source files +// require a hardware specific compilation flag. When building an universial binary for APPLE, +// this flag would cause trouble for x64 target. Referenced from MLAS #define XNNPACK_FP16_SUPPORTED #endif // #endif // ARM64 diff --git a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc index 655821817d487..46eb1180f4e7e 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc @@ -3,11 +3,7 @@ #include "core/mlas/inc/mlas.h" -<<<<<<< HEAD -#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(XNNPACK_FP16_SUPPORTED) -======= #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(XNNPACK_FP16_SUPPORTED) ->>>>>>> 6527ac2a24 (onnx node supported) #include "core/providers/cpu/nn/pool.h" #include "gtest/gtest.h" From 7831e300bc66c7d6680bc679de27b86650fe0895 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 14:59:09 +0000 Subject: [PATCH 20/25] typo --- onnxruntime/core/providers/xnnpack/nn/max_pool.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index 4f0f8fd2ddc61..f0cf7a599d131 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -56,7 +56,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && // because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM -if XNNPACK_FP16_SUPPORTED +#if XNNPACK_FP16_SUPPORTED x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && #endif x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && From 70dedcc27bc06366b792437fa3230fbbcda3bd83 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 15:16:08 +0000 Subject: [PATCH 21/25] typo --- onnxruntime/core/providers/xnnpack/nn/max_pool.cc | 2 +- .../core/providers/xnnpack/xnnpack_execution_provider.cc | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index f0cf7a599d131..749e004094ba1 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -56,7 +56,7 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && // because pool_fp16_op_test can be enabled by other preprocessor, for example, COREML_ENABLE_MLPROGRAM -#if XNNPACK_FP16_SUPPORTED +#ifdef XNNPACK_FP16_SUPPORTED x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && #endif x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 3223880c6efe1..733634ecbc9a0 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -49,10 +49,12 @@ KernelCreateInfo BuildKernelCreateInfo() { ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)>, #define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ - class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, endver, MLFloat16, name) + class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, \ + startver, endver, MLFloat16, name) #define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ - class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name) + class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, \ + MLFloat16, name) #else #define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) #define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) From 27011c05cf8415b32b29da9a632d00030a325035 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 15:21:13 +0000 Subject: [PATCH 22/25] XNN_ARCH_ARM64 --- .../core/providers/xnnpack/xnnpack_init.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index e3f71b9a90367..efb0189976708 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -46,16 +46,17 @@ namespace xnnpack { #define XNN_ALLOCATION_ALIGNMENT 16 #endif -#if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) -#if (defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC)) && !XNN_ARCH_X86_64 -#if !defined(__APPLE__) -// Had to temporary disable fp16 under APPLE ARM64, as compiling the source files -// require a hardware specific compilation flag. When building an universial binary for APPLE, -// this flag would cause trouble for x64 target. Referenced from MLAS +#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) + #define XNN_ARCH_ARM64 1 +#else + #define XNN_ARCH_ARM64 0 +#endif + +// fp16 support can vary on a kernel by kernel basis. Keep it simple and limit to arm64 for now. +// e.g. XNNPACK maxpool has x64 and arm64 fp16 kernels. +#if XNN_ARCH_ARM64 #define XNNPACK_FP16_SUPPORTED -#endif // -#endif // ARM64 -#endif // Visual Studio 16 or earlier does not support fp16 intrinsic +#endif std::pair GetStoredAllocator(); From 9dfa8b1b43b59a18ae7ea2d7b83690622b026a7f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 2 Oct 2024 15:22:18 +0000 Subject: [PATCH 23/25] lint --- .../core/providers/xnnpack/xnnpack_execution_provider.cc | 4 ++-- onnxruntime/core/providers/xnnpack/xnnpack_init.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 733634ecbc9a0..ac826a1bcec98 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -48,11 +48,11 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)>, -#define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ +#define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, \ startver, endver, MLFloat16, name) -#define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ +#define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, \ MLFloat16, name) #else diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index efb0189976708..ed824939a40da 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -47,9 +47,9 @@ namespace xnnpack { #endif #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) - #define XNN_ARCH_ARM64 1 +#define XNN_ARCH_ARM64 1 #else - #define XNN_ARCH_ARM64 0 +#define XNN_ARCH_ARM64 0 #endif // fp16 support can vary on a kernel by kernel basis. Keep it simple and limit to arm64 for now. From be9c20953275778e4b61a28b6a38db04af8f216f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 3 Oct 2024 04:41:44 +0000 Subject: [PATCH 24/25] update --- .../xnnpack/xnnpack_execution_provider.cc | 28 ++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index ac826a1bcec98..1f50009ec735f 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -31,6 +31,10 @@ KernelCreateInfo BuildKernelCreateInfo() { BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> +#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Type, Op)> + #define KERNEL_CREATE_INFO(Start, Op, Domain) \ BuildKernelCreateInfo< \ ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)> @@ -40,14 +44,6 @@ KernelCreateInfo BuildKernelCreateInfo() { ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Type, Op)> #ifdef XNNPACK_FP16_SUPPORTED -#define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)>, - -#define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)>, - #define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, \ startver, endver, MLFloat16, name) @@ -56,8 +52,6 @@ KernelCreateInfo BuildKernelCreateInfo() { class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, \ MLFloat16, name) #else -#define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) -#define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) #define CLASS_ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) #define CLASS_ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) #endif @@ -135,13 +129,8 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_VERSIONED(10, 10, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain), - // because the F16 macro may be expanded to empty, so don't add "," - KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain) - KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(10, 10, Resize, kMSInternalNHWCDomain), KERNEL_CREATE_INFO_VERSIONED(11, 12, Resize, kMSInternalNHWCDomain), @@ -170,6 +159,13 @@ std::unique_ptr RegisterKernels() { KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv, kMSInternalNHWCDomain), KERNEL_CREATE_INFO(1, QLinearSoftmax, kDynamicDomainByCreate), + +#ifdef XNNPACK_FP16_SUPPORTED + KERNEL_CREATE_INFO_VERSIONED_TYPED(8, 9, MLFloat16, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_TYPED(10, 10, MLFloat16, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED_TYPED(11, 11, MLFloat16, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED(12, MLFloat16, MaxPool, kMSInternalNHWCDomain), +#endif }; for (auto& function_table_entry : function_table) { From 060c55b38b81120a310b1220316fc0fdaeb4ff34 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 3 Oct 2024 05:01:35 +0000 Subject: [PATCH 25/25] lint --- .../core/providers/xnnpack/xnnpack_execution_provider.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 1f50009ec735f..df7df0b4376ce 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -32,7 +32,7 @@ KernelCreateInfo BuildKernelCreateInfo() { ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> #define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \ - BuildKernelCreateInfo< \ + BuildKernelCreateInfo< \ ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Type, Op)> #define KERNEL_CREATE_INFO(Start, Op, Domain) \