Add MaxPool FP16 in XnnPack EP #22258

Merged · 28 commits · Oct 3, 2024
Changes from 13 commits
43 changes: 39 additions & 4 deletions onnxruntime/core/providers/xnnpack/nn/max_pool.cc
@@ -5,6 +5,7 @@

#include "core/graph/graph.h"
#include "core/providers/utils.h"
#include "core/providers/xnnpack/xnnpack_init.h"
#include "core/framework/tensorprotoutils.h"

// to sanity check output shape
@@ -54,6 +55,7 @@
    // The input of MaxPool can be fp16/fp32/fp64 or i8/u8 according to ONNX.
    if (x_type == nullptr ||
        (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
         x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
         x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 &&
         x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) {
      break;
@@ -193,9 +195,19 @@
                                               stride_height, stride_width,
                                               dilation_height, dilation_width,
                                               output_min, output_max, flags, &p);
  } else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
    maxpool_type_ = OpComputeType::op_compute_type_fp16;
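    // ±65504 is the largest finite fp16 value, (2 - 2^-10) * 2^15, so this
    // min/max pair effectively disables output clamping for fp16.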
    const float output_min = -65504.0f;
    const float output_max = 65504.0f;
    status = xnn_create_max_pooling2d_nhwc_f16(input_padding_top, input_padding_right,
                                               input_padding_bottom, input_padding_left,
                                               pooling_height, pooling_width,
                                               stride_height, stride_width,
                                               dilation_height, dilation_width,
                                               output_min, output_max, flags, &p);
  } else {
    auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto()));
    ORT_THROW("unsupported data type in MaxPool; supported types are FLOAT|FLOAT16|UINT8|INT8, but got ", stype);
  }
  ORT_ENFORCE(status == xnn_status_success, "xnn_create_max_pooling2d_nhwc_",
              OpTypeToString(maxpool_type_), " failed. Status: ", status);
@@ -225,10 +237,12 @@
  pthreadpool_t threadpool = GetThreadPool();

  auto reshape_fn = xnn_reshape_max_pooling2d_nhwc_f32;
  if (maxpool_type_ == OpComputeType::op_compute_type_qu8) {
    reshape_fn = xnn_reshape_max_pooling2d_nhwc_u8;
  } else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
    reshape_fn = xnn_reshape_max_pooling2d_nhwc_s8;
  } else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) {
    reshape_fn = xnn_reshape_max_pooling2d_nhwc_f16;
  }

  auto status = reshape_fn(op0_.get(), N, H, W,
@@ -244,8 +258,10 @@
    status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), X.Data<float>(), Y->MutableData<float>());
  } else if (maxpool_type_ == OpComputeType::op_compute_type_qu8) {
    status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), X.Data<uint8_t>(), Y->MutableData<uint8_t>());
  } else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
    status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), X.Data<int8_t>(), Y->MutableData<int8_t>());
  } else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) {
    status = xnn_setup_max_pooling2d_nhwc_f16(op0_.get(), X.Data<MLFloat16>(), Y->MutableData<MLFloat16>());
  }

  if (status != xnn_status_success) {
@@ -285,5 +301,24 @@
                                         DataTypeImpl::GetTensorType<uint8_t>(),
                                         DataTypeImpl::GetTensorType<int8_t>()}),
                          MaxPool);

#ifdef XNNPACK_FP16_SUPPORTED
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 8, 9, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 10, 10, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 11, 11, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 12, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);
#endif  // XNNPACK_FP16_SUPPORTED

} // namespace xnnpack
} // namespace onnxruntime
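
For context, the kernels registered above are reached through the normal execution-provider selection path. A minimal sketch of running an fp16 model on the XNNPACK EP via the ORT C++ API — the model path and option values are placeholder assumptions, not part of this PR:

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "xnnpack_fp16_demo"};
  Ort::SessionOptions so;
  // XNNPACK runs its own thread pool; the ORT docs suggest giving it the
  // threads and keeping ORT's intra-op pool at 1.
  so.AppendExecutionProvider("XNNPACK", {{"intra_op_num_threads", "4"}});
  so.SetIntraOpNumThreads(1);
  // Hypothetical fp16 model; on Windows the path would be a wide string.
  // fp16 MaxPool nodes can now be assigned to the XNNPACK EP.
  Ort::Session session{env, "model_fp16.onnx", so};
  return 0;
}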
29 changes: 29 additions & 0 deletions onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc
@@ -39,6 +39,27 @@
BuildKernelCreateInfo< \
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Type, Op)>

#ifdef XNNPACK_FP16_SUPPORTED
#define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \
  BuildKernelCreateInfo<                                          \
      ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, MLFloat16, Op)>

#define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \
  BuildKernelCreateInfo<                           \
      ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, MLFloat16, Op)>

#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \
  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, \
                                                  startver, endver, MLFloat16, name)

#define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \
  ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, startver, MLFloat16, name)
#else
// Without fp16 support, the FP16 entries must still compile: the CREATE_INFO
// macros expand to BuildKernelCreateInfo<void>, which the registration loop
// filters out, and the CLASS_NAME macros expand to unused forward declarations.
#define KERNEL_CREATE_INFO_VERSIONED_FP16(Start, End, Op, Domain) \
  BuildKernelCreateInfo<void>
#define KERNEL_CREATE_INFO_FP16(Start, Op, Domain) \
  BuildKernelCreateInfo<void>
#define ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(provider, domain, startver, endver, name) \
  unused_fp16_kernel_##startver##_##endver##_##name
#define ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(provider, domain, startver, name) \
  unused_fp16_kernel_##startver##_##name
#endif
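
To make the indirection concrete, with fp16 support enabled the first FP16 entry used in the kernel list further down expands, per the macros above, to:

// KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain) ==>
//   BuildKernelCreateInfo<
//       ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(
//           kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MLFloat16, MaxPool)>
// i.e. the MLFloat16-typed NHWC MaxPool kernel class for opsets 8-9.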

// Layout sensitive operators in NHWC domain
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 7, 9, AveragePool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, AveragePool);
@@ -68,6 +89,10 @@
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME_FP16(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool);

// ONNX operators
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm);
@@ -108,6 +133,10 @@
KERNEL_CREATE_INFO_VERSIONED(10, 10, MaxPool, kMSInternalNHWCDomain),
KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool, kMSInternalNHWCDomain),
KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain),
KERNEL_CREATE_INFO_VERSIONED_FP16(8, 9, MaxPool, kMSInternalNHWCDomain),
KERNEL_CREATE_INFO_VERSIONED_FP16(10, 10, MaxPool, kMSInternalNHWCDomain),
KERNEL_CREATE_INFO_VERSIONED_FP16(11, 11, MaxPool, kMSInternalNHWCDomain),
KERNEL_CREATE_INFO_FP16(12, MaxPool, kMSInternalNHWCDomain),

KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain),

13 changes: 13 additions & 0 deletions onnxruntime/core/providers/xnnpack/xnnpack_init.h
@@ -46,6 +46,19 @@ namespace xnnpack {
#define XNN_ALLOCATION_ALIGNMENT 16
#endif

#if (!defined(_MSC_VER)) || (_MSC_VER >= 1930)
#if (defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC)) && !XNN_ARCH_X86_64
#if !defined(__APPLE__)
// fp16 had to be temporarily disabled on Apple ARM64: compiling the fp16
// source files requires a hardware-specific compilation flag, and when
// building a universal binary for Apple that flag breaks the x64 target.
// Referenced from MLAS.
#define XNNPACK_FP16_SUPPORTED
#endif  // !__APPLE__
#endif  // ARM64
#endif  // MSVC before Visual Studio 2022 (_MSC_VER < 1930) lacks fp16 intrinsics

std::pair<AllocatorPtr&, xnn_allocator*> GetStoredAllocator();

} // namespace xnnpack
2 changes: 1 addition & 1 deletion onnxruntime/test/providers/cpu/nn/pool_fp16_op_test.cc
@@ -3,7 +3,7 @@

#include "core/mlas/inc/mlas.h"

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || defined(COREML_ENABLE_MLPROGRAM) || defined(USE_XNNPACK)

#include "core/providers/cpu/nn/pool.h"
#include "gtest/gtest.h"