Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MaxPool FP16 in XnnPack EP #22258

Merged
merged 28 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions onnxruntime/core/providers/xnnpack/nn/max_pool.cc
mszhanyi marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "core/graph/graph.h"
#include "core/providers/utils.h"
#include "core/providers/xnnpack/xnnpack_init.h"
#include "core/framework/tensorprotoutils.h"

// to sanity check output shape
Expand Down Expand Up @@ -54,6 +55,9 @@
// input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX
if (x_type == nullptr ||
(x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
#ifdef XNNPACK_FP16_SUPPORTED
mszhanyi marked this conversation as resolved.
Show resolved Hide resolved
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
#endif
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 &&
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) {
break;
Expand Down Expand Up @@ -193,9 +197,19 @@
stride_height, stride_width,
dilation_height, dilation_width,
output_min, output_max, flags, &p);
} else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
maxpool_type_ = OpComputeType::op_compute_type_fp16;
const float output_min = -65504.0;
const float output_max = 65504.0;
status = xnn_create_max_pooling2d_nhwc_f16(input_padding_top, input_padding_right,
input_padding_bottom, input_padding_left,
pooling_height, pooling_width,
stride_height, stride_width,
dilation_height, dilation_width,
output_min, output_max, flags, &p);
} else {
auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto()));
ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8, but got ", stype);
ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8|FP16, but got ", stype);
mszhanyi marked this conversation as resolved.
Show resolved Hide resolved
}
ORT_ENFORCE(status == xnn_status_success, "xnn_create_max_pooling2d_nhwc_",
OpTypeToString(maxpool_type_), "failed. Status:", status);
Expand Down Expand Up @@ -225,10 +239,12 @@
pthreadpool_t threadpool = GetThreadPool();

auto reshape_fn = xnn_reshape_max_pooling2d_nhwc_f32;
if (maxpool_type_ == OpComputeType::op_compute_type_qu8)
if (maxpool_type_ == OpComputeType::op_compute_type_qu8) {
reshape_fn = xnn_reshape_max_pooling2d_nhwc_u8;
else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
} else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
reshape_fn = xnn_reshape_max_pooling2d_nhwc_s8;
} else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) {
reshape_fn = xnn_reshape_max_pooling2d_nhwc_f16;
}

auto status = reshape_fn(op0_.get(), N, H, W,
Expand All @@ -244,8 +260,10 @@
status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), X.Data<float>(), Y->MutableData<float>());
} else if (maxpool_type_ == OpComputeType::op_compute_type_qu8) {
status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), X.Data<uint8_t>(), Y->MutableData<uint8_t>());
} else {
} else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), X.Data<int8_t>(), Y->MutableData<int8_t>());
} else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) {
status = xnn_setup_max_pooling2d_nhwc_f16(op0_.get(), X.Data<MLFloat16>(), Y->MutableData<MLFloat16>());
}

if (status != xnn_status_success) {
Expand Down Expand Up @@ -285,5 +303,24 @@
DataTypeImpl::GetTensorType<uint8_t>(),
DataTypeImpl::GetTensorType<int8_t>()}),
MaxPool);

#ifdef XNNPACK_FP16_SUPPORTED
// fp16 (MLFloat16) MaxPool kernel registrations, mirroring the fp32/int8
// registrations above for the same opset ranges (8-9, 10, 11, 12+).
// Arguments are wrapped so every line stays within the 120-character
// cpplint limit flagged on the original lines.
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 8, 9, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 10, 10, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 11, 11, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, MLFloat16, kXnnpackExecutionProvider,
                              KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
                              MaxPool);
#endif

} // namespace xnnpack
} // namespace onnxruntime
66 changes: 59 additions & 7 deletions onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
#include "core/providers/xnnpack/detail/node_support_checker.h"
#include "core/providers/xnnpack/xnnpack_init.h"

namespace {
// Pairs the shared kernel registry with the Status produced while filling it,
// so a single function-local static can cache both the registry and whether
// registration succeeded (consumed by XnnpackExecutionProvider::GetKernelRegistry).
struct KernelRegistryAndStatus {
  // Registry is eagerly allocated; kernels are registered into it afterwards.
  std::shared_ptr<onnxruntime::KernelRegistry> kernel_registry = std::make_shared<onnxruntime::KernelRegistry>();
  // Result of the registration pass; defaults to OK.
  onnxruntime::Status st;
};
}  // namespace

namespace onnxruntime {

namespace xnnpack {
Expand All @@ -31,6 +38,10 @@
BuildKernelCreateInfo< \
ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)>

// Builds a KernelCreateInfo entry for a *typed*, versioned kernel
// (e.g. the MLFloat16 MaxPool kernels); typed counterpart of
// KERNEL_CREATE_INFO_VERSIONED above.
#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \
  BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, \
                                                                        Type, Op)>

#define KERNEL_CREATE_INFO(Start, Op, Domain) \
BuildKernelCreateInfo< \
ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)>
Expand Down Expand Up @@ -68,6 +79,12 @@
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool);
#ifdef XNNPACK_FP16_SUPPORTED
// Forward declarations for the fp16 (MLFloat16) MaxPool kernels registered in
// max_pool.cc; only present when the build supports fp16 (see xnnpack_init.h).
// Arguments are wrapped to satisfy the 120-character cpplint limit flagged on
// the original lines.
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9,
                                                      MLFloat16, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10,
                                                      MLFloat16, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11,
                                                      MLFloat16, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool);
#endif

// ONNX operators
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm);
Expand All @@ -86,9 +103,29 @@
// Internal domain
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax);

std::unique_ptr<KernelRegistry> RegisterKernels() {
auto kernel_registry = std::make_unique<onnxruntime::KernelRegistry>();
#ifdef XNNPACK_FP16_SUPPORTED
// Registers the fp16 (MLFloat16) kernels into `kernel_registry`.
// Kept separate from RegisterKernels so the fp16 table only exists when
// XNNPACK_FP16_SUPPORTED is defined.
// Returns the first registration failure, or Status::OK().
Status RegisterFp16Kernels(KernelRegistry& kernel_registry) {
  static const BuildKernelCreateInfoFn function_table[] = {
      BuildKernelCreateInfo<void>,  // default entry to avoid the list becoming empty after ops-reducing

      KERNEL_CREATE_INFO_VERSIONED_TYPED(8, 9, MLFloat16, MaxPool, kMSInternalNHWCDomain),
      KERNEL_CREATE_INFO_VERSIONED_TYPED(10, 10, MLFloat16, MaxPool, kMSInternalNHWCDomain),
      KERNEL_CREATE_INFO_VERSIONED_TYPED(11, 11, MLFloat16, MaxPool, kMSInternalNHWCDomain),
      // NOTE(review): KERNEL_CREATE_INFO_TYPED is not defined in the visible
      // part of this file (only the VERSIONED variant is) — confirm it exists.
      KERNEL_CREATE_INFO_TYPED(12, MLFloat16, MaxPool, kMSInternalNHWCDomain),
  };

  for (auto& function_table_entry : function_table) {
    KernelCreateInfo info = function_table_entry();
    if (info.kernel_def != nullptr) {  // filter disabled entries where type is void
      ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(info)));
    }
  }

  return Status::OK();
}
#endif

Status RegisterKernels(KernelRegistry& kernel_registry) {
static const BuildKernelCreateInfoFn function_table[] = {
BuildKernelCreateInfo<void>, // default entry to avoid the list becoming empty after ops-reducing

Expand Down Expand Up @@ -143,11 +180,11 @@
for (auto& function_table_entry : function_table) {
KernelCreateInfo info = function_table_entry();
if (info.kernel_def != nullptr) { // filter disabled entries where type is void
ORT_THROW_IF_ERROR(kernel_registry->Register(std::move(info)));
ORT_THROW_IF_ERROR(kernel_registry.Register(std::move(info)));
}
}

return kernel_registry;
return Status::OK();
}

} // namespace xnnpack
Expand Down Expand Up @@ -259,7 +296,6 @@
const onnxruntime::GraphViewer& graph,
const IKernelLookup& /*kernel_lookup*/) const {
std::vector<std::unique_ptr<ComputeCapability>> capabilities;

std::shared_ptr<KernelRegistry> registry = GetKernelRegistry();
std::unordered_map<const Node*, const NodeUnit*> supported_node_unit_map;
NodeSupportChecker checker{graph, supported_node_unit_map};
Expand Down Expand Up @@ -348,9 +384,25 @@
return capabilities;
}

// Populates `kernel_registry` with every kernel this EP provides: the base
// table always, plus the fp16 table when the build supports fp16.
// Returns the first registration failure, or Status::OK().
Status RegisterXnnPackKernels(KernelRegistry& kernel_registry) {
  Status status = RegisterKernels(kernel_registry);
#ifdef XNNPACK_FP16_SUPPORTED
  if (status.IsOK()) {
    status = RegisterFp16Kernels(kernel_registry);
  }
#endif
  return status;
}

// Builds the kernel registry and captures the registration status alongside
// it, so the caller can surface a failure instead of silently using a
// partially-populated registry.
KernelRegistryAndStatus GetXnnPackKernelRegistry() {
  KernelRegistryAndStatus result;
  result.st = RegisterXnnPackKernels(*result.kernel_registry);
  return result;
}

std::shared_ptr<KernelRegistry> XnnpackExecutionProvider::GetKernelRegistry() const {
  // Kernel registration runs exactly once; the registry and its registration
  // status are cached together in a function-local static.
  static KernelRegistryAndStatus registry_and_status = GetXnnPackKernelRegistry();
  // Propagate any registration failure to the caller as an exception.
  ORT_THROW_IF_ERROR(registry_and_status.st);
  return registry_and_status.kernel_registry;
}

XnnpackExecutionProvider::~XnnpackExecutionProvider() {
Expand Down
13 changes: 13 additions & 0 deletions onnxruntime/core/providers/xnnpack/xnnpack_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,19 @@ namespace xnnpack {
#define XNN_ALLOCATION_ALIGNMENT 16
#endif

// fp16 is only enabled with MSVC 2022 (>= 1930) or non-MSVC compilers, and
// only on ARM64 targets.
#if (!defined(_MSC_VER)) || (_MSC_VER >= 1930)
#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC)
#if !defined(__APPLE__)
// fp16 is temporarily disabled on Apple ARM64: compiling the fp16 source
// files requires a hardware-specific compilation flag, and when building a
// universal binary for Apple that flag would break the x64 target.
// Same approach as MLAS.
#define XNNPACK_FP16_SUPPORTED
#endif  // !defined(__APPLE__)
#endif  // ARM64
#endif  // Visual Studio 16 or earlier does not support fp16 intrinsic

std::pair<AllocatorPtr&, xnn_allocator*> GetStoredAllocator();

} // namespace xnnpack
Expand Down
Loading