Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MaxPool FP16 in XnnPack EP #22258

Merged
merged 28 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions onnxruntime/core/providers/xnnpack/nn/max_pool.cc
mszhanyi marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#include "core/graph/graph.h"
#include "core/providers/utils.h"
#include "core/providers/xnnpack/xnnpack_init.h"
#include "core/framework/tensorprotoutils.h"

// to sanity check output shape
Expand Down Expand Up @@ -54,6 +55,9 @@
// input of maxpool could be fp16/fp32/fp64,i8/u8 according to ONNX
if (x_type == nullptr ||
(x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT &&
#ifdef XNNPACK_FP16_SUPPORTED
mszhanyi marked this conversation as resolved.
Show resolved Hide resolved
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 &&
#endif
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 &&
x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) {
break;
Expand Down Expand Up @@ -193,9 +197,19 @@
stride_height, stride_width,
dilation_height, dilation_width,
output_min, output_max, flags, &p);
} else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
maxpool_type_ = OpComputeType::op_compute_type_fp16;
const float output_min = -65504.0;
const float output_max = 65504.0;
status = xnn_create_max_pooling2d_nhwc_f16(input_padding_top, input_padding_right,
input_padding_bottom, input_padding_left,
pooling_height, pooling_width,
stride_height, stride_width,
dilation_height, dilation_width,
output_min, output_max, flags, &p);
} else {
auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto()));
ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8, but got ", stype);
ORT_THROW("unsupported Conv in maxpool, we have FLOAT|UINT8|FP16, but got ", stype);
mszhanyi marked this conversation as resolved.
Show resolved Hide resolved
}
ORT_ENFORCE(status == xnn_status_success, "xnn_create_max_pooling2d_nhwc_",
OpTypeToString(maxpool_type_), "failed. Status:", status);
Expand Down Expand Up @@ -225,10 +239,12 @@
pthreadpool_t threadpool = GetThreadPool();

auto reshape_fn = xnn_reshape_max_pooling2d_nhwc_f32;
if (maxpool_type_ == OpComputeType::op_compute_type_qu8)
if (maxpool_type_ == OpComputeType::op_compute_type_qu8) {
reshape_fn = xnn_reshape_max_pooling2d_nhwc_u8;
else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
} else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
reshape_fn = xnn_reshape_max_pooling2d_nhwc_s8;
} else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) {
reshape_fn = xnn_reshape_max_pooling2d_nhwc_f16;
}

auto status = reshape_fn(op0_.get(), N, H, W,
Expand All @@ -244,8 +260,10 @@
status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), X.Data<float>(), Y->MutableData<float>());
} else if (maxpool_type_ == OpComputeType::op_compute_type_qu8) {
status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), X.Data<uint8_t>(), Y->MutableData<uint8_t>());
} else {
} else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) {
status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), X.Data<int8_t>(), Y->MutableData<int8_t>());
} else if (maxpool_type_ == OpComputeType::op_compute_type_fp16) {
status = xnn_setup_max_pooling2d_nhwc_f16(op0_.get(), X.Data<MLFloat16>(), Y->MutableData<MLFloat16>());
}

if (status != xnn_status_success) {
Expand Down Expand Up @@ -285,5 +303,24 @@
DataTypeImpl::GetTensorType<uint8_t>(),
DataTypeImpl::GetTensorType<int8_t>()}),
MaxPool);

#ifdef XNNPACK_FP16_SUPPORTED
// fp16 (MLFloat16) MaxPool kernel registrations, mirroring the fp32/int8
// registrations above for the same opset ranges (8-9, 10, 11, 12+).
// Arguments are wrapped so every line stays within the 120-character
// cpplint limit flagged on the original lines.
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 8, 9, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 10, 10, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
    MaxPool, kMSInternalNHWCDomain, 11, 11, MLFloat16, kXnnpackExecutionProvider,
    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
    MaxPool);

ONNX_OPERATOR_TYPED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, MLFloat16, kXnnpackExecutionProvider,
                              KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
                              MaxPool);
#endif

} // namespace xnnpack
} // namespace onnxruntime
66 changes: 59 additions & 7 deletions onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
#include "core/providers/xnnpack/detail/node_support_checker.h"
#include "core/providers/xnnpack/xnnpack_init.h"

namespace {
// Pairs the shared kernel registry with the Status produced while filling it,
// so a single function-local static can cache both the registry and whether
// registration succeeded (consumed by XnnpackExecutionProvider::GetKernelRegistry).
struct KernelRegistryAndStatus {
  // Registry is eagerly allocated; kernels are registered into it afterwards.
  std::shared_ptr<onnxruntime::KernelRegistry> kernel_registry = std::make_shared<onnxruntime::KernelRegistry>();
  // Result of the registration pass; defaults to OK.
  onnxruntime::Status st;
};
}  // namespace

namespace onnxruntime {

namespace xnnpack {
Expand All @@ -31,6 +38,10 @@
BuildKernelCreateInfo< \
ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)>

// Builds a KernelCreateInfo entry for a *typed*, versioned kernel
// (e.g. the MLFloat16 MaxPool kernels); typed counterpart of
// KERNEL_CREATE_INFO_VERSIONED above.
#define KERNEL_CREATE_INFO_VERSIONED_TYPED(Start, End, Type, Op, Domain) \
  BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, \
                                                                        Type, Op)>

#define KERNEL_CREATE_INFO(Start, Op, Domain) \
BuildKernelCreateInfo< \
ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)>
Expand Down Expand Up @@ -68,6 +79,12 @@
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool);
#ifdef XNNPACK_FP16_SUPPORTED
// Forward declarations for the fp16 (MLFloat16) MaxPool kernels registered in
// max_pool.cc; only present when the build supports fp16 (see xnnpack_init.h).
// Arguments are wrapped to satisfy the 120-character cpplint limit flagged on
// the original lines.
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9,
                                                      MLFloat16, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10,
                                                      MLFloat16, MaxPool);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11,
                                                      MLFloat16, MaxPool);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool);
#endif

// ONNX operators
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm);
Expand All @@ -86,9 +103,29 @@
// Internal domain
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax);

std::unique_ptr<KernelRegistry> RegisterKernels() {
auto kernel_registry = std::make_unique<onnxruntime::KernelRegistry>();
#ifdef XNNPACK_FP16_SUPPORTED
// Registers the fp16 (MLFloat16) kernels into `kernel_registry`.
// Kept separate from RegisterKernels so the fp16 table only exists when
// XNNPACK_FP16_SUPPORTED is defined.
// Returns the first registration failure, or Status::OK().
Status RegisterFp16Kernels(KernelRegistry& kernel_registry) {
  static const BuildKernelCreateInfoFn function_table[] = {
      BuildKernelCreateInfo<void>,  // default entry to avoid the list becoming empty after ops-reducing

      KERNEL_CREATE_INFO_VERSIONED_TYPED(8, 9, MLFloat16, MaxPool, kMSInternalNHWCDomain),
      KERNEL_CREATE_INFO_VERSIONED_TYPED(10, 10, MLFloat16, MaxPool, kMSInternalNHWCDomain),
      KERNEL_CREATE_INFO_VERSIONED_TYPED(11, 11, MLFloat16, MaxPool, kMSInternalNHWCDomain),
      // NOTE(review): KERNEL_CREATE_INFO_TYPED is not defined in the visible
      // part of this file (only the VERSIONED variant is) — confirm it exists.
      KERNEL_CREATE_INFO_TYPED(12, MLFloat16, MaxPool, kMSInternalNHWCDomain),
  };

  for (auto& function_table_entry : function_table) {
    KernelCreateInfo info = function_table_entry();
    if (info.kernel_def != nullptr) {  // filter disabled entries where type is void
      ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(info)));
    }
  }

  return Status::OK();
}
#endif

Status RegisterKernels(KernelRegistry& kernel_registry) {
static const BuildKernelCreateInfoFn function_table[] = {
BuildKernelCreateInfo<void>, // default entry to avoid the list becoming empty after ops-reducing

Expand Down Expand Up @@ -143,11 +180,11 @@
for (auto& function_table_entry : function_table) {
KernelCreateInfo info = function_table_entry();
if (info.kernel_def != nullptr) { // filter disabled entries where type is void
ORT_THROW_IF_ERROR(kernel_registry->Register(std::move(info)));
ORT_THROW_IF_ERROR(kernel_registry.Register(std::move(info)));
}
}

return kernel_registry;
return Status::OK();
}

} // namespace xnnpack
Expand Down Expand Up @@ -259,7 +296,6 @@
const onnxruntime::GraphViewer& graph,
const IKernelLookup& /*kernel_lookup*/) const {
std::vector<std::unique_ptr<ComputeCapability>> capabilities;

std::shared_ptr<KernelRegistry> registry = GetKernelRegistry();
std::unordered_map<const Node*, const NodeUnit*> supported_node_unit_map;
NodeSupportChecker checker{graph, supported_node_unit_map};
Expand Down Expand Up @@ -348,9 +384,25 @@
return capabilities;
}

// Populates `kernel_registry` with every kernel this EP provides: the base
// table always, plus the fp16 table when the build supports fp16.
// Returns the first registration failure, or Status::OK().
Status RegisterXnnPackKernels(KernelRegistry& kernel_registry) {
  Status status = RegisterKernels(kernel_registry);
#ifdef XNNPACK_FP16_SUPPORTED
  if (status.IsOK()) {
    status = RegisterFp16Kernels(kernel_registry);
  }
#endif
  return status;
}

// Builds the kernel registry and captures the registration status alongside
// it, so the caller can surface a failure instead of silently using a
// partially-populated registry.
KernelRegistryAndStatus GetXnnPackKernelRegistry() {
  KernelRegistryAndStatus result;
  result.st = RegisterXnnPackKernels(*result.kernel_registry);
  return result;
}

std::shared_ptr<KernelRegistry> XnnpackExecutionProvider::GetKernelRegistry() const {
  // Kernel registration runs exactly once; the registry and its registration
  // status are cached together in a function-local static.
  static KernelRegistryAndStatus registry_and_status = GetXnnPackKernelRegistry();
  // Propagate any registration failure to the caller as an exception.
  ORT_THROW_IF_ERROR(registry_and_status.st);
  return registry_and_status.kernel_registry;
}

XnnpackExecutionProvider::~XnnpackExecutionProvider() {
Expand Down
13 changes: 13 additions & 0 deletions onnxruntime/core/providers/xnnpack/xnnpack_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,19 @@ namespace xnnpack {
#define XNN_ALLOCATION_ALIGNMENT 16
#endif

// fp16 is only enabled with MSVC 2022 (>= 1930) or non-MSVC compilers, and
// only on ARM64 targets.
#if (!defined(_MSC_VER)) || (_MSC_VER >= 1930)
#if defined(_M_ARM64) || defined(__aarch64__) || defined(_M_ARM64EC)
#if !defined(__APPLE__)
// fp16 is temporarily disabled on Apple ARM64: compiling the fp16 source
// files requires a hardware-specific compilation flag, and when building a
// universal binary for Apple that flag would break the x64 target.
// Same approach as MLAS.
#define XNNPACK_FP16_SUPPORTED
#endif  // !defined(__APPLE__)
#endif  // ARM64
#endif  // Visual Studio 16 or earlier does not support fp16 intrinsic

std::pair<AllocatorPtr&, xnn_allocator*> GetStoredAllocator();

} // namespace xnnpack
Expand Down
Loading