From 0d547a2dc3fd755101509189b552a675f59a59c7 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Fri, 29 Nov 2024 04:04:15 -0800
Subject: [PATCH 01/14] more performance flags

---
 .../coreml/coreml_provider_factory.h          |  3 +
 .../builders/impl/reduction_op_builder.cc     |  3 +-
 .../builders/impl/squeeze_op_builder.cc       | 15 ++--
 .../coreml/builders/model_builder.cc          |  6 +-
 .../providers/coreml/builders/model_builder.h |  5 +-
 .../core/providers/coreml/coreml_options.cc   | 15 +++-
 .../core/providers/coreml/coreml_options.h    |  6 ++
 .../core/providers/coreml/model/model.h       |  3 +-
 .../core/providers/coreml/model/model.mm      | 82 ++++++++++++++++---
 .../test/perftest/command_args_parser.cc      |  3 +
 onnxruntime/test/perftest/ort_test_session.cc | 11 ++-
 11 files changed, 121 insertions(+), 31 deletions(-)

diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index 3963b80de58a4..ee449d3edd4dd 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -49,6 +49,9 @@ static const char* const kCoremlProviderOption_MLComputeUnits = "MLComputeUnits"
 static const char* const kCoremlProviderOption_ModelFormat = "ModelFormat";
 static const char* const kCoremlProviderOption_RequireStaticInputShapes = "RequireStaticInputShapes";
 static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubgraphs";
+static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
+static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
+static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
index f161b309a2425..65fdfb7a16a0c 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
@@ -134,8 +134,7 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInpu
   }
 
 #if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
-  // to pass https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1563483&view=logs&j=f7cc61a9-cc70-56e7-b06c-4668ca17e426
-  // ReductionOpTest.ReduceSum_half_bert
+  // skip ReductionOpTest.ReduceSum_half_bert because reduce_sum will output all zeros
   int32_t input_type;
   GetType(*input_defs[0], input_type, logger);
   if (node.OpType() == "ReduceSum" && input_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16) {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index c8df7c1a43f65..9b6ad2904b579 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -11,7 +11,6 @@
 #include "core/providers/coreml/shape_utils.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/optimizer/initializer.h"
-#include "core/providers/cpu/tensor/unsqueeze.h"
 
 namespace onnxruntime {
 namespace coreml {
@@ -68,18 +67,14 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   if (model_builder.CreateMLProgram()) {
    using namespace CoreML::Specification::MILSpec;
-    std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "reshape";
+    std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "expand_dims";
     std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
     AddOperationInput(*op, "x", input_defs[0]->Name());
 
-    if (coreml_op_type == "squeeze") {
-      if (!axes.empty()) {
-        // coreml squeeze op does support negative axes
-        AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
-      }
-    } else {
-      TensorShapeVector output_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(input_shape), axes);
-      AddOperationInput(*op, "shape", model_builder.AddConstant(op->type(), "shape", AsSpan(output_shape)));
+    // it's impossible to have empty axes input for unsqueeze
+    if (!axes.empty()) {
+      // coreml squeeze op does support negative axes
+      AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
     }
     AddOperationOutput(*op, *node.OutputDefs()[0]);
     model_builder.AddOperation(std::move(op));
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.cc b/onnxruntime/core/providers/coreml/builders/model_builder.cc
index 2a02c1f4124f6..6486942199df7 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.cc
@@ -408,7 +408,7 @@ ModelBuilder::ModelBuilder(const GraphViewer& graph_viewer, const logging::Logge
     : graph_viewer_(graph_viewer),
       logger_(logger),
       coreml_version_(coreml_version),
-      coreml_compute_unit_(coreml_options.ComputeUnits()),
+      coreml_options_(coreml_options),
       create_ml_program_(coreml_options.CreateMLProgram()),
       model_output_path_(GetModelOutputPath(create_ml_program_)),
       onnx_input_names_(std::move(onnx_input_names)),
@@ -989,7 +989,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr<Model>& model) {
                                    get_sanitized_io_info(std::move(input_output_info_)),
                                    std::move(scalar_outputs_),
                                    std::move(int64_outputs_),
-                                   logger_, coreml_compute_unit_);
+                                   logger_, coreml_options_);
   } else
 #endif
   {
@@ -999,7 +999,7 @@ Status ModelBuilder::LoadModel(std::unique_ptr<Model>& model) {
                                    std::move(input_output_info_),
                                    std::move(scalar_outputs_),
                                    std::move(int64_outputs_),
-                                   logger_, coreml_compute_unit_);
+                                   logger_, coreml_options_);
   }
 
   return model->LoadModel();  // load using CoreML API, including compilation
diff --git a/onnxruntime/core/providers/coreml/builders/model_builder.h b/onnxruntime/core/providers/coreml/builders/model_builder.h
index af47869f7e1c3..e19597cf0dc2e 100644
--- a/onnxruntime/core/providers/coreml/builders/model_builder.h
+++ b/onnxruntime/core/providers/coreml/builders/model_builder.h
@@ -7,6 +7,7 @@
 #include "core/graph/graph_viewer.h"
 #include "core/providers/coreml/builders/coreml_spec.h"
 #include "core/providers/coreml/model/model.h"
+#include "core/providers/coreml/coreml_options.h"
 
 #if defined(COREML_ENABLE_MLPROGRAM)
 // coremltools classes
@@ -22,8 +23,6 @@ class StorageWriter;
 #endif
 
 namespace onnxruntime {
-class CoreMLOptions;
-
 namespace coreml {
 
 class IOpBuilder;
@@ -218,7 +217,7 @@ class ModelBuilder {
   const GraphViewer& graph_viewer_;
   const logging::Logger& logger_;
   const int32_t coreml_version_;
-  const uint32_t coreml_compute_unit_;
+  CoreMLOptions coreml_options_;
   const bool create_ml_program_;         // ML Program (CoreML5, iOS 15+, macOS 12+) or NeuralNetwork (old)
   const std::string model_output_path_;  // create_ml_program_ ? dir for mlpackage : filename for mlmodel
diff --git a/onnxruntime/core/providers/coreml/coreml_options.cc b/onnxruntime/core/providers/coreml/coreml_options.cc
index df78f74383871..4ec780208e528 100644
--- a/onnxruntime/core/providers/coreml/coreml_options.cc
+++ b/onnxruntime/core/providers/coreml/coreml_options.cc
@@ -63,11 +63,14 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
       {"MLProgram", COREML_FLAG_CREATE_MLPROGRAM},
       {"NeuralNetwork", COREML_FLAG_USE_NONE},
   };
-  std::unordered_set<std::string_view> valid_options = {
+  const std::unordered_set<std::string_view> valid_options = {
       kCoremlProviderOption_MLComputeUnits,
       kCoremlProviderOption_ModelFormat,
       kCoremlProviderOption_RequireStaticInputShapes,
       kCoremlProviderOption_EnableOnSubgraphs,
+      kCoremlProviderOption_SpecializationStrategy,
+      kCoremlProviderOption_ProfileComputePlan,
+      kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU,
   };
   // Validate the options
   for (const auto& option : options) {
@@ -90,6 +93,16 @@ void CoreMLOptions::ValidateAndParseProviderOption(const ProviderOptions& option
       require_static_shape_ = option.second == "1";
     } else if (kCoremlProviderOption_EnableOnSubgraphs == option.first) {
       enable_on_subgraph_ = option.second == "1";
+    } else if (kCoremlProviderOption_SpecializationStrategy == option.first) {
+      if (option.second != "Default" && option.second != "FastPrediction") {
+        ORT_THROW("Invalid value for option ", option.first, ": ", option.second,
+                  ". Valid values are Default and FastPrediction.");
+      }
+      strategy_ = option.second;
+    } else if (kCoremlProviderOption_ProfileComputePlan == option.first) {
+      profile_compute_plan_ = option.second == "1";
+    } else if (kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU == option.first) {
+      allow_low_precision_accumulation_on_gpu_ = option.second == "1";
     }
   }
 }
diff --git a/onnxruntime/core/providers/coreml/coreml_options.h b/onnxruntime/core/providers/coreml/coreml_options.h
index 8bb748fcd69c9..fd05c96927bd1 100644
--- a/onnxruntime/core/providers/coreml/coreml_options.h
+++ b/onnxruntime/core/providers/coreml/coreml_options.h
@@ -14,6 +14,9 @@ class CoreMLOptions {
   bool create_mlprogram_{false};
   bool enable_on_subgraph_{false};
   uint32_t compute_units_{0};
+  std::string strategy_;
+  bool profile_compute_plan_{false};
+  bool allow_low_precision_accumulation_on_gpu_{false};
 
  public:
   explicit CoreMLOptions(uint32_t coreml_flags);
@@ -25,6 +28,9 @@ class CoreMLOptions {
   bool CreateMLProgram() const { return create_mlprogram_; }
   bool EnableOnSubgraph() const { return enable_on_subgraph_; }
   uint32_t ComputeUnits(uint32_t specific_flag = 0xffffffff) const { return compute_units_ & specific_flag; }
+  bool AllowLowPrecisionAccumulationOnGPU() const { return allow_low_precision_accumulation_on_gpu_; }
+  bool UseStrategy(std::string_view strategy) const { return strategy_ == strategy; }
+  bool ProfileComputePlan() const { return profile_compute_plan_ && create_mlprogram_; }
 
  private:
   void ValidateAndParseProviderOption(const ProviderOptions& options);
diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h
index 68ecbe5fb80c4..84b7d741b4714 100644
--- a/onnxruntime/core/providers/coreml/model/model.h
+++ b/onnxruntime/core/providers/coreml/model/model.h
@@ -18,6 +18,7 @@
 #endif
 
 namespace onnxruntime {
+class CoreMLOptions;
 namespace coreml {
 
 class Execution;
@@ -53,7 +54,7 @@ class Model {
         std::unordered_map<std::string, OnnxTensorInfo>&& input_output_info,
         std::unordered_set<std::string>&& scalar_outputs,
         std::unordered_set<std::string>&& int64_outputs,
-        const logging::Logger& logger, uint32_t coreml_compute_unit);
+        const logging::Logger& logger, const CoreMLOptions& coreml_options);
   ~Model();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Model);
 
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index c8edb64ff55d7..775a48cbd8a4f 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -25,6 +25,7 @@
 #include "core/providers/coreml/model/host_utils.h"
 #include "core/providers/coreml/model/objc_str_utils.h"
 #include "core/providers/coreml/shape_utils.h"
+#include "core/providers/coreml/coreml_options.h"
 
 // force the linker to create a dependency on the CoreML framework so that in MAUI usage we don't need
 // to manually do this
@@ -300,6 +301,48 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
   return Status::OK();
 }
 
+void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
+  [MLComputePlan loadContentsOfURL:compileUrl
+                     configuration:config
+                 completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
+                   if (@available(macOS 14.4, iOS 17.4, *)) {
+                   } else {
+                     NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
+                     return;
+                   }
+
+                   if (!computePlan) {
+                     NSLog(@"Error loading compute plan: %@", error);
+                     // Handle error.
+                     return;
+                   }
+                   MLModelStructureProgram* program = computePlan.modelStructure.program;
+                   if (!program) {
+                     NSLog(@"Error loading program from compute plan: this is not an mlprogram model");
+                     return;
+                   }
+
+                   MLModelStructureProgramFunction* mainFunction = program.functions[@"main"];
+                   if (!mainFunction) {
+                     NSLog(@"Error loading main function from program");
+                     return;
+                   }
+
+                   NSArray<MLModelStructureProgramOperation*>* operations = mainFunction.block.operations;
+                   NSLog(@"Number of operations ('const' nodes included): %lu", operations.count);
+                   for (MLModelStructureProgramOperation* operation in operations) {
+                     // Get the compute device usage for the operation.
+                     MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation];
+                     id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
+                     // Get the estimated cost of executing the operation.
+                     MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
+                     if (![operation.operatorName isEqualToString:@"const"]) {
+                       NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight);
+                     }
+                   }
+                 }];
+}
+
 // Internal Execution class
 // This class is part of the model class and handles the calls into CoreML. Specifically, it performs
 // 1. Compile the model by given path for execution
 // 3. The compiled model will be removed in dealloc or removed using cleanup function
 class Execution {
  public:
-  Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags);
+  Execution(const std::string& path, const logging::Logger& logger, const CoreMLOptions& coreml_options);
   ~Execution();
 
   Status LoadModel();
@@ -320,13 +363,13 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
   NSString* coreml_model_path_{nil};
   NSString* compiled_model_path_{nil};
   const logging::Logger& logger_;
-  uint32_t coreml_compute_unit_{0};
+  CoreMLOptions coreml_options_;
   MLModel* model_{nil};
 };
 
-Execution::Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_compute_unit)
+Execution::Execution(const std::string& path, const logging::Logger& logger, const CoreMLOptions& coreml_options)
     : logger_(logger),
-      coreml_compute_unit_(coreml_compute_unit) {
+      coreml_options_(coreml_options) {
   @autoreleasepool {
     coreml_model_path_ = util::Utf8StringToNSString(path.c_str());
  }
@@ -395,17 +438,36 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
       compiled_model_path_ = [compileUrl path];
 
       MLModelConfiguration* config = [[MLModelConfiguration alloc] init];
-
-      if (coreml_compute_unit_ & COREML_FLAG_USE_CPU_ONLY) {
+      uint32_t coreml_compute_unit = coreml_options_.ComputeUnits();
+      if (coreml_compute_unit & COREML_FLAG_USE_CPU_ONLY) {
         config.computeUnits = MLComputeUnitsCPUOnly;
-      } else if (coreml_compute_unit_ & COREML_FLAG_USE_CPU_AND_GPU) {
+      } else if (coreml_compute_unit & COREML_FLAG_USE_CPU_AND_GPU) {
         config.computeUnits = MLComputeUnitsCPUAndGPU;
-      } else if (coreml_compute_unit_ & COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE) {
+      } else if (coreml_compute_unit & COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE) {
         config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;  // Apple Neural Engine
       } else {
         config.computeUnits = MLComputeUnitsAll;
       }
 
+      if (coreml_options_.AllowLowPrecisionAccumulationOnGPU()) {
+        config.allowLowPrecisionAccumulationOnGPU = YES;
+      }
+
+// Set the specialization strategy to FastPrediction for macOS 15.0+
+#if !(TARGET_OS_OSX && (!defined(__MAC_15_0) || __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_15_0))
+      MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
+      if (coreml_options_.UseStrategy("FastPrediction")) {
+        optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
+        config.optimizationHints = optimizationHints;
+      } else if (coreml_options_.UseStrategy("Default")) {
+        optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
+        config.optimizationHints = optimizationHints;
+      }
+#endif
+      if (coreml_options_.ProfileComputePlan()) {
+        ProfileComputePlan(compileUrl, config);
+      }
+
       model_ = [MLModel modelWithContentsOfURL:compileUrl
                                  configuration:config
                                          error:&error];
 
       if (error != nil || model_ == nil) {
@@ -524,8 +586,8 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
              std::unordered_set<std::string>&& scalar_outputs,
              std::unordered_set<std::string>&& int64_outputs,
              const logging::Logger& logger,
-             uint32_t coreml_flags)
-    : execution_(std::make_unique<Execution>(path, logger, coreml_flags)),
+             const CoreMLOptions& coreml_options)
+    : execution_(std::make_unique<Execution>(path, logger, coreml_options)),
       model_input_names_(std::move(model_input_names)),
       model_output_names_(std::move(model_output_names)),
       input_output_info_(std::move(input_output_info)),
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index e406405464d99..40dc49d9958cf 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -134,6 +134,9 @@ namespace perftest {
       "\t    [CoreML only] [MLComputeUnits]:[CPUAndNeuralEngine CPUAndGPU ALL CPUOnly] Specify to limit the backend device used to run the model.\n"
       "\t    [CoreML only] [AllowStaticInputShapes]:[0 1].\n"
       "\t    [CoreML only] [EnableOnSubgraphs]:[0 1].\n"
+      "\t    [CoreML only] [SpecializationStrategy]:[Default FastPrediction].\n"
+      "\t    [CoreML only] [ProfileComputePlan]:[0 1].\n"
+      "\t    [CoreML only] [AllowLowPrecisionAccumulationOnGPU]:[0 1].\n"
       "\t    [Example] [For CoreML EP] -e coreml -i \"ModelFormat|MLProgram MLComputeUnits|CPUAndGPU\"\n"
       "\n"
       "\t    [SNPE only] [runtime]: SNPE runtime, options: 'CPU', 'GPU', 'GPU_FLOAT16', 'DSP', 'AIP_FIXED_TF'. \n"
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index 02768b8c08e85..bab08007dd67c 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -346,7 +346,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
     static const std::unordered_set<std::string> available_keys = {kCoremlProviderOption_MLComputeUnits,
                                                                    kCoremlProviderOption_ModelFormat,
                                                                    kCoremlProviderOption_RequireStaticInputShapes,
-                                                                   kCoremlProviderOption_EnableOnSubgraphs};
+                                                                   kCoremlProviderOption_EnableOnSubgraphs,
+                                                                   kCoremlProviderOption_SpecializationStrategy,
+                                                                   kCoremlProviderOption_ProfileComputePlan,
+                                                                   kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU};
     ParseSessionConfigs(ov_string, provider_options, available_keys);
 
     std::unordered_map available_options = {
@@ -364,6 +367,12 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
                (provider_option.second == "1" || provider_option.second == "0")) {
     } else if (provider_option.first == kCoremlProviderOption_EnableOnSubgraphs &&
                (provider_option.second == "0" || provider_option.second == "1")) {
+    } else if (provider_option.first == kCoremlProviderOption_SpecializationStrategy &&
+               (provider_option.second == "Default" || provider_option.second == "FastPrediction")) {
+    } else if (provider_option.first == kCoremlProviderOption_ProfileComputePlan &&
+               (provider_option.second == "0" || provider_option.second == "1")) {
+    } else if (provider_option.first == kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU &&
+               (provider_option.second == "0" || provider_option.second == "1")) {
     } else {
       ORT_THROW("Invalid value for option ", provider_option.first, ": ", provider_option.second);
     }

From 364b897222fd9b75287a28f233422c38d6fb5e9c Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Sun, 1 Dec 2024 19:18:05 -0800
Subject: [PATCH 02/14] fix

---
 .../core/providers/coreml/model/model.mm | 74 +++++++++----------
 1 file changed, 36 insertions(+), 38 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 775a48cbd8a4f..61a6f371cc055 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -302,45 +302,43 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
 }
 
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
-  [MLComputePlan loadContentsOfURL:compileUrl
-                     configuration:config
-                 completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
-                   if (@available(macOS 14.4, iOS 17.4, *)) {
-                   } else {
-                     NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
-                     return;
-                   }
-
-                   if (!computePlan) {
-                     NSLog(@"Error loading compute plan: %@", error);
-                     // Handle error.
-                     return;
-                   }
-                   MLModelStructureProgram* program = computePlan.modelStructure.program;
-                   if (!program) {
-                     NSLog(@"Error loading program from compute plan: this is not an mlprogram model");
-                     return;
-                   }
-
-                   MLModelStructureProgramFunction* mainFunction = program.functions[@"main"];
-                   if (!mainFunction) {
-                     NSLog(@"Error loading main function from program");
-                     return;
-                   }
-
-                   NSArray<MLModelStructureProgramOperation*>* operations = mainFunction.block.operations;
-                   NSLog(@"Number of operations ('const' nodes included): %lu", operations.count);
-                   for (MLModelStructureProgramOperation* operation in operations) {
-                     // Get the compute device usage for the operation.
-                     MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation];
-                     id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
-                     // Get the estimated cost of executing the operation.
-                     MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
-                     if (![operation.operatorName isEqualToString:@"const"]) {
-                       NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight);
-                     }
-                   }
-                 }];
+  if (@available(macOS 14.4, iOS 17.4, *)) {
+    [MLComputePlan loadContentsOfURL:compileUrl
+                       configuration:config
+                   completionHandler:^(MLComputePlan* _Nullable computePlan, NSError* _Nullable error) {
+                     if (!computePlan) {
+                       NSLog(@"Error loading compute plan: %@", error);
+                       // Handle error.
+                       return;
+                     }
+                     MLModelStructureProgram* program = computePlan.modelStructure.program;
+                     if (!program) {
+                       NSLog(@"Error loading program from compute plan: this is not an mlprogram model");
+                       return;
+                     }
+
+                     MLModelStructureProgramFunction* mainFunction = program.functions[@"main"];
+                     if (!mainFunction) {
+                       NSLog(@"Error loading main function from program");
+                       return;
+                     }
+
+                     NSArray<MLModelStructureProgramOperation*>* operations = mainFunction.block.operations;
+                     NSLog(@"Number of operations ('const' nodes included): %lu", operations.count);
+                     for (MLModelStructureProgramOperation* operation in operations) {
+                       // Get the compute device usage for the operation.
+                       MLComputePlanDeviceUsage* computeDeviceUsage = [computePlan computeDeviceUsageForMLProgramOperation:operation];
+                       id<MLComputeDeviceProtocol> preferredDevice = computeDeviceUsage.preferredComputeDevice;
+                       // Get the estimated cost of executing the operation.
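+                       // (per Apple's MLComputePlanCost documentation, weight is the operation's relative cost within the full model execution)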
+                       MLComputePlanCost* estimatedCost = [computePlan estimatedCostOfMLProgramOperation:operation];
+                       if (![operation.operatorName isEqualToString:@"const"]) {
+                         NSLog(@"Operation: %@, Device Usage: %@, Estimated Cost: %f", operation.operatorName, preferredDevice, estimatedCost.weight);
+                       }
+                     }
+                   }];
+  } else {
+    NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
+  }
 }
 
 // Internal Execution class

From bf095dde43859d79d1fa0971d29149a57e913607 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Sun, 1 Dec 2024 20:12:16 -0800
Subject: [PATCH 03/14] MLComputePlan

---
 .../core/providers/coreml/model/model.mm      | 20 +++++++++++--------
 .../core/providers/coreml/model/model_stub.cc |  3 ++-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 61a6f371cc055..3ca1a6311e7e4 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -302,6 +302,7 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
 }
 
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
+#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15
   if (@available(macOS 14.4, iOS 17.4, *)) {
     [MLComputePlan loadContentsOfURL:compileUrl
                        configuration:config
@@ -339,6 +340,7 @@ void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
   } else {
     NSLog(@"iOS 17.4+/macOS 14.4+ or later is required to use the compute plan API");
   }
+#endif
 }
 
@@ -452,14 +454,16 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
       }
 
 // Set the specialization strategy to FastPrediction for macOS 15.0+
-#if !(TARGET_OS_OSX && (!defined(__MAC_15_0) || __MAC_OS_X_VERSION_MAX_ALLOWED < __MAC_15_0))
-      MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
-      if (coreml_options_.UseStrategy("FastPrediction")) {
-        optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
-        config.optimizationHints = optimizationHints;
-      } else if (coreml_options_.UseStrategy("Default")) {
-        optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
-        config.optimizationHints = optimizationHints;
+#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15
+      if (HAS_COREML8_OR_LATER) {
+        MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
+        if (coreml_options_.UseStrategy("FastPrediction")) {
+          optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction;
+          config.optimizationHints = optimizationHints;
+        } else if (coreml_options_.UseStrategy("Default")) {
+          optimizationHints.specializationStrategy = MLSpecializationStrategyDefault;
+          config.optimizationHints = optimizationHints;
+        }
       }
 #endif
diff --git a/onnxruntime/core/providers/coreml/model/model_stub.cc b/onnxruntime/core/providers/coreml/model/model_stub.cc
index c6f2e7401ea1e..e9036e2fc7e1a 100644
--- a/onnxruntime/core/providers/coreml/model/model_stub.cc
+++ b/onnxruntime/core/providers/coreml/model/model_stub.cc
@@ -4,6 +4,7 @@
 #include "core/providers/coreml/model/model.h"
 
 namespace onnxruntime {
+class CoreMLOptions;
 namespace coreml {
 
 class Execution {};
@@ -15,7 +16,7 @@ Model::Model(const std::string& /*path*/,
              std::unordered_set<std::string>&& scalar_outputs,
              std::unordered_set<std::string>&& int64_outputs,
              const logging::Logger& /*logger*/,
-             uint32_t /*coreml_flags*/)
+             const CoreMLOptions& /*coreml_flags*/)
     : execution_(std::make_unique<Execution>()),
       model_input_names_(std::move(model_input_names)),
       model_output_names_(std::move(model_output_names)),

From 5249880ceca8776af54809d4f10d25aa1f11f681 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Sun, 1 Dec 2024 23:29:38 -0800
Subject: [PATCH 04/14] debug

---
 .../coreml/builders/impl/squeeze_op_builder.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index 9b6ad2904b579..9abbff07d3600 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -57,13 +57,11 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                                const Node& node,
                                                [[maybe_unused]] const logging::Logger& logger) const {
   std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
-  const auto& input_defs(node.InputDefs());
   auto* coreml_squeeze = layer->mutable_squeeze();
   TensorShapeVector axes;
   GetAxes(model_builder, node, axes);
-  std::vector<int64_t> input_shape;
-  GetShape(*input_defs[0], input_shape, logger);
 #if defined(COREML_ENABLE_MLPROGRAM)
+  const auto& input_defs(node.InputDefs());
   if (model_builder.CreateMLProgram()) {
     using namespace CoreML::Specification::MILSpec;
 
@@ -76,6 +74,13 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
       // coreml squeeze op does support negative axes
       AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
     }
+    std::vector<int64_t> input_shape;
+    std::cout << "========================\n";
+    GetShape(*node.OutputDefs()[0], input_shape, logger);
+    for (size_t i = 0; i < input_shape.size(); ++i) {
+      std::cout << input_shape[i] << " ";
+    }
+    std::cout << std::endl;
     AddOperationOutput(*op, *node.OutputDefs()[0]);
     model_builder.AddOperation(std::move(op));
   } else  // NOLINT

From 74953ca8f23321efcfd622b3c2907808b6598ff3 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Mon, 2 Dec 2024 01:24:44 -0800
Subject: [PATCH 05/14] Revert "debug"

This reverts commit 5249880ceca8776af54809d4f10d25aa1f11f681.
---
 .../coreml/builders/impl/squeeze_op_builder.cc | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index 9abbff07d3600..9b6ad2904b579 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -57,11 +57,13 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                                const Node& node,
                                                [[maybe_unused]] const logging::Logger& logger) const {
   std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
+  const auto& input_defs(node.InputDefs());
   auto* coreml_squeeze = layer->mutable_squeeze();
   TensorShapeVector axes;
   GetAxes(model_builder, node, axes);
+  std::vector<int64_t> input_shape;
+  GetShape(*input_defs[0], input_shape, logger);
 #if defined(COREML_ENABLE_MLPROGRAM)
-  const auto& input_defs(node.InputDefs());
   if (model_builder.CreateMLProgram()) {
     using namespace CoreML::Specification::MILSpec;
 
@@ -74,13 +76,6 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
       // coreml squeeze op does support negative axes
       AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
     }
-    std::vector<int64_t> input_shape;
-    std::cout << "========================\n";
-    GetShape(*node.OutputDefs()[0], input_shape, logger);
-    for (size_t i = 0; i < input_shape.size(); ++i) {
-      std::cout << input_shape[i] << " ";
-    }
-    std::cout << std::endl;
     AddOperationOutput(*op, *node.OutputDefs()[0]);
     model_builder.AddOperation(std::move(op));
   } else  // NOLINT

From 37e77a5edf2074db908b1b6405608d9eb7bdded1 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Mon, 2 Dec 2024 02:40:39 -0800
Subject: [PATCH 06/14] handle x86_64 cpu bug

---
 .../builders/impl/batch_norm_op_builder.cc    |  2 +-
 .../builders/impl/reduction_op_builder.cc     |  2 +-
 .../builders/impl/squeeze_op_builder.cc       | 32 +++++++++++++++++--
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
index cc68fa6ec399a..442194cb31cbc 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/batch_norm_op_builder.cc
@@ -151,7 +151,7 @@ bool BatchNormalizationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBu
     return false;
   }
 
-#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
+#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64) && TARGET_OS_IOS && TARGET_CPU_X86_64
   // To Pass IOS pipeline https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=134&_a=summary
   auto input_dtype = input_defs[0]->TypeAsProto()->tensor_type().elem_type();
   if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 && input_params.coreml_version < 7) {
diff --git a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
index 65fdfb7a16a0c..d533b867bd454 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/reduction_op_builder.cc
@@ -133,7 +133,7 @@ bool ReductionOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInpu
     return false;
   }
 
-#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64)
+#if defined(TARGET_OS_IOS) && defined(TARGET_CPU_X86_64) && TARGET_OS_IOS && TARGET_CPU_X86_64
   // skip ReductionOpTest.ReduceSum_half_bert because reduce_sum will output all zeros
   int32_t input_type;
   GetType(*input_defs[0], input_type, logger);
diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index 9b6ad2904b579..6bc649828f26d 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -11,6 +11,11 @@
 #include "core/providers/coreml/shape_utils.h"
 #include "core/providers/shared/utils/utils.h"
 #include "core/optimizer/initializer.h"
+#include "core/providers/cpu/tensor/unsqueeze.h"
+
+#ifdef __APPLE__
+#include <TargetConditionals.h>
+#endif
 
 namespace onnxruntime {
 namespace coreml {
@@ -53,6 +58,24 @@ void SqueezeOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const
   }
 }
 
+#if defined(COREML_ENABLE_MLPROGRAM)
+void HandleX86ArchUnsqueezeScalarInput(ModelBuilder& model_builder,
+                                       const Node& node, const logging::Logger& logger) {
+  const auto& input_defs(node.InputDefs());
+  TensorShapeVector axes;
+  GetAxes(model_builder, node, axes);
+
+  std::vector<int64_t> input_shape;
+  GetShape(*input_defs[0], input_shape, logger);
+  auto op = model_builder.CreateOperation(node, "reshape");
+  AddOperationInput(*op, "x", input_defs[0]->Name());
+  TensorShapeVector output_shape = UnsqueezeBase::ComputeOutputShape(TensorShape(input_shape), axes);
+  AddOperationInput(*op, "shape", model_builder.AddConstant(op->type(), "shape", AsSpan(output_shape)));
+  AddOperationOutput(*op, *node.OutputDefs()[0]);
+  model_builder.AddOperation(std::move(op));
+}
+#endif
+
 Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                                const Node& node,
                                                [[maybe_unused]] const logging::Logger& logger) const {
@@ -61,12 +84,17 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   auto* coreml_squeeze = layer->mutable_squeeze();
   TensorShapeVector axes;
   GetAxes(model_builder, node, axes);
-  std::vector<int64_t> input_shape;
-  GetShape(*input_defs[0], input_shape, logger);
 #if defined(COREML_ENABLE_MLPROGRAM)
   if (model_builder.CreateMLProgram()) {
     using namespace CoreML::Specification::MILSpec;
+#if defined(TARGET_CPU_X86_64) && TARGET_CPU_X86_64
+    // expand_dims has limited requirements for static shape; however, X86_64 has a bug where it can't handle scalar input
+    if (node.OpType() == "Unsqueeze" && input_defs[0]->Shape()->dim_size() < 2) {
+      HandleX86ArchUnsqueezeScalarInput(model_builder, node, logger);
+      return Status::OK();
+    }
+#endif
     std::string_view coreml_op_type = node.OpType() == "Squeeze" ? "squeeze" : "expand_dims";
"squeeze" : "expand_dims"; std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); AddOperationInput(*op, "x", input_defs[0]->Name()); From d57d62fb99732c76ed6c39d7e2460765109c0776 Mon Sep 17 00:00:00 2001 From: wejoncy Date: Mon, 2 Dec 2024 19:41:56 +0800 Subject: [PATCH 07/14] Update squeeze_op_builder.cc --- .../core/providers/coreml/builders/impl/squeeze_op_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc index 6bc649828f26d..b50c7e3a4d508 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc @@ -80,11 +80,11 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, [[maybe_unused]] const logging::Logger& logger) const { std::unique_ptr layer = model_builder.CreateNNLayer(node); - const auto& input_defs(node.InputDefs()); auto* coreml_squeeze = layer->mutable_squeeze(); TensorShapeVector axes; GetAxes(model_builder, node, axes); #if defined(COREML_ENABLE_MLPROGRAM) + const auto& input_defs(node.InputDefs()); if (model_builder.CreateMLProgram()) { using namespace CoreML::Specification::MILSpec; From e64175005feb99532f0aad816bfbb39eb4fae42b Mon Sep 17 00:00:00 2001 From: jicwen Date: Wed, 4 Dec 2024 15:13:13 +0800 Subject: [PATCH 08/14] add comments for new flag --- .../core/providers/coreml/coreml_provider_factory.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h index ee449d3edd4dd..08e7596c3beaf 100644 --- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h +++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h @@ -47,10 +47,16 @@ enum COREMLFlags { // and SessionOptionsAppendExecutionProvider (C API). For the old API, use COREMLFlags instead. static const char* const kCoremlProviderOption_MLComputeUnits = "MLComputeUnits"; static const char* const kCoremlProviderOption_ModelFormat = "ModelFormat"; +// same as COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES static const char* const kCoremlProviderOption_RequireStaticInputShapes = "RequireStaticInputShapes"; static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubgraphs"; +// provided by https://developer.apple.com/documentation/coreml/mloptimizationhints-swift.struct/specializationstrategy-swift.property +// Core ML segments the model’s compute graph and specializes each segment for the target compute device. +// This process can affect the model loading time and the prediction latency. +// Use this option to tailor the specialization strategy for your model. 
 static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
 static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
+// please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
 static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
 
 #ifdef __cplusplus
 extern "C" {

From 123a4f0a71c4d70058cbe29bae91d5873e1c34f1 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Thu, 5 Dec 2024 11:55:01 +0800
Subject: [PATCH 09/14] bypass static analyzer

---
 onnxruntime/core/providers/coreml/model/model.mm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 3ca1a6311e7e4..b33c265ba2293 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -302,7 +302,7 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
 }
 
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
-#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15
+#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
   if (@available(macOS 14.4, iOS 17.4, *)) {
     [MLComputePlan loadContentsOfURL:compileUrl
                        configuration:config
@@ -454,7 +454,7 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
       }
 
 // Set the specialization strategy to FastPrediction for macOS 15.0+
-#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15
+#if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
       if (HAS_COREML8_OR_LATER) {
         MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];
         if (coreml_options_.UseStrategy("FastPrediction")) {

From 17d2c4d453f80bc7cc888ed56a6450c991717d56 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Fri, 6 Dec 2024 02:53:53 -0800
Subject: [PATCH 10/14] add more comments for flags

---
 .../core/providers/coreml/coreml_provider_factory.h           | 2 ++
 .../core/providers/coreml/builders/impl/squeeze_op_builder.cc | 3 +--
 onnxruntime/core/providers/coreml/model/model.mm              | 4 ++++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index 08e7596c3beaf..8b88f930753cf 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -55,6 +55,8 @@ static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubg
 // This process can affect the model loading time and the prediction latency.
 // Use this option to tailor the specialization strategy for your model.
 // e.g. set the provider option {"SpecializationStrategy", "FastPrediction"}.
 static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
+// this flag is used to profile coreml computeplan after model compile.
+// The profile result includes what hardwares of each ops dispatched to and the estimate excution time.
 static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
 // please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
 static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";
diff --git a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
index b50c7e3a4d508..a1b3a18265c70 100644
--- a/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/coreml/builders/impl/squeeze_op_builder.cc
@@ -99,9 +99,8 @@ Status SqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
     std::unique_ptr<Operation> op = model_builder.CreateOperation(node, coreml_op_type);
     AddOperationInput(*op, "x", input_defs[0]->Name());
 
-    // it's impossible to have empty axes input for unsqueeze
     if (!axes.empty()) {
-      // coreml squeeze op does support negative axes
+      // coreml supports negative axes
       AddOperationInput(*op, "axes", model_builder.AddConstant(op->type(), "axes", AsSpan(axes)));
     }
     AddOperationOutput(*op, *node.OutputDefs()[0]);
diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index b33c265ba2293..4edd693d78805 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -301,6 +301,8 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
   return Status::OK();
 }
 
+// since __clang_major__ >= 15, MLComputePlan is introduced in <CoreML/CoreML.h>
+// we check __clang_analyzer__ here to bypass static analysis
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
 #if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
   if (@available(macOS 14.4, iOS 17.4, *)) {
@@ -454,6 +456,8 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
       }
 
 // Set the specialization strategy to FastPrediction for macOS 15.0+
+// since __clang_major__ >= 15, optimizationHints is introduced in <CoreML/CoreML.h>
+// we check __clang_analyzer__ here to bypass static analysis
 #if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
       if (HAS_COREML8_OR_LATER) {

From 2bc621f2c0b2f180f97f10a6f855a65bdc1448b9 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Mon, 9 Dec 2024 12:13:35 +0800
Subject: [PATCH 11/14] Add comments for clang version checks

---
 onnxruntime/core/providers/coreml/model/model.mm | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 4edd693d78805..14340a4ac8c36 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -302,6 +302,8 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
 }
 
 // since __clang_major__ >= 15, MLComputePlan is introduced in <CoreML/CoreML.h>
+// We actually ensure the macOS/iOS version and Xcode version are greater than `macOS 14.4, iOS 17.4`.
+// Otherwise, the compiler will complain that `MLComputePlan` is not defined.
 // we check __clang_analyzer__ here to bypass static analysis
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {
 #if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
   if (@available(macOS 14.4, iOS 17.4, *)) {
@@ -457,6 +459,7 @@ Status Predict(const std::unordered_map<std::string, OnnxTensorData>& inputs,
 
 // Set the specialization strategy to FastPrediction for macOS 15.0+
 // since __clang_major__ >= 15, optimizationHints is introduced in <CoreML/CoreML.h>
+// Same as above comments for why we are checking __clang_major__.
 // we check __clang_analyzer__ here to bypass static analysis
 #if defined(__APPLE__) && defined(__clang__) && __clang_major__ >= 15 && !defined(__clang_analyzer__)
       if (HAS_COREML8_OR_LATER) {
         MLOptimizationHints* optimizationHints = [[MLOptimizationHints alloc] init];

From cf3d3899720384657ed3a3b071d6a538071d9a6a Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Mon, 9 Dec 2024 12:14:59 +0800
Subject: [PATCH 12/14] Add comments for clang version checks

---
 onnxruntime/core/providers/coreml/model/model.mm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 14340a4ac8c36..30161dd122388 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -303,6 +303,8 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
 
 // since __clang_major__ >= 15, MLComputePlan is introduced in <CoreML/CoreML.h>
 // We actually ensure the macOS/iOS version and Xcode version are greater than `macOS 14.4, iOS 17.4`.
+// The macro API_AVAILABLE should also be fine.
+
 // Otherwise, the compiler will complain that `MLComputePlan` is not defined.
 // we check __clang_analyzer__ here to bypass static analysis
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {

From 063341ce6f6f802ce231e26da1d30f27ba67d2d1 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Mon, 9 Dec 2024 12:15:37 +0800
Subject: [PATCH 13/14] Update
 include/onnxruntime/core/providers/coreml/coreml_provider_factory.h

Co-authored-by: Scott McKay

---
 .../core/providers/coreml/coreml_provider_factory.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
index 8b88f930753cf..d035fd34bd072 100644
--- a/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
+++ b/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h
@@ -55,8 +55,9 @@ static const char* const kCoremlProviderOption_EnableOnSubgraphs = "EnableOnSubg
 // This process can affect the model loading time and the prediction latency.
 // Use this option to tailor the specialization strategy for your model.
 // e.g. set the provider option {"SpecializationStrategy", "FastPrediction"}.
 static const char* const kCoremlProviderOption_SpecializationStrategy = "SpecializationStrategy";
-// this flag is used to profile coreml computeplan after model compile.
-// The profile result includes what hardwares of each ops dispatched to and the estimate excution time.
+// Profile the Core ML MLComputePlan.
+// This logs the hardware each operator is dispatched to and the estimated execution time.
+// Intended for developer usage but provides useful diagnostic information if performance is not as expected.
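+// e.g. set the provider option {"ProfileComputePlan", "1"}; the results are written via NSLog.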
 static const char* const kCoremlProviderOption_ProfileComputePlan = "ProfileComputePlan";
 // please refer to https://developer.apple.com/documentation/coreml/mlmodelconfiguration/allowlowprecisionaccumulationongpu
 static const char* const kCoremlProviderOption_AllowLowPrecisionAccumulationOnGPU = "AllowLowPrecisionAccumulationOnGPU";

From 49625fcd96514633d872e1bd7e458496102fa3f6 Mon Sep 17 00:00:00 2001
From: wejoncy
Date: Sun, 8 Dec 2024 20:38:27 -0800
Subject: [PATCH 14/14] format

---
 onnxruntime/core/providers/coreml/model/model.mm | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm
index 30161dd122388..755dbfbd6e68c 100644
--- a/onnxruntime/core/providers/coreml/model/model.mm
+++ b/onnxruntime/core/providers/coreml/model/model.mm
@@ -304,7 +304,6 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array,
 // since __clang_major__ >= 15, MLComputePlan is introduced in <CoreML/CoreML.h>
 // We actually ensure the macOS/iOS version and Xcode version are greater than `macOS 14.4, iOS 17.4`.
 // The macro API_AVAILABLE should also be fine.
-
 // Otherwise, the compiler will complain that `MLComputePlan` is not defined.
 // we check __clang_analyzer__ here to bypass static analysis
 void ProfileComputePlan(NSURL* compileUrl, MLModelConfiguration* config) {