From 7f0aac0d8a5cfc8fa08c3f51b9f8002ec6eed4ba Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Sat, 6 Jan 2024 00:15:50 +0800 Subject: [PATCH 01/13] Revert "[WebNN EP] Rename op logicalNot to not" (#18997) Reverts microsoft/onnxruntime#18936 The WebNN spec is discussing using the `logicalNot` name at https://github.com/webmachinelearning/webnn/issues/496, and the Chromium implementation has suspended the renaming change. For consistency, we should keep using `logicalNot` in the WebNN EP until it is finalized. --- onnxruntime/core/providers/webnn/builders/helper.h | 2 +- .../core/providers/webnn/builders/impl/unary_op_builder.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 28857d3002ede..8b8b85339a87c 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -179,7 +179,7 @@ static const InlinedHashMap op_map = { {"Min", {"min", true}}, {"Mul", {"mul", true}}, {"Neg", {"neg", true}}, - {"Not", {"not", false}}, + {"Not", {"logicalNot", false}}, {"Pad", {"pad", true}}, {"Pow", {"pow", false}}, {"PRelu", {"prelu", true}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc index 129532e91f5a0..e6c5cf24080cd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc @@ -48,7 +48,7 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const } else if (op_type == "Neg") { output = model_builder.GetBuilder().call("neg", input); } else if (op_type == "Not") { - output = model_builder.GetBuilder().call("not", input); + output = model_builder.GetBuilder().call("logicalNot", input); } else if (op_type == "Reciprocal") { output = model_builder.GetBuilder().call("reciprocal", input); } else if (op_type == "Sin") { From 447a3a7c706495fdc0f8dae8b1a130ef73af18e1 Mon Sep 17 00:00:00 2001 From: Jiajie Hu Date: Sat, 6 Jan 2024 00:16:15 +0800 Subject: [PATCH 02/13] [js/webgpu] Fix Expand/Gather when input type is bool (#18999) ### Description Also update the op test suite. ### Motivation and Context Previously the *total* size in case `Expand - last dim is not divisible by 4` was a multiple of 4, even though the *last dimension* was not, so the bug had never been caught. --- js/web/lib/wasm/jsep/webgpu/ops/expand.ts | 2 +- js/web/lib/wasm/jsep/webgpu/ops/gather.ts | 2 +- js/web/test/data/ops/expand.jsonc | 29 +++++++++++++++++++---- js/web/test/data/ops/gather.jsonc | 22 +++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts index 3dc4e957e0fee..035d89755c7d7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts @@ -47,7 +47,7 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => const outputShape: number[] = calculateOutputShape(inputShape, shape); const dataType = inputs[0].dataType; const components = dataType === DataType.bool ?
4 : 1; - const outputSize = ShapeUtil.size(outputShape) / components; + const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); const enableInputShapeUniform = enableShapesUniforms(inputShape.length); const enableOutputShapeUniform = enableShapesUniforms(outputShape.length); diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts index 53ca094abfd62..469249f92ff28 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts @@ -31,7 +31,7 @@ const createGatherProgramInfo = (inputs: readonly TensorView[], attributes: Gath const axisDimLimit = inputShape[axis]; const components = inputs[0].dataType === DataType.bool ? 4 : 1; - const outputSize = ShapeUtil.size(outputShape) / components; + const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); const enableInputShapesUniforms = enableShapesUniforms(inputs[0].dims.length); const inputShapeOrRank = enableInputShapesUniforms ? inputs[0].dims.length : inputs[0].dims; diff --git a/js/web/test/data/ops/expand.jsonc b/js/web/test/data/ops/expand.jsonc index 22bc04d558d98..613b4507b2b15 100644 --- a/js/web/test/data/ops/expand.jsonc +++ b/js/web/test/data/ops/expand.jsonc @@ -168,20 +168,39 @@ "name": "Expand - last dim is not divisible by 4", "inputs": [ { - "data": [true, false, false, true, true, true, false, false, false, true, true, true], - "dims": [2, 6], + "data": [true, false, false, true, true, true], + "dims": [1, 6], "type": "bool" }, { - "data": [2, 1], + "data": [3, 1], "dims": [2], "type": "int64" } ], "outputs": [ { - "data": [true, false, false, true, true, true, false, false, false, true, true, true], - "dims": [2, 6], + "data": [ + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true, + true, + false, + false, + true, + true, + true + ], + "dims": [3, 6], "type": "bool" } ] diff --git a/js/web/test/data/ops/gather.jsonc b/js/web/test/data/ops/gather.jsonc index 0be077d237b88..d218d120d356d 100644 --- a/js/web/test/data/ops/gather.jsonc +++ b/js/web/test/data/ops/gather.jsonc @@ -99,6 +99,28 @@ "operator": "Gather", "attributes": [], "cases": [ + { + "name": "data[4] indices[]", + "inputs": [ + { + "data": [false, true, false, false], + "dims": [4], + "type": "bool" + }, + { + "data": [1], + "dims": [], + "type": "int32" + } + ], + "outputs": [ + { + "data": [true], + "dims": [], + "type": "bool" + } + ] + }, { "name": "data[2,4] indices[1]", "inputs": [ From efdcefcf8cea1b724dd7694a8acb62fa14268e83 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Sat, 6 Jan 2024 02:05:34 +0800 Subject: [PATCH 03/13] [ROCm] fix security warning (#19017) fix security warning --- .../github/linux/docker/migraphx-ci-pipeline-env.Dockerfile | 3 --- tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile | 3 --- 2 files changed, 6 deletions(-) diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 85d738d2167e1..6c71631368822 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ -65,9 +65,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86 conda update --all && \ rm ~/miniconda.sh && conda clean -ya -# Conda base patch -RUN pip install cryptography==41.0.4 - # Create migraphx-ci environment ENV 
CONDA_ENVIRONMENT_PATH /opt/miniconda/envs/migraphx-ci ENV CONDA_DEFAULT_ENV migraphx-ci diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 29048b79d4b81..4db9df80ed187 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -67,9 +67,6 @@ ENV CONDA_DEFAULT_ENV rocm-ci RUN conda create -y -n ${CONDA_DEFAULT_ENV} python=3.9 ENV PATH ${CONDA_ENVIRONMENT_PATH}/bin:${PATH} -# Conda base patch -RUN pip install cryptography==41.0.4 - # Enable rocm-ci environment SHELL ["conda", "run", "-n", "rocm-ci", "/bin/bash", "-c"] From 4190c29d2260bb8f274049d7eab2a634fae95a21 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Fri, 5 Jan 2024 14:51:07 -0800 Subject: [PATCH 04/13] Add MatMulNBits accuracy_level parameter to quantization utilities. (#19015) Allow MatMulNBits `accuracy_level` attribute (added in #17669) to be set to a particular value when the model is quantized. --- .../quantization/matmul_4bits_quantizer.py | 37 ++++++++++++++++--- .../models/llama/convert_to_onnx.py | 37 ++++++++++++++----- .../transformers/models/llama/llama_inputs.py | 4 +- .../transformers/models/llama/llama_parity.py | 7 ++-- 4 files changed, 64 insertions(+), 21 deletions(-) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 9f90196e301e5..6293bcbbf95bd 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -4,10 +4,11 @@ # license information. # -------------------------------------------------------------------------- +from __future__ import annotations + import argparse import logging import os -from typing import List, Tuple import numpy as np import numpy.typing as npt @@ -26,16 +27,24 @@ class MatMul4BitsQuantizer: """Perform 4b quantization of constant MatMul weights""" - def __init__(self, model: ModelProto, block_size: int, is_symmetric: bool, nodes_to_exclude=None): + def __init__( + self, + model: ModelProto, + block_size: int, + is_symmetric: bool, + accuracy_level: int | None = None, + nodes_to_exclude: list[str] | None = None, + ): if nodes_to_exclude is None: nodes_to_exclude = [] self.model = ONNXModel(model) self.block_size = block_size self.is_symmetric = is_symmetric + self.accuracy_level = accuracy_level self.nodes_to_exclude = set(nodes_to_exclude) @staticmethod - def __get_initializer(name, graph_path: List[GraphProto]) -> Tuple[TensorProto, GraphProto]: + def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: for gid in range(len(graph_path) - 1, -1, -1): graph = graph_path[gid] for tensor in graph.initializer: @@ -66,7 +75,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: return (packed, scales, zero_point) - def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) -> NodeProto: + def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" if node.op_type != "MatMul": @@ -113,6 +122,8 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) kwargs["N"] = cols kwargs["bits"] = 4 kwargs["block_size"] = self.block_size + if self.accuracy_level is not 
None: + kwargs["accuracy_level"] = self.accuracy_level matmul_q4_node = onnx.helper.make_node( "MatMulNBits", @@ -127,7 +138,7 @@ def _q4_matmul_node_weight(self, node: NodeProto, graph_stack: List[GraphProto]) return matmul_q4_node - def _process_subgraph(self, graph_stack: List[GraphProto]): + def _process_subgraph(self, graph_stack: list[GraphProto]): new_nodes = [] graph = graph_stack[-1] @@ -201,6 +212,14 @@ def parse_args(): type=bool, help="Indicate whether to quantize the model symmetrically", ) + parser.add_argument( + "--accuracy_level", + required=False, + type=int, + help="Accuracy level of the 4-bit quantized MatMul computation. " + "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details " + "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).", + ) parser.add_argument("-v", "--verbose", required=False, action="store_true") parser.set_defaults(verbose=False) parser.add_argument( @@ -228,6 +247,12 @@ def parse_args(): raise Exception(f"file {output_model_path} already exists") model = onnx.load(input_model_path) - quant = MatMul4BitsQuantizer(model, args.block_size, args.symmetric, nodes_to_exclude=args.nodes_to_exclude) + quant = MatMul4BitsQuantizer( + model=model, + block_size=args.block_size, + is_symmetric=args.symmetric, + accuracy_level=args.accuracy_level, + nodes_to_exclude=args.nodes_to_exclude, + ) quant.process() quant.model.save_model_to_file(output_model_path, True) diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index e694b5050cc8c..bc09b52574a27 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -1,9 +1,10 @@ +from __future__ import annotations + import argparse import logging import os import shutil from itertools import chain -from typing import List import onnx import torch @@ -21,11 +22,12 @@ from onnxruntime import quantization as ort_quantization from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer +torch_export_onnx_opset_version = 14 logger = logging.getLogger("") init_dist() -def get_model_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_model_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in input_names: @@ -42,7 +44,7 @@ def get_model_dynamic_axes(input_names: List[str], output_names: List[str]): return dynamic_axes -def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_model_with_past_kv_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in {"input_ids", "position_ids"}: @@ -65,7 +67,7 @@ def get_model_with_past_kv_dynamic_axes(input_names: List[str], output_names: Li return dynamic_axes -def get_merged_model_dynamic_axes(input_names: List[str], output_names: List[str]): +def get_merged_model_dynamic_axes(input_names: list[str], output_names: list[str]): dynamic_axes = {} for name in input_names + output_names: if name in {"input_ids", "position_ids"}: @@ -229,7 +231,7 @@ def run_torchscript_separate_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@
-288,7 +290,7 @@ def run_torchscript_separate_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -368,7 +370,7 @@ def run_torchscript_merged_export( input_names=input_names, output_names=output_names, dynamic_axes=dynamic_axes, - opset_version=13, + opset_version=torch_export_onnx_opset_version, do_constant_folding=True, verbose=args.verbose, ) @@ -412,7 +414,7 @@ def optimize_export(config: AutoConfig, input_path: str, output_path: str, remov def convert_to_float16( - args: argparse.Namespace, config: AutoConfig, old_paths: List[str], rank: int = 0, world_size: int = 1 + args: argparse.Namespace, config: AutoConfig, old_paths: list[str], rank: int = 0, world_size: int = 1 ): decoder_model_fp16_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp16.onnx") decoder_with_past_model_fp16_path = os.path.join( @@ -635,7 +637,7 @@ def get_args(): help="Run a specific quantization algorithm (blockwise for int4, smooth_quant for int8, quantize_dynamic for int8). Blockwise is recommended. Need to install extra packages in `requirements-quant.txt` for SmoothQuant.", ) - blockwise_group = parser.add_argument_group("4-bit quantization") + blockwise_group = parser.add_argument_group("blockwise (4-bit quantization)") blockwise_group.add_argument( "--block_size", @@ -645,6 +647,15 @@ def get_args(): help="Block size to quantize with. See https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py for details.", ) + blockwise_group.add_argument( + "--int4_accuracy_level", + required=False, + type=int, + help="Accuracy level of the 4-bit quantized MatMul computation. 
" + "Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details " + "(https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits).", + ) + smooth_quant_group = parser.add_argument_group("smooth_quant (8-bit quantization)") smooth_quant_group.add_argument( @@ -937,7 +948,13 @@ def main(): for fp_path, int4_path in zip(old_paths, new_paths): if os.path.exists(fp_path): model = onnx.load_model(fp_path, load_external_data=True) - quant = MatMul4BitsQuantizer(model, args.block_size, is_symmetric=True, nodes_to_exclude=[]) + quant = MatMul4BitsQuantizer( + model=model, + block_size=args.block_size, + is_symmetric=True, + accuracy_level=args.int4_accuracy_level, + nodes_to_exclude=[], + ) quant.process() quant.model.save_model_to_file(int4_path, use_external_data_format=True) del model diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index bae1ae82e8f7e..a329b73259dda 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from __future__ import annotations import numpy as np import torch @@ -235,7 +235,7 @@ def get_past_kv_inputs(config: AutoConfig, batch_size: int, past_seq_len: int, u # Convert list of past_key_values to dict of past_key and past_value -def flatten_past_kv_inputs(past_key_values: List[Tuple[torch.Tensor, torch.Tensor]]): +def flatten_past_kv_inputs(past_key_values: list[tuple[torch.Tensor, torch.Tensor]]): past_kv = {} for i, (past_k, past_v) in enumerate(past_key_values): past_kv[f"past_key_values.{i}.key"] = past_k.detach().cpu().numpy() diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index 418a65325c8f0..25d7519769604 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import argparse import logging import os import time -from typing import List import numpy as np import torch @@ -139,7 +140,7 @@ def verify_parity( return kv_cache_ortvalues -def get_args(argv: List[str]): +def get_args(argv: list[str]): parser = argparse.ArgumentParser() parser.add_argument( @@ -232,7 +233,7 @@ def get_args(argv: List[str]): return args -def main(argv: List[str] = []): # noqa: B006 +def main(argv: list[str] = []): # noqa: B006 args = get_args(argv) setup_logger(args.verbose) logger.info(f"Arguments: {args}") From e8ac97c8d864eb3088cf87732b5fc0a7d7df495f Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Mon, 8 Jan 2024 03:06:45 -0800 Subject: [PATCH 05/13] [ROCm] do not use failed miopen fusion compile (#19012) The FusedConv operator for the ROCm EP could fail to compile the fused operation, in which case it should not attempt to use the failed fusion plan. In addition, the hash for the miopenConvolutionDescriptor_t for newer ROCm versions was failing to use all components of the descriptor.
--- onnxruntime/contrib_ops/rocm/fused_conv.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc index d597e0d57fbcb..63804f79a32fb 100644 --- a/onnxruntime/contrib_ops/rocm/fused_conv.cc +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -76,7 +76,12 @@ struct FNVHash { void HashConvolutionDescriptor(miopenConvolutionDescriptor_t cdesc) { int spatial_dim = 1; #if ROCM_VERSION >= 50500 - miopenGetConvolutionSpatialDim(cdesc, &spatial_dim); + MIOPEN_CALL(miopenGetConvolutionSpatialDim(cdesc, &spatial_dim)); + std::vector pads{spatial_dim}; + std::vector strides{spatial_dim}; + std::vector dilations{spatial_dim}; + miopenConvolutionMode_t mode; + MIOPEN_CALL(miopenGetConvolutionNdDescriptor(cdesc, spatial_dim, &spatial_dim, pads.data(), strides.data(), dilations.data(), &mode)); #else // Previous versions of MIOpen doesn't provide API to probe the dimension of a // miopenConvolutionDescriptor_t, so we have to guess. @@ -100,11 +105,12 @@ struct FNVHash { pads.resize(spatial_dim); strides.resize(spatial_dim); dilations.resize(spatial_dim); +#endif (*this) << spatial_dim; (*this) << pads; (*this) << strides; (*this) << dilations; -#endif + (*this) << mode; } private: @@ -313,6 +319,8 @@ class FusedConv : public onnxruntime::rocm::Conv { auto ret = miopenCompileFusionPlan(handle, fusion->plan); if (miopenStatusSuccess == ret) { fusion->compiled_on.insert(handle); + } else { + return ret; } return miopenStatusSuccess; } From e8ac97c8d864eb3088cf87732b5fc0a7d7df495f Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 8 Jan 2024 17:19:58 +0000 Subject: [PATCH 06/13] Move Windows GPU training job to A10 (#19041) ### Description 1. Update sm to 86 ### Motivation and Context We have more A10 quota than T4, and Nvidia AXX GPUs can be partitioned --- .../ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index fdb9238071c9e..eee38ac04b355 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -59,15 +59,14 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} ORT_EP_NAME: CUDA WITH_CACHE: true - # Some unit tests crash on A10 GPUs. So this job still needs to use T4. - MachinePool: onnxruntime-Win2022-GPU-T4 + MachinePool: onnxruntime-Win2022-GPU-A10 isTraining: true - stage: dml From 52e560144978d453c73198c93043c3f1b8a30d04 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Mon, 8 Jan 2024 12:44:12 -0800 Subject: [PATCH 07/13] [QNN Nuget Pipeline] Build with ML ops and detect ORT version (#19024) ### Description - Removes `--disable_ml_ops` build flag - Automatically detects ORT version from VERSION file via `templates/set-version-number-variables-step.yml`.
We will no longer need to create a commit to update ORT versions. ### Motivation and Context - A new unit test caused failures in the QNN Nuget pipeline because it did not enable ml ops. - Automate ORT version specification --- .../qnn-ep-nuget-packaging-pipeline.yml | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index d9aff36c4ad34..f6fcbd08ff03a 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -9,11 +9,6 @@ parameters: type: string default: qnn-v2.17.0.231124_win -- name: ort_package_version - displayName: OnnxRuntime Nuget package version - type: string - default: 1.15.0 - - name: build_config displayName: Build Configuration type: string @@ -47,7 +42,7 @@ jobs: buildArch: x64 setVcvars: true ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' - commonBuildArgs: '--compile_no_warning_as_error --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' + commonBuildArgs: '--compile_no_warning_as_error --build_dir $(Build.BinariesDirectory)\Windows --skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' steps: - template: templates/set-version-number-variables-step.yml @@ -90,7 +85,7 @@ jobs: displayName: 'Generating nuspec for the native Nuget package x64' inputs: script: | - python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} + python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture x64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Windows\packages --ort_build_path $(Build.BinariesDirectory)\Windows --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} cd $(Build.BinariesDirectory)\Windows\${{ parameters.build_config }}\${{ parameters.build_config }} nuget pack NativeNuget.nuspec mkdir $(Build.ArtifactStagingDirectory)\x64 @@ -130,7 +125,7 @@ jobs: displayName: 'Generate CMake Configuration for arm64' inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--update --arm64 --disable_ml_ops --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests 
--build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' + arguments: '--update --arm64 --build_dir $(Build.BinariesDirectory)\Win_arm64 --skip_submodule_sync --skip_tests --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --use_qnn --qnn_home ${{parameters.qnn_sdk_path_win}}' - task: VSBuild@1 displayName: 'Build onnxruntime arm64' @@ -178,7 +173,7 @@ jobs: displayName: 'Generating nuspec for the native Nuget package arm64' inputs: script: | - python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version ${{ parameters.ort_package_version }} --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} + python "$(Build.SourcesDirectory)\tools\nuget\generate_nuspec_for_native_nuget.py" --package_version $(OnnxRuntimeVersion) --package_name Microsoft.ML.OnnxRuntime.QNN --target_architecture arm64 --build_config ${{ parameters.build_config }} --native_build_path=$(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} --packages_path $(Build.BinariesDirectory)\Win_arm64\packages --ort_build_path $(Build.BinariesDirectory)\Win_arm64 --sources_path $(Build.SourcesDirectory) --commit_id $(OnnxRuntimeGitCommitHash) --is_release_build ${{ parameters.IsReleaseBuild }} --sdk_info ${{ parameters.qnn_sdk_info }} cd $(Build.BinariesDirectory)\Win_arm64\${{ parameters.build_config }}\${{ parameters.build_config }} nuget pack NativeNuget.nuspec mkdir $(Build.ArtifactStagingDirectory)\arm64 From 99a8400e903ab330e3067629d9aa4e23ce82cf12 Mon Sep 17 00:00:00 2001 From: zesongw Date: Tue, 9 Jan 2024 09:16:52 +0800 Subject: [PATCH 08/13] [WebNN EP] Fall back resize nearest mode for WebNN CPU backend (#19039) WebNN CPU backend only supports linear mode. Fall back for this case. --- .../webnn/builders/impl/resize_op_builder.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index ea9fc379ee23f..186d1e7c1035a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + const WebnnDeviceType device_type, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. // We only support Resize opset 11+ here. 
@@ -161,7 +161,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -181,9 +181,18 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode; - return false; + // WebNN CPU backend only supports "linear" mode. + // WebNN GPU backend only supports "linear" and "nearest" modes. + if (device_type == WebnnDeviceType::CPU) { + if (!is_linear_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for CPU backend."; + return false; + } + } else { + if (!is_linear_resize && !is_nearest_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for GPU backend."; + return false; + } } const auto exclude_outside = helper.Get("exclude_outside", 0); From 975a315cd70aac54a6c4ff8b7d4e0a76d25666de Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield <38966965+jeffbloo@users.noreply.github.com> Date: Mon, 8 Jan 2024 17:49:19 -0800 Subject: [PATCH 09/13] Fix x86 build error in GraphDescBuilder.cpp affecting packaging pipeline (#19045) ### Description This addresses a 32 bit build error affecting the packaging pipeline ### Motivation and Context --- .../providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index adb4fd131119f..c6a15e76f4736 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -360,7 +360,7 @@ namespace Dml::GraphDescBuilder // The tensor description's size should be no larger than the constant input unless it was rounded to // the required alignment. 
assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); - size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), tensorDesc->totalTensorSizeInBytes); + size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); auto data = static_cast(constantInput->GetData()); std::vector tensorData(data, data + minimumConstantSize); From a8bb1df331e56e3a65106578b6475d89d17b27c5 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 8 Jan 2024 17:58:38 -0800 Subject: [PATCH 10/13] [js/webgpu] fix heap access > 2GB (#19010) --- onnxruntime/core/providers/js/js_kernel.h | 1 + .../core/providers/js/operators/conv.h | 12 ++--- .../providers/js/operators/conv_transpose.h | 20 ++++---- onnxruntime/core/providers/js/operators/pad.h | 2 +- .../core/providers/js/operators/reduce.h | 46 +++++++++---------- .../core/providers/js/operators/resize.h | 2 +- .../core/providers/js/operators/slice.h | 6 +-- .../core/providers/js/operators/split.h | 2 +- .../core/providers/js/operators/transpose.h | 2 +- 9 files changed, 47 insertions(+), 46 deletions(-) diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 5c2d1f0b881ba..b850bea4bc275 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -67,6 +67,7 @@ namespace js { float value; \ ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \ , ({#attr_name : $1}), static_cast(value)) +#define JSEP_HEAP_PTR(ptr) reinterpret_cast(ptr) // TODO: // class JsMultiProgramKernel : public OpKernel { /* TBD */ }; diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 5c0fbf93a4004..98a530c6b77f6 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -54,13 +54,13 @@ class ConvBase : public JsKernel { static_cast(conv_attrs_.group), static_cast(kernel_shape_0), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } else { JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ "format" : $11 ? "NHWC" : "NCHW", @@ -81,14 +81,14 @@ class ConvBase : public JsKernel { static_cast(kernel_shape_0), static_cast(kernel_shape_1), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(conv_attrs_.strides.size() > 1 ? 
conv_attrs_.strides[1] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } } diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.h b/onnxruntime/core/providers/js/operators/conv_transpose.h index 5d30dc851e00f..353a946e95c21 100644 --- a/onnxruntime/core/providers/js/operators/conv_transpose.h +++ b/onnxruntime/core/providers/js/operators/conv_transpose.h @@ -64,11 +64,11 @@ class ConvTranspose : public JsKernel { static_cast(pads_1), static_cast(strides), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } else { constexpr size_t pads_vec_size = 4; @@ -114,17 +114,17 @@ class ConvTranspose : public JsKernel { "activation" : UTF8ToString($13) }), static_cast(conv_transpose_attrs_.auto_pad), - reinterpret_cast(local_dilations.data()) >> 2, + JSEP_HEAP_PTR(local_dilations.data()) >> 2, static_cast(conv_transpose_attrs_.group), - reinterpret_cast(local_kernel_shape.data()) >> 2, - reinterpret_cast(local_pads.data()) >> 2, - reinterpret_cast(local_strides.data()) >> 2, + JSEP_HEAP_PTR(local_kernel_shape.data()) >> 2, + JSEP_HEAP_PTR(local_pads.data()) >> 2, + JSEP_HEAP_PTR(local_strides.data()) >> 2, static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } } diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h index 19168f40b4722..bf808be949cf8 100644 --- a/onnxruntime/core/providers/js/operators/pad.h +++ b/onnxruntime/core/providers/js/operators/pad.h @@ -26,7 +26,7 @@ class Pad : public JsKernel, public PadBase { static_cast(mode_), static_cast(value_), gsl::narrow_cast(pads.size()), - reinterpret_cast((pads.size() > 0) ? pads.data() : nullptr) >> 2); + JSEP_HEAP_PTR((pads.size() > 0) ? 
pads.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h index a5a4aa834c2ca..95c4f2bec230d 100644 --- a/onnxruntime/core/providers/js/operators/reduce.h +++ b/onnxruntime/core/providers/js/operators/reduce.h @@ -8,29 +8,29 @@ namespace onnxruntime { namespace js { -#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ - template \ - class ReduceKernel : public JsKernel, public ReduceKernelBase { \ - public: \ - using ReduceKernelBase::axes_; \ - using ReduceKernelBase::noop_with_empty_axes_; \ - using ReduceKernelBase::keepdims_; \ - ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ - std::vector axes(axes_.size()); \ - if (axes_.size() > 0) { \ - std::transform(axes_.begin(), axes_.end(), axes.begin(), \ - [](int64_t axis) { return gsl::narrow_cast(axis); }); \ - } \ - JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ - "keepDims" : !!$1, \ - "noopWithEmptyAxes" : !!$2, \ - "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ - }), \ - static_cast(keepdims_), \ - static_cast(noop_with_empty_axes_), \ - gsl::narrow_cast(axes.size()), \ - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ - } \ +#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ + template \ + class ReduceKernel : public JsKernel, public ReduceKernelBase { \ + public: \ + using ReduceKernelBase::axes_; \ + using ReduceKernelBase::noop_with_empty_axes_; \ + using ReduceKernelBase::keepdims_; \ + ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ + std::vector axes(axes_.size()); \ + if (axes_.size() > 0) { \ + std::transform(axes_.begin(), axes_.end(), axes.begin(), \ + [](int64_t axis) { return gsl::narrow_cast(axis); }); \ + } \ + JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ + "keepDims" : !!$1, \ + "noopWithEmptyAxes" : !!$2, \ + "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ + }), \ + static_cast(keepdims_), \ + static_cast(noop_with_empty_axes_), \ + gsl::narrow_cast(axes.size()), \ + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ + } \ }; JSEP_DEFINE_REDUCE_KERNEL(ReduceMax); diff --git a/onnxruntime/core/providers/js/operators/resize.h b/onnxruntime/core/providers/js/operators/resize.h index 65854222ba988..4b1c288ae3015 100644 --- a/onnxruntime/core/providers/js/operators/resize.h +++ b/onnxruntime/core/providers/js/operators/resize.h @@ -34,7 +34,7 @@ class Resize : public JsKernel, public UpsampleBase { }), static_cast(antialias_), gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2, + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2, resize_coordinate_transformation_mode.c_str(), static_cast(cubic_coeff_a_), static_cast(exclude_outside_), diff --git a/onnxruntime/core/providers/js/operators/slice.h b/onnxruntime/core/providers/js/operators/slice.h index 6792997025d65..989adabf029a5 100644 --- a/onnxruntime/core/providers/js/operators/slice.h +++ b/onnxruntime/core/providers/js/operators/slice.h @@ -24,11 +24,11 @@ class Slice : public JsKernel, public SliceBase { "ends" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : [], "axes" : $5 ? Array.from(HEAP32.subarray($6, $6 + $5)) : []}), gsl::narrow_cast(starts.size()), - reinterpret_cast((starts.size() > 0) ? starts.data() : nullptr) >> 2, + JSEP_HEAP_PTR((starts.size() > 0) ? starts.data() : nullptr) >> 2, gsl::narrow_cast(ends.size()), - reinterpret_cast((ends.size() > 0) ? 
ends.data() : nullptr) >> 2, + JSEP_HEAP_PTR((ends.size() > 0) ? ends.data() : nullptr) >> 2, gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/split.h b/onnxruntime/core/providers/js/operators/split.h index cfacc1aa6a363..1c1874e5aa98e 100644 --- a/onnxruntime/core/providers/js/operators/split.h +++ b/onnxruntime/core/providers/js/operators/split.h @@ -53,7 +53,7 @@ class Split : public JsKernel, public SplitBase { static_cast(axis_), static_cast(num_outputs_), gsl::narrow_cast(split_sizes.size()), - reinterpret_cast((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h index 311badbde0d11..dae442b9f5a13 100644 --- a/onnxruntime/core/providers/js/operators/transpose.h +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -27,7 +27,7 @@ class Transpose final : public JsKernel, public TransposeBase { gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), // $2: index to HEAP32 of the first int32 element. calculated from right shift memory // address by 2 - reinterpret_cast(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); + JSEP_HEAP_PTR(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); } }; From 8f024b739439c521cd19fe5a9830d4015de99bd7 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 9 Jan 2024 10:16:25 +0800 Subject: [PATCH 11/13] [js/webgpu] Support uniforms for layer-norm (#18755) --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 +- js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts | 83 ++++++++++--------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 8e1ec782079be..06c3c6c196501 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -17,7 +17,7 @@ import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; +import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; import {pad, parsePadAttributes} from './ops/pad'; @@ -83,7 +83,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], - ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], + ['LayerNormalization', [layerNorm]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9eeecf2c68d..bc446079faf8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -4,12 +4,11 @@ import {DataType} from '../../../wasm-common'; import 
{TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common'; -export interface LayerNormAttributes extends AttributeWithCacheKey { +interface LayerNormAttributes { axis: number; epsilon: number; } @@ -39,7 +38,7 @@ const createLayerNormProgramInfo = Got scale size of ${scaleSize} and bias size of ${biasSize}`); } - const meanInvStdDevDim = []; + const meanInvStdDevDim: number[] = []; for (let i = 0; i < xShape.length; ++i) { if (i < axis) { meanInvStdDevDim.push(xShape[i]); @@ -47,50 +46,57 @@ const createLayerNormProgramInfo = meanInvStdDevDim.push(1); } } - const components = getMaxComponents(normSize); - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const variables = [ - inputVariable('x', inputs[0].dataType, inputs[0].dims, components), - inputVariable('scale', scale.dataType, scale.dims, components), + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: normCount}, {type: 'float32', data: normSize}, + {type: 'uint32', data: Math.floor(normSize / components)}, {type: 'float32', data: attributes.epsilon} ]; if (bias) { - variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + inputDependencies.push('type'); } - variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); - const hasMeanDataOutput = outputCount > 1; const hasInvStdOutput = outputCount > 2; - if (hasMeanDataOutput) { - variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInvStdOutput) { - variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim)); - } - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const normSize: f32 = ${normSize}; - const normSizeVectorized: u32 = ${normSize / components}; - const epsilon: f32 = ${attributes.epsilon}; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const variables = [ + inputVariable('x', inputs[0].dataType, inputs[0].dims, components), + inputVariable('scale', scale.dataType, scale.dims, components), + ]; + if (bias) { + variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + } + variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); + if (hasMeanDataOutput) { + variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim)); + } + if (hasInvStdOutput) { + variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim)); + } - ${shaderHelper.declareVariables(...variables)} + const uniforms: UniformsArrayType = [ + {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'}, + {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - 
${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)} - let offset = global_idx * normSizeVectorized; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} + let offset = global_idx * uniforms.norm_size_vectorized; var meanVector = ${fillVector('f32', components)}; var meanSquareVector = ${fillVector('f32', components)}; - for (var h: u32 = 0u; h < normSizeVectorized; h++) { + for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; meanVector += value; meanSquareVector += value * value; } - let mean = ${sumVector('meanVector', components)} / normSize; - let meanSquare = sqrt(${sumVector('meanSquareVector', components)} - / normSize - mean * mean + epsilon); + let mean = ${sumVector('meanVector', components)} / uniforms.norm_size; + let meanSquare = sqrt(${sumVector('meanSquareVector', components)} + / uniforms.norm_size - mean * mean + uniforms.epsilon); - for (var j: u32 = 0; j < normSizeVectorized; j++) { + for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale @@ -98,9 +104,10 @@ const createLayerNormProgramInfo = ); } - ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''}; + ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; + ${hasInvStdOutput ? 'inv_std_output[global_idx] = 1 / meanSquare' : ''}; }`; + }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; if (hasMeanDataOutput) { outputs.push({dims: meanInvStdDevDim, dataType: DataType.float}); @@ -111,15 +118,13 @@ const createLayerNormProgramInfo = return { name: 'LayerNormalization', - shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`}, - getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}), + shaderCache: {hint: `${components};${outputCount}`, inputDependencies}, + getRunData: () => + ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}), getShaderSource, }; }; -export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes => - createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon}); - export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => { validateInputs(context.inputs); context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount)); From 68c29ece23821b1d2b73ac55c2a4266c72865219 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 8 Jan 2024 19:46:33 -0800 Subject: [PATCH 12/13] In a Linux or Android build check if the compiler supports bfloat16 and float16 (#18813) ### Description Restrict clang version because we have an upcoming change that requires clang version >= 16, which will mainly affect the Android build.
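As a rough illustration (not part of the patch), the snippet below is the kind of translation unit that only builds when the toolchain accepts the probed architecture flags; the function names are made up for the example, and the intrinsics come from the ACLE `arm_neon.h` header on an AArch64 compiler:

```cpp
// Illustrative AArch64-only code; compile with something like
//   clang++ --target=aarch64-linux-gnu -march=armv8.2-a+fp16+bf16 -c probe.cc
// A toolchain that lacks these extensions rejects the -march flags themselves,
// which is what the new check_cxx_compiler_flag probes detect at configure time.
#include <arm_neon.h>

// Needs +fp16: native IEEE half-precision arithmetic on AArch64.
float16_t half_sum(float16x8_t v) {
  return vgetq_lane_f16(v, 0) + vgetq_lane_f16(v, 1);
}

// Needs +bf16: BFDOT accumulates pairs of bf16 products into f32 lanes.
float bf16_dot(bfloat16x8_t a, bfloat16x8_t b) {
  float32x4_t acc = vdupq_n_f32(0.0f);
  acc = vbfdotq_f32(acc, a, b);
  return vgetq_lane_f32(acc, 0);
}
```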
--- cmake/CMakeLists.txt | 27 +++++++++++++++++-------- cmake/adjust_global_compile_flags.cmake | 25 +++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 34355fb0fd936..0f57258dca706 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -354,13 +354,7 @@ if (onnxruntime_USE_ROCM) endif() endif() -if (APPLE) - if (NOT CMAKE_OSX_ARCHITECTURES) - message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}") - endif() -elseif (NOT WIN32 AND NOT APPLE) - message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}") -endif() + # Single output director for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") @@ -493,6 +487,14 @@ endif() include(adjust_global_compile_flags.cmake) +if (APPLE) + if (NOT CMAKE_OSX_ARCHITECTURES) + message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH") + endif() +elseif (NOT WIN32 AND NOT APPLE) + message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH") +endif() + # We need to link with libatomic on systems that do not have built-in atomics, or # don't have built-in support for 8 byte atomics # Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt @@ -639,7 +641,16 @@ else() check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE) check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST) check_function_exists(reallocarray HAS_REALLOCARRAY) - + if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64") + check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16) + if(NOT HAS_ARM64_BFLOAT16) + message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!") + endif() + check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16) + if(NOT HAS_ARM64_FLOAT16) + message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!") + endif() + endif() if (HAS_TAUTOLOGICAL_POINTER_COMPARE) #we may have extra null pointer checkings in debug build, it's not an issue list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index e825bfeaea952..9f00c873715f4 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -300,6 +300,31 @@ if (MSVC) endif() else() if (NOT APPLE) + #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker + #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report + # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given compiling this code may need more than 2GB memory, we do not + # support compiling for ARM32 natively(only support cross-compiling), we will ignore this issue for now. + if(NOT CMAKE_SYSTEM_PROCESSOR) + message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. 
Please set it in your toolchain cmake file.") + # Try to detect it + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") + execute_process( + COMMAND "${CMAKE_C_COMPILER}" -dumpmachine + OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE _err + RESULT_VARIABLE _res + ) + if(NOT _res EQUAL 0) + message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}") + endif() + string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}") + list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN) + if(GCC_TRIPLET_LEN EQUAL 4) + list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR) + message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() + endif() set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) endif() if (onnxruntime_BUILD_FOR_NATIVE_MACHINE) From eb35896ede6e77bf4a453b9e7314e728e40f96ba Mon Sep 17 00:00:00 2001 From: zesongw Date: Tue, 9 Jan 2024 14:02:44 +0800 Subject: [PATCH 13/13] [WebNN EP] Update WebNN normalization ops (#18817) Use batchNormalization, layerNormalization and instanceNormalization instead of meanVarianceNormalization to implement normalization Ops. The spec of meanVarianceNormalization has been deleted. Remove groupNormalization. --- .../core/providers/webnn/builders/helper.h | 7 +- .../builders/impl/normalization_op_builder.cc | 141 +++++++----------- .../webnn/builders/op_builder_factory.cc | 1 - 3 files changed, 57 insertions(+), 92 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 8b8b85339a87c..5aec81af15761 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -139,7 +139,7 @@ static const InlinedHashMap op_map = { {"ArgMax", {"argMax", false}}, {"ArgMin", {"argMin", false}}, {"AveragePool", {"averagePool2d", true}}, - {"BatchNormalization", {"meanVarianceNormalization", false}}, + {"BatchNormalization", {"batchNormalization", false}}, {"Cast", {"cast", false}}, {"Ceil", {"ceil", true}}, {"Clip", {"clamp", true}}, @@ -162,12 +162,11 @@ static const InlinedHashMap op_map = { {"GlobalLpPool", {"l2Pool2d", false}}, {"Greater", {"greater", false}}, {"GreaterOrEqual", {"greaterOrEqual", false}}, - {"GroupNormalization", {"meanVarianceNormalization", false}}, {"HardSigmoid", {"hardSigmoid", false}}, {"HardSwish", {"hardSwish", true}}, {"Identity", {"identity", false}}, - {"InstanceNormalization", {"meanVarianceNormalization", false}}, - {"LayerNormalization", {"meanVarianceNormalization", false}}, + {"InstanceNormalization", {"instanceNormalization", false}}, + {"LayerNormalization", {"layerNormalization", false}}, {"LeakyRelu", {"leakyRelu", true}}, {"Less", {"lesser", false}}, {"LessOrEqual", {"lesserOrEqual", false}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index 756a838cc0c3e..4d2470dfe7deb 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -27,8 +27,6 @@ class NormalizationOpBuilder : public BaseOpBuilder { const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; -// All normalization are based on layout NCHW. -// TODO: add support for NHWC. 
 Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
                                                      const Node& node,
                                                      const logging::Logger& logger) const {
@@ -61,49 +59,13 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder
     ORT_RETURN_IF_NOT(bias_shape == scale_shape, "The bias' shape should be equal to scale's shape.");
   }
 
-  std::vector new_scale_shape;
-  if (scale_size < rank) {
-    if (op_type == "BatchNormalization") {
-      scale_shape.insert(scale_shape.begin(), 1);
-      scale_shape.insert(scale_shape.end(), rank - 2, 1);
-    } else if (op_type == "LayerNormalization") {
-      // Align right with leading ones.
-      scale_shape.insert(scale_shape.begin(), rank - scale_size, 1);
-    } else if (op_type == "InstanceNormalization") {
-      // Insert ones before and after the channel dimension.
-      scale_shape.insert(scale_shape.begin(), 1);
-      ORT_RETURN_IF(scale_size != 1 || rank < 2,
-                    "The scale size should be 1 and rank should be at least 2 for InstanceNorm.");
-      scale_shape.insert(scale_shape.end(), rank - scale_size - 1, 1);
-    } else if (op_type == "GroupNormalization") {
-      // The input will be reshaped to 3D later. So just insert ones before the channel and after.
-      scale_shape.insert(scale_shape.begin(), 1);
-      scale_shape.insert(scale_shape.end(), 1);
-    } else {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type);
-    }
+  emscripten::val scale = model_builder.GetOperand(input_defs[1]->Name());
+  options.set("scale", scale);
 
-    std::transform(scale_shape.cbegin(), scale_shape.cend(),
-                   std::back_inserter(new_scale_shape),
-                   [](int64_t dim) -> uint32_t { return SafeInt(dim); });
-    emscripten::val reshape_scale = model_builder.GetOperand(input_defs[1]->Name());
-    emscripten::val reshape_output_scale =
-        model_builder.GetBuilder().call("reshape", reshape_scale, emscripten::val::array(new_scale_shape));
-    options.set("scale", reshape_output_scale);
-
-    if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) {
-      // Bias input exists, and bias's shape is the same as scale's shape.
-      emscripten::val reshape_bias = model_builder.GetOperand(input_defs[2]->Name());
-      emscripten::val reshape_output_bias =
-          model_builder.GetBuilder().call("reshape", reshape_bias, emscripten::val::array(new_scale_shape));
-      options.set("bias", reshape_output_bias);
-    }
-  } else {
-    options.set("scale", model_builder.GetOperand(input_defs[1]->Name()));
-    if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) {
-      // Bias input exists, and bias's shape is the same as scale's shape.
-      options.set("bias", model_builder.GetOperand(input_defs[2]->Name()));
-    }
+  if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) {
+    // Bias input exists, and bias's shape is the same as scale's shape.
+    emscripten::val bias = model_builder.GetOperand(input_defs[2]->Name());
+    options.set("bias", bias);
   }
 
   NodeAttrHelper helper(node);
@@ -114,56 +76,62 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder
     ORT_RETURN_IF_NOT(input_defs.size() == 5, "BatchNormalization requires five inputs.");
     emscripten::val mean = model_builder.GetOperand(input_defs[3]->Name());
     emscripten::val variance = model_builder.GetOperand(input_defs[4]->Name());
-    // Enlarge 1-D mean and variance to new scale shape.
-    emscripten::val reshape_mean =
-        model_builder.GetBuilder().call("reshape", mean, emscripten::val::array(new_scale_shape));
-    emscripten::val reshape_variance =
-        model_builder.GetBuilder().call("reshape", variance, emscripten::val::array(new_scale_shape));
-
-    std::vector axes = {0};
-    for (uint32_t i = 2; i < rank; i++) {
-      axes.push_back(i);
+    if (model_builder.GetPreferredLayout() == DataLayout::NHWC) {
+      options.set("axis", rank - 1);
     }
-
-    options.set("axes", emscripten::val::array(axes));
-    options.set("mean", reshape_mean);
-    options.set("variance", reshape_variance);
-    output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options);
+    output = model_builder.GetBuilder().call("batchNormalization", input, mean, variance, options);
   } else if (op_type == "LayerNormalization") {
     int64_t axis = helper.Get("axis", -1);
     axis = HandleNegativeAxis(axis, rank);
     std::vector axes(rank - SafeInt(axis));
-    std::iota(axes.begin(), axes.end(), axis);
+    if (model_builder.GetPreferredLayout() == DataLayout::NHWC && axis > 1) {
+      std::iota(axes.begin(), axes.end(), axis - 1);
+    } else {
+      std::iota(axes.begin(), axes.end(), axis);
+    }
     options.set("axes", emscripten::val::array(axes));
-    output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options);
+    output = model_builder.GetBuilder().call("layerNormalization", input, options);
   } else if (op_type == "InstanceNormalization") {
-    std::vector axes;
-    for (uint32_t i = 2; i < rank; i++) {
-      axes.emplace_back(i);
+    // The WebNN spec only supports 4D input for instanceNormalization.
+    // Support 3D input by prepending a 1-sized dimension.
+    // Inputs with more than 4 dimensions are reshaped into 4D.
+    constexpr size_t webnn_shape_rank = 4;
+    if (input_shape.size() != webnn_shape_rank) {
+      std::vector new_shape;
+      new_shape.reserve(std::max(input_shape.size(), webnn_shape_rank));
+      std::transform(input_shape.begin(), input_shape.end(),
+                     std::back_inserter(new_shape),
+                     [](int64_t dim) -> uint32_t { return SafeInt(dim); });
+
+      size_t insertion_offset = (model_builder.GetPreferredLayout() == DataLayout::NHWC) ? 2 : 3;
+      ptrdiff_t excess_rank = new_shape.size() - webnn_shape_rank;
+      auto insertion_point = new_shape.begin() + insertion_offset;
+      if (input_shape.size() < webnn_shape_rank) {
+        // Pad the shape with extra 1's to satisfy WebNN v1's rank requirements.
+        new_shape.insert(insertion_point, -excess_rank, 1);
+      } else {
+        // Fold the extra range to fit within WebNN v1's rank requirements.
+        uint32_t sum = std::accumulate(
+            insertion_point, insertion_point + excess_rank + 1, 1, std::multiplies());
+        new_shape.erase(insertion_point, insertion_point + excess_rank);
+        *insertion_point = sum;
+      }
+      input = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape));
+    }
+
+    if (model_builder.GetPreferredLayout() == DataLayout::NHWC) {
+      options.set("layout", emscripten::val("nhwc"));
+    }
+    output = model_builder.GetBuilder().call("instanceNormalization", input, options);
+    // Reshape back to the original output shape for non-4D input.
+    if (input_shape.size() != 4) {
+      std::vector output_shape;
+      std::transform(input_shape.begin(), input_shape.end(),
+                     std::back_inserter(output_shape),
+                     [](int64_t dim) -> uint32_t { return SafeInt(dim); });
+      output = model_builder.GetBuilder().call(
+          "reshape", output, emscripten::val::array(output_shape));
     }
-    options.set("axes", emscripten::val::array(axes));
-    output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options);
-  } else if (op_type == "GroupNormalization") {
-    ORT_RETURN_IF_NOT(helper.HasAttr("num_groups"), "GroupNormalization num_group must be provided.");
-    int32_t group_count = helper.Get("num_groups", -1);
-    std::vector orig_shape, new_shape;
-    std::transform(input_shape.cbegin(), input_shape.cend(),
-                   std::back_inserter(orig_shape),
-                   [](int64_t dim) -> uint32_t { return SafeInt(dim); });
-    // Add N and Group.
-    ORT_RETURN_IF_NOT(rank >= 2, "Input for GroupNormalization cannot be a scalar or 1D");
-    new_shape.emplace_back(SafeInt(input_shape[0]));
-    new_shape.emplace_back(SafeInt(group_count));
-
-    ORT_RETURN_IF_NOT(group_count > 0 && input_shape[1] % group_count == 0,
-                      "GroupNormalization num_group must be divisible by group.");
-    new_shape.emplace_back(SafeInt(std::reduce(input_shape.begin() + 2, input_shape.end(),
-                                               input_shape[1] / group_count, std::multiplies())));
-    // Input will be reshaped to (N, group count, channels per group x D1 x D2 ... Dn) and recovered after normalization.
-    options.set("axes", emscripten::val::array(std::vector{2}));
-    output = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape));
-    output = model_builder.GetBuilder().call("meanVarianceNormalization", output, options);
-    output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(orig_shape));
   } else {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type);
   }
@@ -214,7 +182,6 @@ void CreateNormalizationOpBuilder(const std::string& op_type, OpBuilderRegistrat
   constexpr static std::string_view op_types[] =
       {
           "BatchNormalization",
-          "GroupNormalization",
          "InstanceNormalization",
          "LayerNormalization",
      };
diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc
index 463317a4dafda..613771eda71fe 100644
--- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc
+++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc
@@ -111,7 +111,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
   {  // Normalization
     CreateNormalizationOpBuilder("BatchNormalization", op_registrations);
-    CreateNormalizationOpBuilder("GroupNormalization", op_registrations);
    CreateNormalizationOpBuilder("InstanceNormalization", op_registrations);
    CreateNormalizationOpBuilder("LayerNormalization", op_registrations);
  }
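For context, the three dedicated WebNN builder calls that the EP now emits look roughly like this from JavaScript. This is a minimal sketch against the WebNN spec as referenced by this patch, not code from ONNX Runtime; the operand names, shapes, and descriptor field names are illustrative assumptions and may differ across Chromium versions.

```js
// Sketch only: assumes a WebNN-capable browser exposing navigator.ml.
async function buildNormalizationOps() {
  const context = await navigator.ml.createContext();
  const builder = new MLGraphBuilder(context);

  // Hypothetical NCHW float32 input with 1-D per-channel parameters.
  const input = builder.input('x', {dataType: 'float32', dimensions: [1, 3, 4, 4]});
  const scale = builder.input('scale', {dataType: 'float32', dimensions: [3]});
  const bias = builder.input('bias', {dataType: 'float32', dimensions: [3]});
  const mean = builder.input('mean', {dataType: 'float32', dimensions: [3]});
  const variance = builder.input('var', {dataType: 'float32', dimensions: [3]});

  // ONNX BatchNormalization: mean/variance become positional arguments and the
  // channel axis is an option (rank - 1 when the preferred layout is NHWC).
  const bn = builder.batchNormalization(input, mean, variance, {scale, bias, axis: 1});

  // ONNX LayerNormalization with axis = 1 normalizes over all trailing axes.
  const ln = builder.layerNormalization(input, {scale, bias, axes: [1, 2, 3]});

  // ONNX InstanceNormalization: the input must be 4D and the layout option
  // selects where the channel dimension sits ('nchw' or 'nhwc').
  const inorm = builder.instanceNormalization(input, {scale, bias, layout: 'nchw'});

  return {bn, ln, inorm};
}
```

This also shows why the patch can drop the old reshape logic for scale and bias: each dedicated normalization op accepts the 1-D per-channel (or per-axis) operands directly, instead of requiring them to be reshaped to a broadcastable rank for meanVarianceNormalization.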