From ea55700e1cb3e4f8485e9c9cfd69ebe470701397 Mon Sep 17 00:00:00 2001
From: Arthur Islamov <arthur@islamov.ai>
Date: Fri, 4 Aug 2023 01:09:37 +0400
Subject: [PATCH 1/5] [js/web] JSEP Gather OP (#16855)

### Description
Added a Gather op that works with both i32 and i64 indices, assuming that
the index values fit within the i32 range. The assumption is safe because it
is not possible to allocate an input buffer larger than 2 GB.

It treats all data from the input tensor as u32, copying one u32 per element
for 4-byte types and two u32 per element for i64, u64 and double.

---------

Co-authored-by: Guenther Schmuelling <guschmue@microsoft.com>
---
 js/web/docs/webgpu-operators.md               |   1 +
 .../lib/wasm/jsep/webgpu/op-resolve-rules.ts  |   2 +
 js/web/lib/wasm/jsep/webgpu/ops/gather.ts     | 107 ++++++++++++++++++
 js/web/test/suite-test-list.jsonc             |   7 +-
 .../providers/js/js_execution_provider.cc     |   8 ++
 .../core/providers/js/operators/gather.cc     |  53 +++++++++
 .../core/providers/js/operators/gather.h      |  24 ++++
 7 files changed, 199 insertions(+), 3 deletions(-)
 create mode 100644 js/web/lib/wasm/jsep/webgpu/ops/gather.ts
 create mode 100644 onnxruntime/core/providers/js/operators/gather.cc
 create mode 100644 onnxruntime/core/providers/js/operators/gather.h

diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
index 7b6d72bc78ecd..a0ff4a3aae9d3 100644
--- a/js/web/docs/webgpu-operators.md
+++ b/js/web/docs/webgpu-operators.md
@@ -35,6 +35,7 @@ Do not modify directly.*
 | Expand | ai.onnx(8-12,13+) |  |
 | Flatten | ai.onnx(1-8,9-10,11-12,13+) |  |
 | Floor | ai.onnx(6-12,13+) |  |
+| Gather | ai.onnx(1-10,11-12,13+) |  |
 | Gelu | com.microsoft(1+) |  |
 | Gemm | ai.onnx(7-8,9-10,11+) |  |
 | GlobalAveragePool | ai.onnx(1+); com.ms.internal.nhwc(1+) |  |
diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index 4fa468cde4d6c..23b47033e548a 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -7,6 +7,7 @@ import {concat, parseConcatAttributes} from './ops/concat';
 import {conv, parseConvAttributes} from './ops/conv';
 import {convTranspose, parseConvTransposeAttributes} from './ops/conv-transpose';
 import {expand} from './ops/expand';
+import {gather, parseGatherAttributes} from './ops/gather';
 import {gelu} from './ops/gelu';
 import {gemm, parseGemmAttributes} from './ops/gemm';
 import {matMul} from './ops/matmul';
@@ -51,6 +52,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['Exp', [unaryOps.exp]],
   ['Expand', [expand]],
   ['Floor', [unaryOps.floor]],
+  ['Gather', [gather, parseGatherAttributes]],
   ['Gelu', [gelu]],
   ['Gemm', [gemm, parseGemmAttributes]],
   ['GlobalAveragePool', [pool.globalAveragePool, pool.parseGlobalAveragePoolAttributes]],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gather.ts b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
new file mode 100644
index 0000000000000..113bf7c7cc822
--- /dev/null
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gather.ts
@@ -0,0 +1,107 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+import {DataType} from '../../../wasm-common';
+import {TensorView} from '../../tensor';
+import {ShapeUtil} from '../../util';
+import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
+import {ComputeContext, GpuDataType, ProgramInfo, ProgramMetadata} from '../types';
+
+import {ShaderHelper} from './common';
+
+export interface GatherAttributes extends AttributeWithCacheKey {
+  axis: number;
+}
+
+const validateInputs = (inputs: readonly TensorView[]): void => {
+  if (!inputs || inputs.length !== 2) {
+    throw new Error('Gather requires 2 inputs.');
+  }
+};
+
+const createGatherProgramInfo =
+    (metadata: ProgramMetadata, inputs: readonly TensorView[], attributes: GatherAttributes): ProgramInfo => {
+      const inputShape = inputs[0].dims;
+      const indicesShape = inputs[1].dims;
+
+      const inputRank = inputShape.length;
+      const axis = ShapeUtil.normalizeAxis(attributes.axis, inputRank);
+
+      const outputShape = inputShape.slice(0);
+      outputShape.splice(axis, 1, ...indicesShape);
+
+      const inputDataType = inputs[0].dataType;
+      const block = ShapeUtil.sizeFromDimension(inputShape, axis + 1);
+      const elementSize = [DataType.int64, DataType.uint64, DataType.double].includes(inputDataType) ? 2 : 1;
+      const indicesElementSize = inputs[1].dataType === DataType.int64 ? 2 : 1;
+      const blockSize = elementSize * block;
+      const M = ShapeUtil.sizeToDimension(inputShape, axis);
+      const N = ShapeUtil.size(indicesShape);
+      const dataBatchElements = ShapeUtil.sizeFromDimension(inputShape, axis) * elementSize;
+      const gatheredBatchElements = N * block * elementSize;
+      const axisDimLimit = inputShape[axis];
+
+      const inputSize = ShapeUtil.size(inputShape) * elementSize;
+      const outputSize = ShapeUtil.size(outputShape) * elementSize;
+
+      const totalGathers = M * N;
+      // int64 indices are read as little-endian i32 (low word only), assuming the values fit in the i32 range.
+      // That assumption is safe because an input tensor buffer larger than 2 GB cannot be allocated.
+      // Input data is treated as u32: one u32 per element, or two u32 per element for 8-byte tensors.
+      const getShaderSource = (shaderHelper: ShaderHelper) => `
+  const N: u32 = ${N};
+  const elementSize: u32 = ${elementSize};
+  const indicesElementSize: u32 = ${indicesElementSize};
+
+  @group(0) @binding(0) var<storage, read> input : array<u32>;
+  @group(0) @binding(1) var<storage, read> inputIndices : array<i32>;
+  @group(0) @binding(2) var<storage, read_write> output: array<u32>;
+
+  ${shaderHelper.mainStart()}
+    let batch: u32 = global_idx / N;
+    let i: u32 = global_idx % N;
+
+    let srcOffsetBatch: u32 = batch * ${dataBatchElements};
+    let dstOffsetBatch: u32 = batch * ${gatheredBatchElements};
+    var idx = inputIndices[i * indicesElementSize];
+    if (idx < 0) {
+        idx = idx + ${axisDimLimit};
+    }
+
+    let srcOffset = srcOffsetBatch + u32(idx) * ${blockSize};
+    let dstOffset = dstOffsetBatch + i * ${blockSize};
+    if (srcOffset >= ${inputSize}) {
+        return;
+    }
+    if (dstOffset >= ${outputSize}) {
+        return;
+    }
+    for (var j: u32 = 0; j < ${blockSize}; j++) {
+        output[dstOffset + j] = input[srcOffset + j];
+    }
+  }`;
+      return {
+        ...metadata,
+        outputs: [
+          {dims: outputShape, dataType: inputs[0].dataType, gpuDataType: GpuDataType.default},
+        ],
+        getShaderSource,
+        dispatchGroup: () => ({x: Math.ceil(totalGathers / 64 /* workgroup size */)})
+      };
+    };
+
+export const parseGatherAttributes = (attributes: Record<string, unknown>): GatherAttributes =>
+    createAttributeWithCacheKey({axis: attributes.axis as number});
+
+export const gather = (context: ComputeContext, attributes: GatherAttributes): void => {
+  const inputs = context.inputs;
+  validateInputs(inputs);
+
+  const metadata = {
+    name: 'Gather',
+    inputTypes: [GpuDataType.default, GpuDataType.default],
+    cacheHint: attributes.cacheKey + inputs[0].dataType.toString(10) + inputs[1].dataType.toString(10),
+  };
+
+  context.compute(createGatherProgramInfo(metadata, context.inputs, attributes));
+};
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index 00ac7acfc9179..c253aeff302e8 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -535,9 +535,10 @@
       "test_flatten_negative_axis4",
       "test_floor_example",
       "test_floor",
-      // "test_gather_0",
-      // "test_gather_1",
-      // "test_gather_2d_indices",
+      "test_gather_0",
+      "test_gather_1",
+      "test_gather_2d_indices",
+      "test_gather_negative_indices",
       // "test_gather_elements_0",
       // "test_gather_elements_1",
       // "test_gather_elements_negative_indices",
diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc
index dba68137c7a80..677a2543014ce 100644
--- a/onnxruntime/core/providers/js/js_execution_provider.cc
+++ b/onnxruntime/core/providers/js/js_execution_provider.cc
@@ -266,6 +266,10 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, 18, Resize);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, Resize);
 
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Gather);
+class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gather);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gather);
+
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, Resize);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize);
@@ -477,6 +481,10 @@ std::unique_ptr<KernelRegistry> RegisterKernels() {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 12, Expand)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Expand)>,
 
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Gather)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gather)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gather)>,
+
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, Resize)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Resize)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, Resize)>,
diff --git a/onnxruntime/core/providers/js/operators/gather.cc b/onnxruntime/core/providers/js/operators/gather.cc
new file mode 100644
index 0000000000000..ec1ae71243fe8
--- /dev/null
+++ b/onnxruntime/core/providers/js/operators/gather.cc
@@ -0,0 +1,53 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/js/js_kernel.h"
+
+#include "gather.h"
+
+namespace onnxruntime {
+namespace js {
+
+using AllSupportedSize =
+    TypeList<
+        float,
+        double,
+        int64_t,
+        uint64_t,
+        int32_t,
+        uint32_t>;
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Gather,
+    kOnnxDomain,
+    1,
+    10,
+    kJsExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", BuildKernelDefConstraintsFromTypeList<AllSupportedSize>())
+        .TypeConstraint("Tind", BuildKernelDefConstraintsFromTypeList<TypeList<int32_t, int64_t>>()),
+    Gather);
+
+ONNX_OPERATOR_VERSIONED_KERNEL_EX(
+    Gather,
+    kOnnxDomain,
+    11,
+    12,
+    kJsExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", BuildKernelDefConstraintsFromTypeList<AllSupportedSize>())
+        .TypeConstraint("Tind", BuildKernelDefConstraintsFromTypeList<TypeList<int32_t, int64_t>>()),
+    Gather);
+
+ONNX_OPERATOR_KERNEL_EX(
+    Gather,
+    kOnnxDomain,
+    13,
+    kJsExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .TypeConstraint("T", BuildKernelDefConstraintsFromTypeList<AllSupportedSize>())
+        .TypeConstraint("Tind", BuildKernelDefConstraintsFromTypeList<TypeList<int32_t, int64_t>>()),
+    Gather);
+
+}  // namespace js
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/js/operators/gather.h b/onnxruntime/core/providers/js/operators/gather.h
new file mode 100644
index 0000000000000..72603d461c2e1
--- /dev/null
+++ b/onnxruntime/core/providers/js/operators/gather.h
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/js/js_kernel.h"
+
+namespace onnxruntime {
+namespace js {
+
+class Gather : public JsKernel {
+ public:
+  Gather(const OpKernelInfo& info) : JsKernel(info) {
+    int64_t axis = info.GetAttrOrDefault<int64_t>("axis", 0);
+
+    JSEP_INIT_KERNEL_ATTRIBUTE(Gather, ({
+                                 "axis" : Number($1),
+                               }),
+                               static_cast<int32_t>(axis));
+  }
+};
+
+}  // namespace js
+}  // namespace onnxruntime

From 641c3a4a37e56944bf9d2a22915f3d562c3daf8f Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 3 Aug 2023 14:20:20 -0700
Subject: [PATCH 2/5] [js/web] update op test schema (#16921)

### Description
update op test schema.

This change fixes several problems with the operator tests for web:
- `opsets` -> `opset`: an operator uses exactly one opset instead of
multiple
- `condition` -> `platformCondition`: make it less confusing
- `inputShapeDefinitions`: allows testing ORT behavior when it gets
no/partial/full shape info.

Added a JSON schema file and also an example file
---
 js/.vscode/settings.json                  |   8 +-
 js/web/script/test-runner-cli.ts          |  12 +-
 js/web/test/data/ops/_example.jsonc       | 103 ++++++++
 js/web/test/data/ops/gelu.jsonc           |   6 +-
 js/web/test/data/ops/pad-big.jsonc        |   2 +-
 js/web/test/data/ops/pad.jsonc            |  14 +-
 js/web/test/data/ops/pow-big-number.jsonc |   2 +-
 js/web/test/data/ops/resize-pack.jsonc    |  21 +-
 js/web/test/data/ops/split.jsonc          |   4 +-
 js/web/test/op-test-schema.json           | 282 ++++++++++++++++++++++
 js/web/test/suite-test-list.jsonc         |   4 +-
 js/web/test/test-main.ts                  |  18 +-
 js/web/test/test-runner.ts                |  86 ++++++-
 js/web/test/test-types.ts                 |  15 +-
 14 files changed, 510 insertions(+), 67 deletions(-)
 create mode 100644 js/web/test/data/ops/_example.jsonc
 create mode 100644 js/web/test/op-test-schema.json

diff --git a/js/.vscode/settings.json b/js/.vscode/settings.json
index 15eacc675acc6..4948899ec671b 100644
--- a/js/.vscode/settings.json
+++ b/js/.vscode/settings.json
@@ -46,5 +46,11 @@
   },
   "typescript.tsdk": "node_modules/typescript/lib",
   "git.detectSubmodules": false,
-  "cmake.configureOnOpen": false
+  "cmake.configureOnOpen": false,
+  "json.schemas": [
+    {
+      "fileMatch": ["web/test/data/ops/*.jsonc"],
+      "url": "./web/test/op-test-schema.json"
+    }
+  ]
 }
diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts
index 382c2eba73d77..3c5a2881db096 100644
--- a/js/web/script/test-runner-cli.ts
+++ b/js/web/script/test-runner-cli.ts
@@ -234,7 +234,7 @@ async function main() {
       }
 
       const test = testIds && testIds.length > 0 ? allTests[testIds[0]] : undefined;
-      const condition = test && typeof test !== 'string' ? test.condition : undefined;
+      const platformCondition = test && typeof test !== 'string' ? test.platformCondition : undefined;
 
       const opsetVersion = folder.split('/')[0];
       const category = `node-${opsetVersion}-${backend}`;
@@ -243,14 +243,16 @@ async function main() {
         modelTests = [];
         opsetTests.set(category, modelTests);
       }
-      modelTests.push(modelTestFromFolder(path.resolve(TEST_DATA_MODEL_NODE_ROOT, folder), backend, condition, times));
+      modelTests.push(
+          modelTestFromFolder(path.resolve(TEST_DATA_MODEL_NODE_ROOT, folder), backend, platformCondition, times));
     }
 
     return Array.from(opsetTests.keys()).map(category => ({name: category, tests: opsetTests.get(category)!}));
   }
 
   function modelTestFromFolder(
-      testDataRootFolder: string, backend: string, condition?: Test.Condition, times?: number): Test.ModelTest {
+      testDataRootFolder: string, backend: string, platformCondition?: Test.PlatformCondition,
+      times?: number): Test.ModelTest {
     if (times === 0) {
       npmlog.verbose('TestRunnerCli.Init.Model', `Skip test data from folder: ${testDataRootFolder}`);
       return {name: path.basename(testDataRootFolder), backend, modelUrl: '', cases: []};
@@ -326,7 +328,7 @@ async function main() {
     npmlog.verbose('TestRunnerCli.Init.Model', ` Test set(s): ${cases.length} (${caseCount})`);
     npmlog.verbose('TestRunnerCli.Init.Model', '===============================================================');
 
-    return {name: path.basename(testDataRootFolder), condition, modelUrl, backend, cases};
+    return {name: path.basename(testDataRootFolder), platformCondition, modelUrl, backend, cases};
   }
 
   function tryLocateModelTestFolder(searchPattern: string): string {
@@ -385,7 +387,7 @@ async function main() {
       // field 'verbose' and 'backend' is not set
       for (const test of tests) {
         test.backend = backend;
-        test.opsets = test.opsets || [{domain: '', version: MAX_OPSET_VERSION}];
+        test.opset = test.opset || {domain: '', version: MAX_OPSET_VERSION};
       }
       npmlog.verbose('TestRunnerCli.Init.Op', 'Finished preparing test data.');
       npmlog.verbose('TestRunnerCli.Init.Op', '===============================================================');
diff --git a/js/web/test/data/ops/_example.jsonc b/js/web/test/data/ops/_example.jsonc
new file mode 100644
index 0000000000000..1c9f306a4c8a8
--- /dev/null
+++ b/js/web/test/data/ops/_example.jsonc
@@ -0,0 +1,103 @@
+// This file is an example of an operator test file.
+//
+// In this file, we demonstrate how to write a test file for ONNX operators.
+// There are 2 operator tests defined in this file:
+//
+//   - "Simple Abs op test example": a simple operator test for the Abs operator. This example shows how to write a simple test with minimal properties.
+//
+//   - "Conv op test example": a simple operator test for the Conv operator. This example shows how to write a test with all optional properties.
+//
+
+// test file starts with an array of test objects.
+[
+  // this is the first operator test object (Abs example).
+  {
+    "name": "Simple Abs op test example", // name of the test
+    "operator": "Abs", // OpType of the operator
+    "cases": [
+      // in this example, we only have one test case.
+      {
+        // name of the test case
+        "name": "3D float32 test",
+        "inputs": [
+          // specify the input tensor
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, -1, -2, -3, -4, -5, -6, -7, -8, 101, 102, 103, 104],
+            "dims": [2, 3, 4],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6, 7, 8, 101, 102, 103, 104],
+            "dims": [2, 3, 4],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  },
+  // this is the second operator test object (Conv example).
+  {
+    // name of the test
+    "name": "Conv op test example",
+
+    // OpType of the operator
+    "operator": "Conv",
+
+    // [optional] specify the attributes of the operator
+    "attributes": [{ "name": "kernel_shape", "data": [2, 2], "type": "ints" }],
+
+    // [optional] specify a regex pattern to match the platform description.
+    //
+    // If not specified, the test will run on all platforms.
+    // Otherwise, the test will only run on platforms that match the pattern.
+    "platformCondition": "",
+
+    // [optional] specify input shape definitions.
+    //
+    // Sometimes, input shape definitions can offer shape information for ONNX Runtime to optimize its inferencing behavior.
+    // For example, ORT will transform a NCHW Conv operator into a NHWC operator when the input shape is 4 dimensional.
+    // If the input shape dimension is unknown, ORT will not perform this optimization.
+    //
+    // In operator test, we can specify input shape definitions to test the optimized behavior.
+    //
+    // The array of input shape definitions should have the same length as the number of model's inputs.
+    //
+    "inputShapeDefinitions": [
+      // input 0 shape definition. use semantic names to specify the dynamic dimensions.
+      ["__input_0_dim_0", "__input_0_dim_1", "__input_0_dim_2", "__input_0_dim_3"],
+      // input 1 shape definition. use numbers to specify the static dimensions.
+      [1, 1, 2, 2]
+    ],
+
+    // [optional] specify the opset of the operator.
+    "opset": { "domain": "", "version": 13 },
+
+    // [required] specify the test cases.
+    "cases": [
+      {
+        "name": "NCHW Conv2D test",
+        "inputs": [
+          {
+            "data": [10, 20, 30, 40, 50, 60, 70, 80, 90],
+            "dims": [1, 1, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4],
+            "dims": [1, 1, 2, 2],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [370, 470, 670, 770],
+            "dims": [1, 1, 2, 2],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  }
+]
diff --git a/js/web/test/data/ops/gelu.jsonc b/js/web/test/data/ops/gelu.jsonc
index 79e4335c2d276..b1546353bfeaf 100644
--- a/js/web/test/data/ops/gelu.jsonc
+++ b/js/web/test/data/ops/gelu.jsonc
@@ -2,7 +2,7 @@
   {
     "name": "gelu",
     "operator": "Gelu",
-    "opsets": [{ "domain": "com.microsoft", "version": 1 }],
+    "opset": { "domain": "com.microsoft", "version": 1 },
     "attributes": [],
     "cases": [
       {
@@ -16,7 +16,7 @@
         ],
         "outputs": [
           {
-            "data": [1.0, 0, 0, 2.0],
+            "data": [0.8413447141647339, -0.04550027847290039, 0, 1.9544997215270996],
             "dims": [2, 2],
             "type": "float32"
           }
@@ -33,7 +33,7 @@
         ],
         "outputs": [
           {
-            "data": [1.0],
+            "data": [0.8413447141647339],
             "dims": [],
             "type": "float32"
           }
diff --git a/js/web/test/data/ops/pad-big.jsonc b/js/web/test/data/ops/pad-big.jsonc
index b014f776593f7..601e1d58a4377 100644
--- a/js/web/test/data/ops/pad-big.jsonc
+++ b/js/web/test/data/ops/pad-big.jsonc
@@ -2,7 +2,7 @@
   {
     "name": "constant 2D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "reflect", "type": "string" },
       { "name": "pads", "data": [0, 0, 1, 1, 0, 0, 1, 1], "type": "ints" }
diff --git a/js/web/test/data/ops/pad.jsonc b/js/web/test/data/ops/pad.jsonc
index 1705eee9b095a..62414213b1d1e 100644
--- a/js/web/test/data/ops/pad.jsonc
+++ b/js/web/test/data/ops/pad.jsonc
@@ -2,7 +2,7 @@
   {
     "name": "constant 2D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "constant", "type": "string" },
       { "name": "value", "data": 1.2, "type": "float" },
@@ -35,7 +35,7 @@
   {
     "name": "constant 3D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "constant", "type": "string" },
       { "name": "value", "data": 2.3, "type": "float" },
@@ -79,7 +79,7 @@
   {
     "name": "Reflect 1D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "reflect", "type": "string" },
       { "name": "pads", "data": [5, 7], "type": "ints" }
@@ -107,7 +107,7 @@
   {
     "name": "Reflect 2D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "reflect", "type": "string" },
       { "name": "pads", "data": [3, 2, 2, 5], "type": "ints" }
@@ -139,7 +139,7 @@
   {
     "name": "Reflect 3D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "reflect", "type": "string" },
       { "name": "pads", "data": [1, 2, 2, 2, 3, 1], "type": "ints" }
@@ -182,7 +182,7 @@
   {
     "name": "Edge 2D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "edge", "type": "string" },
       { "name": "pads", "data": [3, 2, 2, 3], "type": "ints" }
@@ -214,7 +214,7 @@
   {
     "name": "Edge 3D",
     "operator": "Pad",
-    "opsets": [{ "domain": "", "version": 10 }],
+    "opset": { "domain": "", "version": 10 },
     "attributes": [
       { "name": "mode", "data": "edge", "type": "string" },
       { "name": "pads", "data": [1, 2, 2, 2, 3, 1], "type": "ints" }
diff --git a/js/web/test/data/ops/pow-big-number.jsonc b/js/web/test/data/ops/pow-big-number.jsonc
index 5a87fe15b8614..17693fa2d4a9c 100644
--- a/js/web/test/data/ops/pow-big-number.jsonc
+++ b/js/web/test/data/ops/pow-big-number.jsonc
@@ -3,7 +3,7 @@
     "name": "Pow with no attributes - big number",
     "operator": "Pow",
     "attributes": [],
-    "condition": "^((?!iOS).)*$", // does NOT contains 'iOS': large number cannot be handled in a half_float environment
+    "platformCondition": "^((?!iOS).)*$", // does NOT contain 'iOS': large number cannot be handled in a half_float environment
     "cases": [
       {
         "name": "T[2,4] T[3,2,4]",
diff --git a/js/web/test/data/ops/resize-pack.jsonc b/js/web/test/data/ops/resize-pack.jsonc
index c2df2f9dabf97..7b9a2ef96d0bc 100644
--- a/js/web/test/data/ops/resize-pack.jsonc
+++ b/js/web/test/data/ops/resize-pack.jsonc
@@ -2,12 +2,7 @@
   {
     "name": "ResizeBilinearPacked with mode half_pixel",
     "operator": "Resize",
-    "opsets": [
-      {
-        "domain": "",
-        "version": "11"
-      }
-    ],
+    "opset": { "domain": "", "version": 11 },
     "attributes": [
       // { "name": "scales", "data": [1.0, 1.0, 2.0, 3.0], "type": "floats" },
       {
@@ -54,12 +49,7 @@
   {
     "name": "ResizeBilinearPacked with mode align_corners",
     "operator": "Resize",
-    "opsets": [
-      {
-        "domain": "",
-        "version": "11"
-      }
-    ],
+    "opset": { "domain": "", "version": 11 },
     "attributes": [
       {
         "name": "coordinate_transformation_mode",
@@ -105,12 +95,7 @@
   {
     "name": "ResizeBilinearPacked with asymmetric",
     "operator": "Resize",
-    "opsets": [
-      {
-        "domain": "",
-        "version": "11"
-      }
-    ],
+    "opset": { "domain": "", "version": 11 },
     "attributes": [
       {
         "name": "coordinate_transformation_mode",
diff --git a/js/web/test/data/ops/split.jsonc b/js/web/test/data/ops/split.jsonc
index a173f10471d3d..46fc323cc6b0f 100644
--- a/js/web/test/data/ops/split.jsonc
+++ b/js/web/test/data/ops/split.jsonc
@@ -2,7 +2,7 @@
   {
     "name": "Split on Axis 0",
     "operator": "Split",
-    "opsets": [{ "domain": "", "version": 12 }],
+    "opset": { "domain": "", "version": 12 },
     "attributes": [
       { "name": "axis", "data": 0, "type": "int" },
       { "name": "split", "data": [2, 4], "type": "ints" }
@@ -35,7 +35,7 @@
   {
     "name": "Split on Axis 1 - 2D",
     "operator": "Split",
-    "opsets": [{ "domain": "", "version": 12 }],
+    "opset": { "domain": "", "version": 12 },
     "attributes": [
       { "name": "axis", "data": 1, "type": "int" },
       { "name": "split", "data": [2, 4], "type": "ints" }
diff --git a/js/web/test/op-test-schema.json b/js/web/test/op-test-schema.json
new file mode 100644
index 0000000000000..aa08e293863e3
--- /dev/null
+++ b/js/web/test/op-test-schema.json
@@ -0,0 +1,282 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema",
+  "type": "array",
+  "items": {
+    "properties": {
+      "name": {
+        "type": "string",
+        "title": "Name",
+        "description": "the name of the test case"
+      },
+      "operator": {
+        "type": "string",
+        "pattern": "[A-Z][a-zA-Z]*",
+        "title": "Operator",
+        "description": "the operator to use for the test case"
+      },
+      "attributes": {
+        "type": "array",
+        "description": "the attributes to use for the test case",
+        "items": {
+          "type": "object",
+          "oneOf": [
+            {
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "the name of the attribute"
+                },
+                "type": {
+                  "const": "int",
+                  "description": "the type of the attribute"
+                },
+                "data": {
+                  "type": "integer",
+                  "description": "the value of the attribute"
+                }
+              },
+              "required": ["name", "data", "type"],
+              "additionalProperties": false
+            },
+            {
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "the name of the attribute"
+                },
+                "type": {
+                  "const": "ints",
+                  "description": "the type of the attribute"
+                },
+                "data": {
+                  "type": "array",
+                  "items": {
+                    "type": "integer"
+                  },
+                  "description": "the value of the attribute"
+                }
+              },
+              "required": ["name", "data", "type"],
+              "additionalProperties": false
+            },
+            {
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "the name of the attribute"
+                },
+                "type": {
+                  "const": "float",
+                  "description": "the type of the attribute"
+                },
+                "data": {
+                  "type": "number",
+                  "description": "the value of the attribute"
+                }
+              },
+              "required": ["name", "data", "type"],
+              "additionalProperties": false
+            },
+            {
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "the name of the attribute"
+                },
+                "type": {
+                  "const": "floats",
+                  "description": "the type of the attribute"
+                },
+                "data": {
+                  "type": "array",
+                  "items": {
+                    "type": "number"
+                  },
+                  "description": "the value of the attribute"
+                }
+              },
+              "required": ["name", "data", "type"],
+              "additionalProperties": false
+            },
+            {
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "the name of the attribute"
+                },
+                "type": {
+                  "const": "string",
+                  "description": "the type of the attribute"
+                },
+                "data": {
+                  "type": "string",
+                  "description": "the value of the attribute"
+                }
+              },
+              "required": ["name", "data", "type"],
+              "additionalProperties": false
+            },
+            {
+              "properties": {
+                "name": {
+                  "type": "string",
+                  "description": "the name of the attribute"
+                },
+                "type": {
+                  "const": "strings",
+                  "description": "the type of the attribute"
+                },
+                "data": {
+                  "type": "array",
+                  "items": {
+                    "type": "string"
+                  },
+                  "description": "the value of the attribute"
+                }
+              },
+              "required": ["name", "data", "type"],
+              "additionalProperties": false
+            }
+          ]
+        }
+      },
+      "opset": {
+        "type": "object",
+        "description": "opset is an optional field that specifies the opset to use for the test case. If not specified, the latest opset of the default domain \"\" (ai.onnx) is used.",
+        "properties": {
+          "domain": {
+            "type": "string",
+            "description": "the domain of the opset"
+          },
+          "version": {
+            "type": "integer",
+            "description": "the version of the opset"
+          }
+        },
+        "required": ["domain", "version"],
+        "additionalProperties": false
+      },
+      "cases": {
+        "type": "array",
+        "description": "the test cases",
+        "items": {
+          "type": "object",
+          "properties": {
+            "name": {
+              "type": "string",
+              "description": "the name of the test case"
+            },
+            "inputs": {
+              "type": "array",
+              "description": "the test case inputs",
+              "items": {
+                "properties": {
+                  "type": {
+                    "enum": [
+                      "float32",
+                      "float64",
+                      "int8",
+                      "int16",
+                      "int32",
+                      "int64",
+                      "uint8",
+                      "uint16",
+                      "uint32",
+                      "uint64",
+                      "bool",
+                      "string"
+                    ]
+                  },
+                  "data": {
+                    "type": "array",
+                    "items": {
+                      "type": ["number", "string", "boolean"]
+                    }
+                  },
+                  "dims": {
+                    "type": "array",
+                    "items": {
+                      "type": "integer",
+                      "minimum": 0
+                    }
+                  }
+                },
+                "required": ["type", "data", "dims"],
+                "additionalProperties": false
+              }
+            },
+            "outputs": {
+              "type": "array",
+              "description": "the test case outputs",
+              "items": {
+                "properties": {
+                  "type": {
+                    "enum": [
+                      "float32",
+                      "float64",
+                      "int8",
+                      "int16",
+                      "int32",
+                      "int64",
+                      "uint8",
+                      "uint16",
+                      "uint32",
+                      "uint64",
+                      "bool",
+                      "string"
+                    ]
+                  },
+                  "data": {
+                    "type": "array",
+                    "items": {
+                      "type": ["number", "string", "boolean"]
+                    }
+                  },
+                  "dims": {
+                    "type": "array",
+                    "items": {
+                      "type": "integer",
+                      "minimum": 0
+                    }
+                  }
+                },
+                "required": ["type", "data", "dims"],
+                "additionalProperties": false
+              }
+            }
+          },
+          "required": ["name", "inputs", "outputs"],
+          "additionalProperties": false
+        }
+      },
+      "inputShapeDefinitions": {
+        "description": "inputShapeDefinitions is an optional field that specifies the shape constraints for the test case inputs. It can be one of the following:\n - \"none\": no shape constraints for the test case inputs.\n - \"rankOnly\": the rank of each test case input is specified automatically, but not the shape.\n - \"static\": the shape of each test case input is fully specified automatically.\n - an array of shapes: the shape constraints for the test case inputs. Each shape is represented by an array whose elements are either a number for a static dimension or a string for a semantic (dynamic) dimension.",
+        "oneOf": [
+          {
+            "type": "array",
+            "items": {
+              "oneOf": [
+                {
+                  "type": "array",
+                  "items": {
+                    "type": ["integer", "string"]
+                  }
+                },
+                { "type": "null" }
+              ]
+            }
+          },
+          {
+            "enum": ["none", "rankOnly", "static"]
+          }
+        ]
+      },
+      "platformCondition": {
+        "type": "string",
+        "description": "the condition for the test case, a regex string applied on platform name. If not specified, the test will run on all platforms. Otherwise, the test will only run on platforms that match the pattern. see https://github.com/bestiejs/platform.js/"
+      }
+    },
+    "required": ["name", "operator", "cases"],
+    "additionalProperties": false
+  }
+}
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc
index c253aeff302e8..0fd848838edcf 100644
--- a/js/web/test/suite-test-list.jsonc
+++ b/js/web/test/suite-test-list.jsonc
@@ -152,7 +152,7 @@
       "test_softmax_example",
       {
         "name": "test_softmax_large_number",
-        "condition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment
+        "platformCondition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment
       },
       "test_sub_bcast",
       "test_sub_example",
@@ -183,7 +183,7 @@
       "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_min_keepdims_random",
       {
         "name": "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_default_axes_keepdims_example",
-        "condition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment
+        "platformCondition": "^((?!iOS).)*$" // does NOT contains 'iOS': large number cannot be handled in a half_float environment
       },
       "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_default_axes_keepdims_random",
       "opset{7,8,9,10,11,12,13,14,15,16,17}/test_reduce_prod_do_not_keepdims_example",
diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts
index 614dc4e16d39d..d19a4a7b0e26c 100644
--- a/js/web/test/test-main.ts
+++ b/js/web/test/test-main.ts
@@ -86,14 +86,14 @@ function shouldSkipTest(test: Test.ModelTest|Test.OperatorTest) {
   if (!test.cases || test.cases.length === 0) {
     return true;
   }
-  if (!test.condition) {
+  if (!test.platformCondition) {
     return false;
   }
 
   if (!platform.description) {
     throw new Error('failed to check current platform');
   }
-  const regex = new RegExp(test.condition);
+  const regex = new RegExp(test.platformCondition);
   return !regex.test(platform.description);
 }
 
@@ -149,14 +149,16 @@ for (const group of ORT_WEB_TEST_CONFIG.op) {
         });
 
         after('Dispose Context', async () => {
-          if (ORT_WEB_TEST_CONFIG.profile) {
-            if (context instanceof ProtoOpTestContext) {
-              context.session.endProfiling();
-            } else {
-              OpTestContext.profiler.stop();
+          if (context) {
+            if (ORT_WEB_TEST_CONFIG.profile) {
+              if (context instanceof ProtoOpTestContext) {
+                context.session.endProfiling();
+              } else {
+                OpTestContext.profiler.stop();
+              }
             }
+            await context.dispose();
           }
-          await context.dispose();
         });
 
         for (const testCase of test.cases) {
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index d923837326f45..5552a8e299926 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -390,7 +390,7 @@ export class TensorResultValidator {
       case 'uint32':
       case 'int64':
       case 'bool':
-        return this.integerEqual(
+        return TensorResultValidator.integerEqual(
             actual.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array |
                 Int32Array,
             expected.numberData as number[] | Uint8Array | Int8Array | Uint16Array | Int16Array | Uint32Array |
@@ -463,7 +463,7 @@ export class TensorResultValidator {
 
     return true;
   }
-  integerEqual(
+  static integerEqual(
       actual: number[]|Uint8Array|Int8Array|Uint16Array|Int16Array|Uint32Array|Int32Array,
       expected: number[]|Uint8Array|Int8Array|Uint16Array|Int16Array|Uint32Array|Int32Array): boolean {
     if (actual.length !== expected.length) {
@@ -551,8 +551,8 @@ export class OpTestContext {
   }
   createOperator(): Operator {
     return initializeOperator(
-        this.sessionHandler, this.opTest.operator, this.opTest.attributes,
-        this.opTest.opsets ?? [{domain: '', version: 7}]);
+        this.sessionHandler, this.opTest.operator, this.opTest.attributes || [],
+        [this.opTest.opset ?? {domain: '', version: 7}]);
   }
 
   async dispose(): Promise<void> {
@@ -575,9 +575,9 @@ export class ProtoOpTestContext {
   session: ort.InferenceSession;
   readonly backendHint: string;
   constructor(test: Test.OperatorTest) {
-    const opsetImport = test.opsets!.map(opset => onnx.OperatorSetIdProto.create(opset));
+    const opsetImport = onnx.OperatorSetIdProto.create(test.opset);
     const operator = test.operator;
-    const attribute = test.attributes!.map(attr => {
+    const attribute = (test.attributes || []).map(attr => {
       const protoAttr = onnx.AttributeProto.create({name: attr.name});
       switch (attr.type) {
         case 'float':
@@ -623,23 +623,70 @@ export class ProtoOpTestContext {
 
     const model = onnx.ModelProto.create();
     model.irVersion = onnx.Version.IR_VERSION;
-    model.opsetImport = opsetImport;
+    model.opsetImport.push(opsetImport);
     model.graph = onnx.GraphProto.create();
 
     model.graph.node = [onnx.NodeProto.create({
       input: test.cases[0].inputs!.map((_, i) => `input_${i}`),
       output: test.cases[0].outputs!.map((_, i) => `output_${i}`),
       opType: operator,
+      domain: test.opset?.domain,
       name: operator,
       attribute
     })];
 
-    model.graph.input = test.cases[0].inputs!.map((input, i) => onnx.ValueInfoProto.create({
-      name: `input_${i}`,
-      type: onnx.TypeProto.create({
-        tensorType: onnx.TypeProto.Tensor.create({elemType: tensorDataTypeStringToEnum(input.type)}),
-      }),
-    }));
+    // normalize input shape definitions
+    let normalizedInputShapeDefinitions: ReadonlyArray<Test.InputShapeDefinition|undefined>;
+    if (!test.inputShapeDefinitions || test.inputShapeDefinitions === 'none') {
+      // if inputShapeDefinitions is not specified, use undefined for all inputs
+      normalizedInputShapeDefinitions = new Array(inputCount).fill(undefined);
+    } else if (test.inputShapeDefinitions === 'rankOnly') {
+      // if inputShapeDefinitions is 'rankOnly', use semantic names for all inputs. This means only rank is specified.
+      normalizedInputShapeDefinitions =
+          test.cases[0].inputs!.map((input, i) => input.dims.map((_, j) => `_input_${i}_d${j}`));
+
+      // check if all test cases have the same rank for each inputs
+      if (test.cases.some(
+              testCase =>
+                  testCase.inputs!.some((input, i) => input.dims.length !== test.cases[0].inputs![i].dims.length))) {
+        throw new Error(`Test cases for test: ${test.name} [${
+            test.operator}] must have the same rank for each inputs in different test cases`);
+      }
+    } else if (test.inputShapeDefinitions === 'static') {
+      // if inputShapeDefinitions is 'static', use the shape of the first test case for all inputs.
+      normalizedInputShapeDefinitions = test.cases[0].inputs!.map(input => input.dims);
+
+      // reject the test if any input shape in any test case differs from the first test case (note the negation)
+      if (test.cases.some(
+              testCase => testCase.inputs!.some(
+                  (input, i) => !TensorResultValidator.integerEqual(input.dims, test.cases[0].inputs![i].dims)))) {
+        throw new Error(`Test cases for test: ${test.name} [${
+            test.operator}] must have the same shape for each inputs in different test cases`);
+      }
+    } else {
+      // if inputShapeDefinitions is specified as an array, use it as is.
+      // check if inputShapeDefinitions has the same number of inputs as test cases
+      if (test.inputShapeDefinitions && test.inputShapeDefinitions.length !== inputCount) {
+        throw new Error(
+            `Input shape definitions for test: ${test.name} [${test.operator}] must have the same number of inputs`);
+      }
+      normalizedInputShapeDefinitions = test.inputShapeDefinitions;
+    }
+
+    model.graph.input = test.cases[0].inputs!.map((input, i) => {
+      const shapeDefinition = normalizedInputShapeDefinitions[i];
+      const shape = shapeDefinition ? onnx.TensorShapeProto.create({
+        dim: shapeDefinition.map(
+            dim => onnx.TensorShapeProto.Dimension.create(typeof dim === 'string' ? {dimParam: dim} : {dimValue: dim}))
+      }) :
+                                      undefined;
+      return onnx.ValueInfoProto.create({
+        name: `input_${i}`,
+        type: onnx.TypeProto.create({
+          tensorType: onnx.TypeProto.Tensor.create({elemType: tensorDataTypeStringToEnum(input.type), shape}),
+        }),
+      });
+    });
 
     model.graph.output = test.cases[0].outputs!.map((output, i) => onnx.ValueInfoProto.create({
       name: `output_${i}`,
@@ -652,6 +699,19 @@ export class ProtoOpTestContext {
 
     this.backendHint = test.backend!;
     this.loadedData = onnx.ModelProto.encode(model).finish();
+
+    // in debug mode, open a new tab in browser for the generated onnx model.
+    if (ort.env.debug) {
+      const modelFile =
+          new File([this.loadedData], `op_test_generated_model_${test.name}.onnx`, {type: 'application/octet-stream'});
+      const modelTempUrl = URL.createObjectURL(modelFile);
+      const a = document.createElement('a');
+      a.href = modelTempUrl;
+      a.download = modelFile.name;
+      a.target = '_blank';
+      a.click();
+      URL.revokeObjectURL(modelTempUrl);
+    }
   }
   async init(): Promise<void> {
     this.session = await ort.InferenceSession.create(this.loadedData, {executionProviders: [this.backendHint]});
diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts
index e6afdcafd7039..b86ac4e50cef8 100644
--- a/js/web/test/test-types.ts
+++ b/js/web/test/test-types.ts
@@ -33,7 +33,7 @@ export declare namespace Test {
    * Represent a string to describe the current environment.
    * Used in ModelTest and OperatorTest to determine whether to run the test or not.
    */
-  export type Condition = string;
+  export type PlatformCondition = string;
 
   export interface ModelTestCase {
     name: string;
@@ -46,7 +46,7 @@ export declare namespace Test {
     name: string;
     modelUrl: string;
     backend?: string;  // value should be populated at build time
-    condition?: Condition;
+    platformCondition?: PlatformCondition;
     cases: readonly ModelTestCase[];
   }
 
@@ -66,13 +66,16 @@ export declare namespace Test {
     version: number;
   }
 
+  export type InputShapeDefinition = ReadonlyArray<number|string>;
+
   export interface OperatorTest {
     name: string;
     operator: string;
-    opsets?: readonly OperatorTestOpsetImport[];
+    inputShapeDefinitions?: 'none'|'rankOnly'|'static'|ReadonlyArray<InputShapeDefinition|undefined>;
+    opset?: OperatorTestOpsetImport;
     backend?: string;  // value should be populated at build time
-    condition?: Condition;
-    attributes: readonly AttributeValue[];
+    platformCondition?: PlatformCondition;
+    attributes?: readonly AttributeValue[];
     cases: readonly OperatorTestCase[];
   }
 
@@ -86,7 +89,7 @@ export declare namespace Test {
     export type TestName = string;
     export interface TestDescription {
       name: string;
-      condition: Condition;
+      platformCondition: PlatformCondition;
     }
     export type Test = TestName|TestDescription;
   }

From 06096fcb31c9446e7951b87eac770998a90c54ed Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 3 Aug 2023 14:49:54 -0700
Subject: [PATCH 3/5] Hardcode xcodebuild destination iOS simulator OS to 16.4.
 (#16982)

---
 tools/ci_build/build.py                                      | 3 ++-
 tools/ci_build/github/apple/test_ios_packages.py             | 3 ++-
 .../github/azure-pipelines/mac-ios-packaging-pipeline.yml    | 5 +++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 3358ee7b9aeb7..d602c02875691 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1680,7 +1680,8 @@ def run_ios_tests(args, source_dir, config, cwd):
                 "-scheme",
                 xc_test_scheme,
                 "-destination",
-                f"platform=iOS Simulator,OS=latest,name={simulator_device_name}",
+                # hardcode iOS 16.4 for now. latest macOS-13 image defaults to iOS 17 (beta) which doesn't work.
+                f"platform=iOS Simulator,OS=16.4,name={simulator_device_name}",
             ],
             cwd=cwd,
         )
diff --git a/tools/ci_build/github/apple/test_ios_packages.py b/tools/ci_build/github/apple/test_ios_packages.py
index 661b65f0437bb..5ede0d6aa006c 100644
--- a/tools/ci_build/github/apple/test_ios_packages.py
+++ b/tools/ci_build/github/apple/test_ios_packages.py
@@ -129,7 +129,8 @@ def _test_ios_packages(args):
                     "-scheme",
                     "ios_package_test",
                     "-destination",
-                    f"platform=iOS Simulator,OS=latest,name={simulator_device_name}",
+                    # hardcode iOS 16.4 for now. latest macOS-13 image defaults to iOS 17 (beta) which doesn't work.
+                    f"platform=iOS Simulator,OS=16.4,name={simulator_device_name}",
                 ],
                 shell=False,
                 check=True,
diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
index 07fd5c4974d2f..361b47b7d90a2 100644
--- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml
@@ -15,7 +15,7 @@ name: "$(Date:yyyyMMdd)$(Rev:rrr)"  # build number format
 stages:
 - stage: IosPackaging_SetCommonVariables
   dependsOn: []
-  
+
   variables:
     skipComponentGovernanceDetection: true
 
@@ -112,6 +112,7 @@ stages:
         set -e -x
         cp "$(Pipeline.Workspace)/ios_packaging_artifacts_full/pod-archive-onnxruntime-c-$(ortPodVersion).zip" swift/
         export ORT_IOS_POD_LOCAL_PATH="swift/pod-archive-onnxruntime-c-$(ortPodVersion).zip"
-        xcodebuild test -scheme onnxruntime -destination 'platform=iOS Simulator,name=iPhone 14'
+        # hardcode iOS 16.4 for now. latest macOS-13 image defaults to iOS 17 (beta).
+        xcodebuild test -scheme onnxruntime -destination 'platform=iOS Simulator,OS=16.4,name=iPhone 14'
         rm swift/pod-archive-onnxruntime-c-$(ortPodVersion).zip
       displayName: "Test Package.swift usage"

From bda012a4b23ad7f94df02dd36714321956c21d4f Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Thu, 3 Aug 2023 15:23:55 -0700
Subject: [PATCH 4/5] Scripts to convert model with MultiHeadAttention to
 packing mode (#16925)

### Description

Update scripts for converting model with MultiHeadAttention to packing
mode.
- [x] Update symbolic shape inference for PackedMultiHeadAttention and
GatedRelativePositionBias
- [x] Update convert_to_packing_mode to handle model with
MultiHeadAttention


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 .../python/tools/symbolic_shape_infer.py      |  55 +++-
 .../python/tools/transformers/constants.py    |  19 ++
 .../transformers/convert_to_packing_mode.py   | 274 ++++++++++++++----
 3 files changed, 275 insertions(+), 73 deletions(-)

diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py
index 138a50cbe2e86..02265e9963bf8 100755
--- a/onnxruntime/python/tools/symbolic_shape_infer.py
+++ b/onnxruntime/python/tools/symbolic_shape_infer.py
@@ -163,7 +163,6 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
             "Reciprocal": self._pass_on_shape_and_type,
             "ReduceSum": self._infer_ReduceSum,
             "ReduceProd": self._infer_ReduceProd,
-            "RelativePositionBias": self._infer_RelativePositionBias,
             "Reshape": self._infer_Reshape,
             "Resize": self._infer_Resize,
             "Round": self._pass_on_shape_and_type,
@@ -190,26 +189,29 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""):
             "Neg": self._infer_symbolic_compute_ops,
             # contrib ops:
             "Attention": self._infer_Attention,
-            "PackedAttention": self._infer_PackedAttention,
-            "RemovePadding": self._infer_RemovePadding,
-            "RestorePadding": self._infer_RestorePadding,
+            "BiasAdd": self._infer_BiasAdd,
             "BiasGelu": self._infer_BiasGelu,
-            "MultiHeadAttention": self._infer_MultiHeadAttention,
+            "BiasSplitGelu": self._infer_BiasSplitGelu,
             "DecoderMaskedMultiHeadAttention": self._infer_DecoderMaskedMultiHeadAttention,
             "EmbedLayerNormalization": self._infer_EmbedLayerNormalization,
             "FastGelu": self._infer_FastGelu,
+            "GatedRelativePositionBias": self._infer_GatedRelativePositionBias,
             "Gelu": self._infer_Gelu,
             "GemmFastGelu": self._infer_GemmFastGelu,
+            "GroupNorm": self._infer_GroupNorm,
             "LayerNormalization": self._infer_LayerNormalization,
             "LongformerAttention": self._infer_LongformerAttention,
+            "MultiHeadAttention": self._infer_MultiHeadAttention,
+            "NhwcConv": self._infer_NhwcConv,
+            "PackedAttention": self._infer_PackedAttention,
+            "PackedMultiHeadAttention": self._infer_PackedMultiHeadAttention,
             "PythonOp": self._infer_PythonOp,
+            "RelativePositionBias": self._infer_RelativePositionBias,
+            "RemovePadding": self._infer_RemovePadding,
+            "RestorePadding": self._infer_RestorePadding,
             "SimplifiedLayerNormalization": self._infer_LayerNormalization,
             "SkipLayerNormalization": self._infer_SkipLayerNormalization,
             "SkipSimplifiedLayerNormalization": self._infer_SkipLayerNormalization,
-            "GroupNorm": self._infer_GroupNorm,
-            "BiasSplitGelu": self._infer_BiasSplitGelu,
-            "BiasAdd": self._infer_BiasAdd,
-            "NhwcConv": self._infer_NhwcConv,
         }
         self.aten_op_dispatcher_ = {
             "embedding": self._infer_Gather,
@@ -2113,6 +2115,28 @@ def _infer_Attention(self, node):  # noqa: N802
                     vi = self.known_vi_[node.output[1]]
                     vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, present_shape))
 
+    def _infer_GatedRelativePositionBias(self, node):  # noqa: N802
+        # When padding is removed:
+        #   query_layer: (token_count, num_heads x head_size)
+        #   token_offset: (batch_size, seq_len)
+        # Otherwise:
+        #   query_layer: (batch_size, seq_len, num_heads x head_size)
+        #   token_offset: None
+        # Output shape: (batch_size, num_heads, seq_len, seq_len)
+        num_heads = get_attribute(node, "num_heads")
+
+        token_offset_shape = self._try_get_shape(node, 6)
+        if token_offset_shape is not None:
+            output_shape = [token_offset_shape[0], num_heads, token_offset_shape[1], token_offset_shape[1]]
+        else:
+            query_layer_shape = self._get_shape(node, 0)
+            assert query_layer_shape is not None and len(query_layer_shape) == 3
+            output_shape = [query_layer_shape[0], num_heads, query_layer_shape[1], query_layer_shape[1]]
+
+        output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))
+
     def _infer_PackedAttention(self, node):  # noqa: N802
         shape = self._get_shape(node, 0)
         shape_weights = self._get_shape(node, 1)
@@ -2131,6 +2155,19 @@ def _infer_PackedAttention(self, node):  # noqa: N802
             vi = self.known_vi_[node.output[0]]
             vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, shape))
 
+    def _infer_PackedMultiHeadAttention(self, node):  # noqa: N802
+        shape_value = self._try_get_shape(node, 2)
+        if shape_value is not None and len(shape_value) == 2:
+            output_shape = shape_value
+        else:
+            shape_query = self._get_shape(node, 0)
+            assert shape_query is not None and len(shape_query) == 4
+            output_shape = [shape_query[0], shape_query[1] * shape_query[3]]
+
+        output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type
+        vi = self.known_vi_[node.output[0]]
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, output_shape))
+
     def _infer_RemovePadding(self, node):  # noqa: N802
         shape = self._get_shape(node, 0)
         if shape and len(shape) == 3:
diff --git a/onnxruntime/python/tools/transformers/constants.py b/onnxruntime/python/tools/transformers/constants.py
index 9f12d4de5a37d..fc8f2cc2f58d3 100644
--- a/onnxruntime/python/tools/transformers/constants.py
+++ b/onnxruntime/python/tools/transformers/constants.py
@@ -7,7 +7,9 @@
 class Operators:
     ATTENTION = "Attention"
     LAYERNORM = "LayerNormalization"
+    MULTI_HEAD_ATTENTION = "MultiHeadAttention"
     PACKEDATTENTION = "PackedAttention"
+    PACKED_MULTI_HEAD_ATTENTION = "PackedMultiHeadAttention"
     REMOVEPADDING = "RemovePadding"
     RESTOREPADDING = "RestorePadding"
     SKIPLAYERNORM = "SkipLayerNormalization"
@@ -26,3 +28,20 @@ class AttentionInputIDs:
 class AttentionOutputIDs:
     OUTPUT = 0
     PRESENT = 1
+
+
+class MultiHeadAttentionInputIDs:
+    QUERY = 0
+    KEY = 1
+    VALUE = 2
+    BIAS = 3
+    KEY_PADDING_MASK = 4
+    RELATIVE_POSITION_BIAS = 5
+    PAST_KEY = 6
+    PAST_VALUE = 7
+
+
+class MultiHeadAttentionOutputIDs:
+    OUTPUT = 0
+    PRESENT_KEY = 1
+    PRESENT_VALUE = 2
diff --git a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py
index f5ec5b884f5aa..0b8dbdcdd9638 100644
--- a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py
+++ b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py
@@ -9,7 +9,13 @@
 from typing import List, Union
 
 import coloredlogs
-from constants import AttentionInputIDs, AttentionOutputIDs, Operators
+from constants import (
+    AttentionInputIDs,
+    AttentionOutputIDs,
+    MultiHeadAttentionInputIDs,
+    MultiHeadAttentionOutputIDs,
+    Operators,
+)
 from onnx import helper, load_model
 from onnx_model import NodeProto, OnnxModel
 from shape_infer_helper import SymbolicShapeInferenceHelper
@@ -17,33 +23,33 @@
 logger = logging.getLogger(__name__)
 
 
-class PackingMode:
-    def __init__(
-        self,
-        model: OnnxModel,
-    ):
+class PackingAttentionBase:
+    def __init__(self, model: OnnxModel, attention_op_type: str):
         self.model: OnnxModel = model
         self.nodes_to_remove: List = []
         self.nodes_to_add: List = []
         self.prune_graph: bool = False
         self.node_name_to_graph_name: dict = {}
         self.this_graph_name: str = self.model.model.graph.name
-        self.attention_nodes = self.model.get_nodes_by_op_type(Operators.ATTENTION)
+        self.attention_op_type = attention_op_type
+        self.attention_nodes = self.model.get_nodes_by_op_type(attention_op_type)
 
     def _try_getting_attention_mask(self) -> Union[str, None]:
+        mask_index = (
+            AttentionInputIDs.MASK_INDEX
+            if self.attention_op_type == Operators.ATTENTION
+            else MultiHeadAttentionInputIDs.KEY_PADDING_MASK
+        )
         first_attention_node = self._try_getting_first_attention()
         # check if attention has mask
-        if not first_attention_node or len(first_attention_node.input) <= AttentionInputIDs.MASK_INDEX:
+        if not first_attention_node or len(first_attention_node.input) <= mask_index:
             return None
 
-        attention_mask = first_attention_node.input[AttentionInputIDs.MASK_INDEX]
+        attention_mask = first_attention_node.input[mask_index]
 
         # check if all attention nodes have same mask
         for node in self.attention_nodes:
-            if (
-                len(node.input) <= AttentionInputIDs.MASK_INDEX
-                or node.input[AttentionInputIDs.MASK_INDEX] != attention_mask
-            ):
+            if len(node.input) <= mask_index or node.input[mask_index] != attention_mask:
                 return None
 
         return attention_mask
@@ -62,22 +68,7 @@ def _try_getting_last_layernorm(self) -> Union[NodeProto, None]:
         return last_layernorm_node
 
     def _are_attentions_supportted(self) -> bool:
-        for node in self.attention_nodes:
-            if OnnxModel.get_node_attribute(node, "past_present_share_buffer") is not None:
-                return False
-            if OnnxModel.get_node_attribute(node, "do_rotary") is not None:
-                return False
-            unidirection_attr = OnnxModel.get_node_attribute(node, "unidirectional")
-            if unidirection_attr is not None and unidirection_attr != 0:
-                return False
-            if len(node.input) > AttentionInputIDs.PAST and not node.input[AttentionInputIDs.PAST]:
-                return False
-            if (
-                len(node.input) > AttentionInputIDs.PAST_SEQUENCE_LENGTH
-                and not node.input[AttentionInputIDs.PAST_SEQUENCE_LENGTH]
-            ):
-                return False
-        return True
+        raise NotImplementedError()
 
     def _insert_removepadding_node(self, inputs: List[str], outputs: List[str]) -> None:
         new_node = helper.make_node(
@@ -104,36 +95,16 @@ def _insert_restorepadding_node(self, inputs: List[str], outputs: List[str]) ->
         self.node_name_to_graph_name[new_node.name] = self.this_graph_name
 
     def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None:
-        for attention in self.attention_nodes:
-            packed_attention = helper.make_node(
-                Operators.PACKEDATTENTION,
-                inputs=[
-                    attention.input[AttentionInputIDs.INPUT],
-                    attention.input[AttentionInputIDs.WEIGHTS],
-                    attention.input[AttentionInputIDs.BIAS],
-                    token_offset,
-                    cumulative_sequence_length,
-                    attention.input[AttentionInputIDs.RELATIVE_POSITION_BIAS]
-                    if len(attention.input) > AttentionInputIDs.RELATIVE_POSITION_BIAS
-                    else "",
-                ],
-                outputs=[attention.output[AttentionOutputIDs.OUTPUT]],
-                name=self.model.create_node_name(Operators.PACKEDATTENTION),
-            )
+        raise NotImplementedError()
 
-            attributes = []
-            for attr in attention.attribute:
-                if attr.name in ["num_heads", "qkv_hidden_sizes", "scale"]:
-                    attributes.append(attr)
-
-            packed_attention.attribute.extend(attributes)
-            packed_attention.domain = "com.microsoft"
-            self.nodes_to_add.append(packed_attention)
-            self.nodes_to_remove.append(attention)
-            self.node_name_to_graph_name[packed_attention.name] = self.this_graph_name
+    def _get_input_to_remove_padding(self, first_attention_node) -> Union[str, None]:
+        if self.attention_op_type == Operators.ATTENTION:
+            return first_attention_node.input[AttentionInputIDs.INPUT]
+        return None
 
     def convert(self, use_symbolic_shape_infer: bool = True) -> None:
         logger.debug("start converting to packing model...")
+
         if not self._are_attentions_supportted():
             return
 
@@ -147,12 +118,14 @@ def convert(self, use_symbolic_shape_infer: bool = True) -> None:
             return
 
         # insert RemovePadding
-        first_attention_input = first_attention_node.input[AttentionInputIDs.INPUT]
-        input_to_remove_padding = first_attention_input
-        output_without_padding = first_attention_input + "_no_padding"
-        token_offset = first_attention_input + "_token_offset"
-        cumulated_seq_len = first_attention_input + "_cumulated_seq_len"
-        max_seq_len = first_attention_input + "_max_seq_len"
+        input_to_remove_padding = self._get_input_to_remove_padding(first_attention_node)
+        if not input_to_remove_padding:
+            return
+
+        output_without_padding = input_to_remove_padding + "_no_padding"
+        token_offset = input_to_remove_padding + "_token_offset"
+        cumulated_seq_len = input_to_remove_padding + "_cumulated_seq_len"
+        max_seq_len = input_to_remove_padding + "_max_seq_len"
         self._insert_removepadding_node(
             [input_to_remove_padding, attention_mask],
             [output_without_padding, token_offset, cumulated_seq_len, max_seq_len],
@@ -166,9 +139,9 @@ def convert(self, use_symbolic_shape_infer: bool = True) -> None:
         self.model.replace_output_of_all_nodes(last_layernorm_node.output[0], restorepadding_input)
         logger.debug(f"inserted RestorePadding after last {last_layernorm_node.op_type} layer")
 
-        # insert PackingAttention
+        # insert PackedAttention
         self._replace_attention_with_packing_attention(token_offset, cumulated_seq_len)
-        logger.debug("replaced Attention with PackedAttention")
+        logger.debug(f"replaced {self.attention_op_type} with Packed{self.attention_op_type}")
 
         self.model.remove_nodes(self.nodes_to_remove)
         self.model.add_nodes(self.nodes_to_add, self.node_name_to_graph_name)
@@ -187,6 +160,179 @@ def convert(self, use_symbolic_shape_infer: bool = True) -> None:
                 self.model.model = inferred_model
 
 
+class PackingAttention(PackingAttentionBase):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, Operators.ATTENTION)
+
+    def _are_attentions_supportted(self) -> bool:
+        for node in self.attention_nodes:
+            if OnnxModel.get_node_attribute(node, "past_present_share_buffer") is not None:
+                return False
+            if OnnxModel.get_node_attribute(node, "do_rotary") is not None:
+                return False
+            unidirection_attr = OnnxModel.get_node_attribute(node, "unidirectional")
+            if unidirection_attr is not None and unidirection_attr != 0:
+                return False
+            if len(node.input) > AttentionInputIDs.PAST and not node.input[AttentionInputIDs.PAST]:
+                return False
+            if (
+                len(node.input) > AttentionInputIDs.PAST_SEQUENCE_LENGTH
+                and not node.input[AttentionInputIDs.PAST_SEQUENCE_LENGTH]
+            ):
+                return False
+        return True
+
+    def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None:
+        for attention in self.attention_nodes:
+            relative_pos_bias = (
+                attention.input[AttentionInputIDs.RELATIVE_POSITION_BIAS]
+                if len(attention.input) > AttentionInputIDs.RELATIVE_POSITION_BIAS
+                else ""
+            )
+            packed_attention = helper.make_node(
+                Operators.PACKEDATTENTION,
+                inputs=[
+                    attention.input[AttentionInputIDs.INPUT],
+                    attention.input[AttentionInputIDs.WEIGHTS],
+                    attention.input[AttentionInputIDs.BIAS],
+                    token_offset,
+                    cumulative_sequence_length,
+                    relative_pos_bias,
+                ],
+                outputs=[attention.output[AttentionOutputIDs.OUTPUT]],
+                name=self.model.create_node_name(Operators.PACKEDATTENTION),
+            )
+
+            attributes = []
+            for attr in attention.attribute:
+                if attr.name in ["num_heads", "qkv_hidden_sizes", "scale"]:
+                    attributes.append(attr)
+
+            packed_attention.attribute.extend(attributes)
+            packed_attention.domain = "com.microsoft"
+            self.nodes_to_add.append(packed_attention)
+            self.nodes_to_remove.append(attention)
+            self.node_name_to_graph_name[packed_attention.name] = self.this_graph_name
+
+        logger.info("Converted %d Attention nodes to PackedAttention.", len(self.attention_nodes))
+
+
+class PackingMultiHeadAttention(PackingAttentionBase):
+    def __init__(self, model: OnnxModel):
+        super().__init__(model, Operators.MULTI_HEAD_ATTENTION)
+
+    def _check_empty_input(self, node, index: int, name: str):
+        """Check a node does not have given input."""
+        if len(node.input) > index:
+            if len(node.input[index]) > 0:
+                logger.error(f"node input {index} ({name}) is not supported in PackedMultiHeadAttention: {node}")
+                return False
+        return True
+
+    def _check_empty_output(self, node, index: int, name: str):
+        """Check a node does not have given output."""
+        if len(node.output) > index:
+            if len(node.output[index]) > 0:
+                logger.error(f"node output {index} ({name}) is not supported in PackedMultiHeadAttention: {node}")
+                return False
+        return True
+
+    def _are_attentions_supportted(self) -> bool:
+        for node in self.attention_nodes:
+            for attr in node.attribute:
+                if attr.name not in ["num_heads", "mask_filter_value", "scale"]:
+                    logger.error(f"node attribute {attr.name} is not supported in PackedMultiHeadAttention: {node}")
+                    return False
+            # Key given without value is treated as the packed kv format, which is rejected.
+            if node.input[MultiHeadAttentionInputIDs.KEY] and not node.input[MultiHeadAttentionInputIDs.VALUE]:
+                logger.error("packed kv format is not supported in PackedMultiHeadAttention")
+                return False
+            # Past key/value inputs and present key/value outputs must be absent or empty.
+            if not (
+                self._check_empty_input(node, MultiHeadAttentionInputIDs.PAST_KEY, "past_key")
+                and self._check_empty_input(node, MultiHeadAttentionInputIDs.PAST_VALUE, "past_value")
+                and self._check_empty_output(node, MultiHeadAttentionOutputIDs.PRESENT_KEY, "present_key")
+                and self._check_empty_output(node, MultiHeadAttentionOutputIDs.PRESENT_VALUE, "present_value")
+            ):
+                return False
+
+        return True
+
+    def _replace_attention_with_packing_attention(self, token_offset: str, cumulative_sequence_length: str) -> None:
+        gated_relative_pos_bias_count = 0
+        for mha in self.attention_nodes:
+            relative_pos_bias = (
+                mha.input[MultiHeadAttentionInputIDs.RELATIVE_POSITION_BIAS]
+                if len(mha.input) > MultiHeadAttentionInputIDs.RELATIVE_POSITION_BIAS
+                else ""
+            )
+            packed_mha = helper.make_node(
+                Operators.PACKED_MULTI_HEAD_ATTENTION,
+                inputs=[
+                    mha.input[MultiHeadAttentionInputIDs.QUERY],
+                    mha.input[MultiHeadAttentionInputIDs.KEY],
+                    mha.input[MultiHeadAttentionInputIDs.VALUE],
+                    mha.input[MultiHeadAttentionInputIDs.BIAS],
+                    token_offset,
+                    cumulative_sequence_length,
+                    relative_pos_bias,
+                ],
+                outputs=[mha.output[MultiHeadAttentionOutputIDs.OUTPUT]],
+                name=self.model.create_node_name(Operators.PACKED_MULTI_HEAD_ATTENTION),
+            )
+
+            attributes = []
+            for attr in mha.attribute:
+                if attr.name in ["num_heads", "mask_filter_value", "scale"]:
+                    attributes.append(attr)
+
+            packed_mha.attribute.extend(attributes)
+            packed_mha.domain = "com.microsoft"
+            self.nodes_to_add.append(packed_mha)
+            self.nodes_to_remove.append(mha)
+            self.node_name_to_graph_name[packed_mha.name] = self.this_graph_name
+
+            # Append token_offset input to GatedRelativePositionBias
+            if relative_pos_bias:
+                rel_pos_bias_node = self.model.get_parent(mha, MultiHeadAttentionInputIDs.RELATIVE_POSITION_BIAS)
+                if (
+                    rel_pos_bias_node
+                    and rel_pos_bias_node.op_type == "GatedRelativePositionBias"
+                    and len(rel_pos_bias_node.input) == 6
+                ):
+                    rel_pos_bias_node.input.append(token_offset)
+                    gated_relative_pos_bias_count += 1
+
+        logger.info("Converted %d MultiHeadAttention nodes to PackedMultiHeadAttention.", len(self.attention_nodes))
+        logger.info("Converted %d GatedRelativePositionBias nodes to packing mode.", gated_relative_pos_bias_count)
+
+    def _get_input_to_remove_padding(self, first_attention_node) -> Union[str, None]:
+        # When there are query, key and value inputs, we need to find the first input of the parent MatMul node.
+        matmul = self.model.get_parent(first_attention_node, 0)
+        if matmul and matmul.op_type == "MatMul":
+            return matmul.input[0]
+        return None
+
+
+class PackingMode:
+    def __init__(self, model: OnnxModel):
+        self.model = model
+
+    def convert(self, use_symbolic_shape_infer: bool = True) -> None:
+        if self.model.get_nodes_by_op_type(Operators.ATTENTION):
+            if self.model.get_nodes_by_op_type(Operators.MULTI_HEAD_ATTENTION):
+                logger.error("Packing mode does not support both Attention and MultiHeadAttention in same graph.")
+                return None
+            packing = PackingAttention(self.model)
+            return packing.convert(use_symbolic_shape_infer)
+        elif self.model.get_nodes_by_op_type(Operators.MULTI_HEAD_ATTENTION):
+            packing = PackingMultiHeadAttention(self.model)
+            return packing.convert(use_symbolic_shape_infer)
+        else:
+            logger.error("Packing mode requires either Attention or MultiHeadAttention node in onnx graph.")
+            return None
+
+
 def _parse_arguments():
     parser = argparse.ArgumentParser(
         description="Convert to packing mode tool for ONNX Runtime. It converts BERT like model to use packing mode."
@@ -226,7 +372,7 @@ def main():
 
     _setup_logger(args.verbose)
 
-    logger.debug("arguments:{args}")
+    logger.debug(f"arguments:{args}")
 
     if os.path.realpath(args.input) == os.path.realpath(args.output):
         logger.warning("Specified the same input and output path. Note that this may overwrite the original model")

From a25d0d296b1a5d2677c34b3ca8c06444b86879fc Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Thu, 3 Aug 2023 15:24:20 -0700
Subject: [PATCH 5/5] Add --mask_type option to generate different format of
 attention mask in bert_perf_test.py (#16976)

### Description
Add an option to generate different formats of attention_mask for
testing transformers models:
1 - 1D mask index, actual sequence length excluding padding
2 - 2D attention mask. Value 0 means padding, 1 otherwise.
3 - 1D, key lengths and cumulated sequence lengths of query and key

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 .../tools/transformers/bert_perf_test.py      | 13 +++-
 .../tools/transformers/bert_test_data.py      | 78 +++++++++++++++----
 .../transformers/compare_bert_results.py      | 15 +++-
 3 files changed, 90 insertions(+), 16 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py
index c843831be6779..984814f2f5cbb 100644
--- a/onnxruntime/python/tools/transformers/bert_perf_test.py
+++ b/onnxruntime/python/tools/transformers/bert_perf_test.py
@@ -57,6 +57,7 @@ class ModelSetting:
     opt_level: int
     input_tuning_results: Optional[str]
     output_tuning_results: Optional[str]
+    mask_type: int
 
 
 def create_session(
@@ -369,6 +370,7 @@ def run_performance(model_setting, test_setting, perf_results):
         input_mask,
         test_setting.average_sequence_length,
         test_setting.random_sequence_length,
+        mask_type=model_setting.mask_type,
     )
 
     run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
@@ -524,6 +526,14 @@ def parse_arguments():
     )
     parser.set_defaults(random_sequence_length=False)
 
+    parser.add_argument(
+        "--mask_type",
+        required=False,
+        type=int,
+        default=2,
+        help="mask type: (1: mask index or sequence length, 2: raw 2D mask, 3: key len, cumulated lengths of query and key)",
+    )
+
     args = parser.parse_args()
     return args
 
@@ -541,7 +551,7 @@ def main():
     perf_results = manager.dict()
 
     batch_size_set = set(args.batch_size)
-    if not min(batch_size_set) >= 1 and max(batch_size_set) <= 128:
+    if not (min(batch_size_set) >= 1 and max(batch_size_set) <= 128):
         raise Exception("batch_size not in range [1, 128]")
 
     model_setting = ModelSetting(
@@ -552,6 +562,7 @@ def main():
         args.opt_level,
         args.input_tuning_results,
         args.output_tuning_results,
+        args.mask_type,
     )
 
     for batch_size in batch_size_set:
diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py
index bed9eb4dbc1f1..6a7139d0e8085 100644
--- a/onnxruntime/python/tools/transformers/bert_test_data.py
+++ b/onnxruntime/python/tools/transformers/bert_test_data.py
@@ -74,12 +74,23 @@ def fake_segment_ids_data(segment_ids: TensorProto, batch_size: int, sequence_le
     return data
 
 
+def get_random_length(max_sequence_length: int, average_sequence_length: int):
+    assert average_sequence_length >= 1 and average_sequence_length <= max_sequence_length
+
+    # For uniform distribution, we find proper lower and upper bounds so that the average is in the middle.
+    if 2 * average_sequence_length > max_sequence_length:
+        return random.randint(2 * average_sequence_length - max_sequence_length, max_sequence_length)
+    else:
+        return random.randint(1, 2 * average_sequence_length - 1)
+
+
 def fake_input_mask_data(
     input_mask: TensorProto,
     batch_size: int,
     sequence_length: int,
     average_sequence_length: int,
     random_sequence_length: bool,
+    mask_type: int = 2,
 ) -> np.ndarray:
     """Create input tensor based on the graph input of segment_ids.
 
@@ -89,6 +100,9 @@ def fake_input_mask_data(
         sequence_length (int): sequence length
         average_sequence_length (int): average sequence length excluding paddings
         random_sequence_length (bool): whether use uniform random number for sequence length
+        mask_type (int): mask type - 1: mask index (sequence length excluding paddings). Shape is (batch_size).
+                                     2: 2D attention mask. Shape is (batch_size, sequence_length).
+                                     3: key len, cumulated lengths of query and key. Shape is (3 * batch_size + 2).
 
     Returns:
         np.ndarray: the input tensor created
@@ -100,20 +114,40 @@ def fake_input_mask_data(
         TensorProto.INT64,
     ]
 
-    data = np.zeros((batch_size, sequence_length), dtype=np.int32)
-    if random_sequence_length:
-        for i in range(batch_size):
-            # We use uniform distribution, so we find proper minimal and maximal so that the average is in the middle.
-            if 2 * average_sequence_length > sequence_length:
-                actual_seq_len = random.randint(2 * average_sequence_length - sequence_length, sequence_length)
-            else:
-                actual_seq_len = random.randint(1, 2 * average_sequence_length - 1)
-
-            for j in range(actual_seq_len):
-                data[i, j] = 1
+    if mask_type == 1:  # sequence length excluding paddings
+        data = np.ones((batch_size), dtype=np.int32)
+        if random_sequence_length:
+            for i in range(batch_size):
+                data[i] = get_random_length(sequence_length, average_sequence_length)
+        else:
+            for i in range(batch_size):
+                data[i] = average_sequence_length
+    elif mask_type == 2:  # 2D attention mask
+        data = np.zeros((batch_size, sequence_length), dtype=np.int32)
+        if random_sequence_length:
+            for i in range(batch_size):
+                actual_seq_len = get_random_length(sequence_length, average_sequence_length)
+                for j in range(actual_seq_len):
+                    data[i, j] = 1
+        else:
+            temp = np.ones((batch_size, average_sequence_length), dtype=np.int32)
+            data[: temp.shape[0], : temp.shape[1]] = temp
     else:
-        temp = np.ones((batch_size, average_sequence_length), dtype=np.int32)
-        data[: temp.shape[0], : temp.shape[1]] = temp
+        assert mask_type == 3
+        data = np.zeros((batch_size * 3 + 2), dtype=np.int32)
+        if random_sequence_length:
+            for i in range(batch_size):
+                data[i] = get_random_length(sequence_length, average_sequence_length)
+
+            for i in range(batch_size + 1):
+                data[batch_size + i] = data[batch_size + i - 1] + data[i - 1] if i > 0 else 0
+                data[2 * batch_size + 1 + i] = data[batch_size + i - 1] + data[i - 1] if i > 0 else 0
+        else:
+            for i in range(batch_size):
+                data[i] = average_sequence_length
+            for i in range(batch_size + 1):
+                data[batch_size + i] = i * average_sequence_length
+                data[2 * batch_size + 1 + i] = i * average_sequence_length
 
     if input_mask.type.tensor_type.elem_type == TensorProto.FLOAT:
         data = np.float32(data)
@@ -160,6 +194,7 @@ def fake_test_data(
     input_mask: TensorProto,
     average_sequence_length: int,
     random_sequence_length: bool,
+    mask_type: int,
 ):
     """Create given number of input data for testing
 
@@ -175,6 +210,7 @@ def fake_test_data(
         input_mask (TensorProto): graph input of attention mask
         average_sequence_length (int): average sequence length excluding paddings
         random_sequence_length (bool): whether use uniform random number for sequence length
+        mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key
 
     Returns:
         List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary
@@ -195,7 +231,7 @@ def fake_test_data(
 
         if input_mask:
             inputs[input_mask.name] = fake_input_mask_data(
-                input_mask, batch_size, sequence_length, average_sequence_length, random_sequence_length
+                input_mask, batch_size, sequence_length, average_sequence_length, random_sequence_length, mask_type
             )
 
         if verbose and len(all_inputs) == 0:
@@ -215,6 +251,7 @@ def generate_test_data(
     input_mask: TensorProto,
     average_sequence_length: int,
     random_sequence_length: bool,
+    mask_type: int,
 ):
     """Create given number of input data for testing
 
@@ -229,6 +266,7 @@ def generate_test_data(
         input_mask (TensorProto): graph input of attention mask
         average_sequence_length (int): average sequence length excluding paddings
         random_sequence_length (bool): whether use uniform random number for sequence length
+        mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key
 
     Returns:
         List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary
@@ -247,6 +285,7 @@ def generate_test_data(
         input_mask,
         average_sequence_length,
         random_sequence_length,
+        mask_type,
     )
     if len(all_inputs) != test_cases:
         print("Failed to create test data for test.")
@@ -474,6 +513,14 @@ def parse_arguments():
     )
     parser.set_defaults(random_sequence_length=False)
 
+    parser.add_argument(
+        "--mask_type",
+        required=False,
+        type=int,
+        default=2,
+        help="mask type: (1: mask index, 2: raw 2D mask, 3: key lengths, cumulated lengths of query and key)",
+    )
+
     args = parser.parse_args()
     return args
 
@@ -492,6 +539,7 @@ def create_and_save_test_data(
     only_input_tensors: bool,
     average_sequence_length: int,
     random_sequence_length: bool,
+    mask_type: int,
 ):
     """Create test data for a model, and save test data to a directory.
 
@@ -509,6 +557,7 @@ def create_and_save_test_data(
         only_input_tensors (bool): only save input tensors,
         average_sequence_length (int): average sequence length excluding paddings
         random_sequence_length (bool): whether use uniform random number for sequence length
+        mask_type (int): mask type 1 is mask index; 2 is 2D mask; 3 is key len, cumulated lengths of query and key
     """
     input_ids, segment_ids, input_mask = get_bert_inputs(model, input_ids_name, segment_ids_name, input_mask_name)
 
@@ -523,6 +572,7 @@ def create_and_save_test_data(
         input_mask,
         average_sequence_length,
         random_sequence_length,
+        mask_type,
     )
 
     for i, inputs in enumerate(all_inputs):
diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py
index 33562acfd2242..4cb9585962143 100644
--- a/onnxruntime/python/tools/transformers/compare_bert_results.py
+++ b/onnxruntime/python/tools/transformers/compare_bert_results.py
@@ -89,6 +89,7 @@ def run_test(
     input_ids_name,
     segment_ids_name,
     input_mask_name,
+    mask_type,
 ):
     # Try deduce input names from optimized model.
     input_ids, segment_ids, input_mask = get_bert_inputs(
@@ -96,6 +97,7 @@ def run_test(
     )
 
     # Use random mask length for accuracy test. It might introduce slight inflation in latency reported in this script.
+    average_sequence_length = int(sequence_length / 2) if sequence_length >= 2 else sequence_length
     all_inputs = generate_test_data(
         batch_size,
         sequence_length,
@@ -105,7 +107,9 @@ def run_test(
         input_ids,
         segment_ids,
         input_mask,
-        random_mask_length=True,
+        average_sequence_length,
+        True,  # random sequence length
+        mask_type,
     )
 
     baseline_results, baseline_latency, output_names = run_model(
@@ -208,6 +212,14 @@ def parse_arguments():
         help="input name for attention mask",
     )
 
+    parser.add_argument(
+        "--mask_type",
+        required=False,
+        type=int,
+        default=2,
+        help="mask type: (1: mask index or sequence length, 2: raw 2D mask, 3: key len, cumulated lengths of query and key)",
+    )
+
     args = parser.parse_args()
     return args
 
@@ -235,6 +247,7 @@ def main():
         args.input_ids,
         args.segment_ids,
         args.input_mask,
+        args.mask_type,
     )