From fc1f463ff11e6da06d231e54bb6a10d23f50cfe5 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Wed, 19 Jul 2023 19:55:00 -0700 Subject: [PATCH 01/34] [ios] Enable training package in packaging pipeline (#16683) Build iOS training package in packaging pipeline. Refactor iOS packaging pipeline to build different package variants in parallel. --- .../apple/assemble_ios_packaging_artifacts.sh | 43 +-- ...ult_full_ios_framework_build_settings.json | 4 +- ...training_ios_framework_build_settings.json | 2 + .../upload_pod_archive_and_update_podspec.sh | 30 ++ .../mac-ios-packaging-pipeline.yml | 326 ++++++------------ .../stages/mac-ios-packaging-build-stage.yml | 162 +++++++++ 6 files changed, 310 insertions(+), 257 deletions(-) create mode 100755 tools/ci_build/github/apple/upload_pod_archive_and_update_podspec.sh create mode 100644 tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml diff --git a/tools/ci_build/github/apple/assemble_ios_packaging_artifacts.sh b/tools/ci_build/github/apple/assemble_ios_packaging_artifacts.sh index e71ac75e6c3ab..317048506ac67 100755 --- a/tools/ci_build/github/apple/assemble_ios_packaging_artifacts.sh +++ b/tools/ci_build/github/apple/assemble_ios_packaging_artifacts.sh @@ -6,7 +6,7 @@ set -e set -x -USAGE_TEXT="Usage: ${0} " +USAGE_TEXT="Usage: ${0} " abspath() { local INPUT_PATH=${1:?"Expected path as the first argument."} @@ -17,39 +17,16 @@ abspath() { BINARIES_STAGING_DIR=$(abspath "${1:?${USAGE_TEXT}}") # staging directory for build artifacts (destination) ARTIFACTS_STAGING_DIR=$(abspath "${2:?${USAGE_TEXT}}") -ORT_POD_VERSION=${3:?${USAGE_TEXT}} -SHOULD_UPLOAD_ARCHIVES=${4:?${USAGE_TEXT}} +POD_NAME=${3:?${USAGE_TEXT}} +ORT_POD_VERSION=${4:?${USAGE_TEXT}} -STORAGE_ACCOUNT_NAME="onnxruntimepackages" -STORAGE_ACCOUNT_CONTAINER_NAME='$web' -STORAGE_URL_PREFIX=$(az storage account show --name ${STORAGE_ACCOUNT_NAME} --query "primaryEndpoints.web" --output tsv) +POD_ARCHIVE_BASENAME="pod-archive-${POD_NAME}-${ORT_POD_VERSION}.zip" +PODSPEC_BASENAME="${POD_NAME}.podspec" -assemble_and_upload_pod() { - local POD_NAME=${1:?"Expected pod name as first argument."} - local POD_ARCHIVE_BASENAME="pod-archive-${POD_NAME}-${ORT_POD_VERSION}.zip" - local PODSPEC_BASENAME="${POD_NAME}.podspec" +pushd "${BINARIES_STAGING_DIR}/${POD_NAME}" - pushd ${BINARIES_STAGING_DIR}/${POD_NAME} +# assemble the files in the artifacts staging directory +zip -r "${ARTIFACTS_STAGING_DIR}/${POD_ARCHIVE_BASENAME}" ./* --exclude "${PODSPEC_BASENAME}" +cp "${PODSPEC_BASENAME}" "${ARTIFACTS_STAGING_DIR}/${PODSPEC_BASENAME}" - # assemble the files in the artifacts staging directory - zip -r ${ARTIFACTS_STAGING_DIR}/${POD_ARCHIVE_BASENAME} * --exclude ${PODSPEC_BASENAME} - cp ${PODSPEC_BASENAME} ${ARTIFACTS_STAGING_DIR}/${PODSPEC_BASENAME} - - if [[ "${SHOULD_UPLOAD_ARCHIVES}" == "true" ]]; then - # upload the pod archive and set the podspec source to the pod archive URL - az storage blob upload \ - --account-name ${STORAGE_ACCOUNT_NAME} --container-name ${STORAGE_ACCOUNT_CONTAINER_NAME} \ - --file ${ARTIFACTS_STAGING_DIR}/${POD_ARCHIVE_BASENAME} --name ${POD_ARCHIVE_BASENAME} \ - --if-none-match "*" - - sed -i "" -e "s|file:///http_source_placeholder|${STORAGE_URL_PREFIX}${POD_ARCHIVE_BASENAME}|" \ - ${ARTIFACTS_STAGING_DIR}/${PODSPEC_BASENAME} - fi - - popd -} - -assemble_and_upload_pod "onnxruntime-mobile-c" -assemble_and_upload_pod "onnxruntime-mobile-objc" -assemble_and_upload_pod "onnxruntime-c" 
-assemble_and_upload_pod "onnxruntime-objc" +popd diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index fd07074209d20..621af55fad7fa 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -14,9 +14,9 @@ "--use_xcode", "--build_apple_framework", "--use_coreml", + "--use_xnnpack", "--skip_tests", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF", - "--apple_deploy_target=12.0", - "--use_xnnpack" + "--apple_deploy_target=12.0" ] } diff --git a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json index aa9bdc483d609..ec7fcafce04f2 100644 --- a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json @@ -14,6 +14,8 @@ "--use_xcode", "--enable_training_apis", "--build_apple_framework", + "--use_coreml", + "--use_xnnpack", "--skip_tests", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF", "--apple_deploy_target=12.0" diff --git a/tools/ci_build/github/apple/upload_pod_archive_and_update_podspec.sh b/tools/ci_build/github/apple/upload_pod_archive_and_update_podspec.sh new file mode 100755 index 0000000000000..27b391fb0dc1b --- /dev/null +++ b/tools/ci_build/github/apple/upload_pod_archive_and_update_podspec.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Note: This script is intended to be called from the iOS CocoaPods package release pipeline or a similar context. + +set -e +set -x + +USAGE_TEXT="Usage: ${0} " + +abspath() { + local INPUT_PATH=${1:?"Expected path as the first argument."} + echo "$(cd "$(dirname "${INPUT_PATH}")" && pwd)/$(basename "${INPUT_PATH}")" +} + +POD_ARCHIVE_PATH=$(abspath "${1:?${USAGE_TEXT}}") +PODSPEC_PATH=$(abspath "${2:?${USAGE_TEXT}}") + +POD_ARCHIVE_BASENAME=$(basename "${POD_ARCHIVE_PATH}") + +STORAGE_ACCOUNT_NAME="onnxruntimepackages" +STORAGE_ACCOUNT_CONTAINER_NAME="\$web" +STORAGE_URL_PREFIX=$(az storage account show --name ${STORAGE_ACCOUNT_NAME} --query "primaryEndpoints.web" --output tsv) + +# upload the pod archive and set the podspec source to the pod archive URL +az storage blob upload \ + --account-name ${STORAGE_ACCOUNT_NAME} --container-name ${STORAGE_ACCOUNT_CONTAINER_NAME} \ + --file "${POD_ARCHIVE_PATH}" --name "${POD_ARCHIVE_BASENAME}" \ + --if-none-match "*" + +sed -i "" -e "s|file:///http_source_placeholder|${STORAGE_URL_PREFIX}${POD_ARCHIVE_BASENAME}|" "${PODSPEC_PATH}" diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index d7509180860c1..07fd5c4974d2f 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -1,235 +1,117 @@ parameters: -- name: BuildType +- name: buildType displayName: |- Type of build. "release": A release build to be published for an official ONNX Runtime release. - "prerelease": A pre-release build to be published for validation prior to release. - "normal": A normal build not for publication. + "normal": A normal build. This can be published as a pre-release build for validation prior to release. 
type: string values: - release - - prerelease - normal default: normal name: "$(Date:yyyyMMdd)$(Rev:rrr)" # build number format -jobs: -- job: IosPackaging - displayName: "iOS Packaging" - - pool: - vmImage: "macOS-13" - +stages: +- stage: IosPackaging_SetCommonVariables + dependsOn: [] + variables: - xcodeVersion: "14.3" - - timeoutInMinutes: 300 - - steps: - - task: InstallAppleCertificate@2 - inputs: - certSecureFile: '$(ios_signing_certificate_name)' - certPwd: '$(ios_signing_certificate_password)' - keychain: 'temp' - deleteCert: true - displayName: 'Install ORT Mobile Test Signing Certificate' - - - task: InstallAppleProvisioningProfile@1 - inputs: - provProfileSecureFile: '$(ios_provision_profile_name)' - removeProfile: true - displayName: 'Install ORT Mobile Test Provisioning Profile' - - - task: UsePythonVersion@0 - inputs: - versionSpec: "3.9" - addToPath: true - architecture: "x64" - - - template: templates/use-xcode-version.yml - parameters: - xcodeVersion: ${{ variables.xcodeVersion }} - - - template: templates/install-appcenter.yml - - - script: | - pip install -r tools/ci_build/github/apple/ios_packaging.requirements.txt - displayName: "Install Python requirements" - - - bash: | - set -e - - BUILD_TYPE="${{ parameters.BuildType }}" - BASE_VERSION="$(cat ./VERSION_NUMBER)" - SHORT_COMMIT_HASH="$(git rev-parse --short HEAD)" - DEV_VERSION="${BASE_VERSION}-dev+$(Build.BuildNumber).${SHORT_COMMIT_HASH}" - - case "${BUILD_TYPE}" in - ("release") - VERSION="${BASE_VERSION}"; SHOULD_UPLOAD_ARCHIVES="true" ;; - ("prerelease") - VERSION="${DEV_VERSION}"; SHOULD_UPLOAD_ARCHIVES="true" ;; - ("normal") - VERSION="${DEV_VERSION}"; SHOULD_UPLOAD_ARCHIVES="false" ;; - (*) - echo "Invalid build type: ${BUILD_TYPE}"; exit 1 ;; - esac - - # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
- set +x - - set_var() { - local VAR_NAME=${1:?} - local VAR_VALUE=${2:?} - echo "##vso[task.setvariable variable=${VAR_NAME}]${VAR_VALUE}" - echo "${VAR_NAME}: ${VAR_VALUE}" - } - - set_var "ORT_POD_VERSION" "${VERSION}" - set_var "ORT_SHOULD_UPLOAD_ARCHIVES" "${SHOULD_UPLOAD_ARCHIVES}" - displayName: "Set variables" - - - script: | - $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh -p $(Build.BinariesDirectory)/protobuf_install -d $(Build.SourcesDirectory)/cmake/deps.txt - displayName: "Build Host Protoc" - - # create and test mobile pods - - script: | - python tools/ci_build/github/apple/build_and_assemble_ios_pods.py \ - --build-dir "$(Build.BinariesDirectory)/ios_framework_mobile" \ - --staging-dir "$(Build.BinariesDirectory)/staging" \ - --pod-version "${ORT_POD_VERSION}" \ - --test \ - --variant Mobile \ - --build-settings-file tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json \ - --include-ops-by-config tools/ci_build/github/android/mobile_package.required_operators.config \ - -b="--path_to_protoc_exe" -b "$(Build.BinariesDirectory)/protobuf_install/bin/protoc" - displayName: "[Mobile] Build iOS framework and assemble pod package files" - - - script: | - python tools/ci_build/github/apple/test_ios_packages.py \ - --fail_if_cocoapods_missing \ - --framework_info_file "$(Build.BinariesDirectory)/ios_framework_mobile/framework_info.json" \ - --c_framework_dir "$(Build.BinariesDirectory)/ios_framework_mobile/framework_out" \ - --variant Mobile \ - --test_project_stage_dir "$(Build.BinariesDirectory)/app_center_test_mobile" \ - --prepare_test_project_only - displayName: "[Mobile] Assemble test project for App Center" - - - task: Xcode@5 - inputs: - actions: 'build-for-testing' - configuration: 'Debug' - xcWorkspacePath: '$(Build.BinariesDirectory)/app_center_test_mobile/ios_package_test/ios_package_test.xcworkspace' - sdk: 'iphoneos' - scheme: 'ios_package_test' - xcodeVersion: 'specifyPath' - xcodeDeveloperDir: '/Applications/Xcode_${{ variables.xcodeVersion }}.app/Contents/Developer' - signingOption: 'manual' - signingIdentity: '$(APPLE_CERTIFICATE_SIGNING_IDENTITY)' - provisioningProfileName: 'iOS Team Provisioning Profile' - args: '-derivedDataPath $(Build.BinariesDirectory)/app_center_test_mobile/ios_package_test/DerivedData' - workingDirectory: '$(Build.BinariesDirectory)/app_center_test_mobile/ios_package_test/' - displayName: '[Mobile] Build iphone arm64 tests' - - - script: | - set -e -x - appcenter test run xcuitest \ - --app "AI-Frameworks/ORT-Mobile-iOS" \ - --devices $(app_center_test_devices) \ - --test-series "master" \ - --locale "en_US" \ - --build-dir $(Build.BinariesDirectory)/app_center_test_mobile/ios_package_test/DerivedData/Build/Products/Debug-iphoneos \ - --token $(app_center_api_token) - displayName: "[Mobile] Run E2E tests on App Center" - - # create and test full pods - - script: | - python tools/ci_build/github/apple/build_and_assemble_ios_pods.py \ - --build-dir "$(Build.BinariesDirectory)/ios_framework_full" \ - --staging-dir "$(Build.BinariesDirectory)/staging" \ - --pod-version "${ORT_POD_VERSION}" \ - --test \ - --variant Full \ - --build-settings-file tools/ci_build/github/apple/default_full_ios_framework_build_settings.json \ - -b="--path_to_protoc_exe" -b "$(Build.BinariesDirectory)/protobuf_install/bin/protoc" - displayName: "[Full] Build iOS framework and assemble pod package files" - - - script: | - python tools/ci_build/github/apple/test_ios_packages.py \ 
- --fail_if_cocoapods_missing \ - --framework_info_file "$(Build.BinariesDirectory)/ios_framework_full/framework_info.json" \ - --c_framework_dir "$(Build.BinariesDirectory)/ios_framework_full/framework_out" \ - --variant Full \ - --test_project_stage_dir "$(Build.BinariesDirectory)/app_center_test_full" \ - --prepare_test_project_only - displayName: "[Full] Assemble test project for App Center" - - - task: Xcode@5 - inputs: - actions: 'build-for-testing' - configuration: 'Debug' - xcWorkspacePath: '$(Build.BinariesDirectory)/app_center_test_full/ios_package_test/ios_package_test.xcworkspace' - sdk: 'iphoneos' - scheme: 'ios_package_test' - signingOption: 'manual' - signingIdentity: '$(APPLE_CERTIFICATE_SIGNING_IDENTITY)' - provisioningProfileName: 'iOS Team Provisioning Profile' - args: '-derivedDataPath $(Build.BinariesDirectory)/app_center_test_full/ios_package_test/DerivedData' - workingDirectory: $(Build.BinariesDirectory)/app_center_test_full/ios_package_test/ - displayName: '[Full] Build iphone arm64 tests' - - - script: | - set -e -x - appcenter test run xcuitest \ - --app "AI-Frameworks/ORT-Mobile-iOS" \ - --devices $(app_center_test_devices) \ - --test-series "master" \ - --locale "en_US" \ - --build-dir $(Build.BinariesDirectory)/app_center_test_full/ios_package_test/DerivedData/Build/Products/Debug-iphoneos \ - --token $(app_center_api_token) - displayName: "[Full] Run E2E tests on App Center" - - - task: AzureCLI@2 - inputs: - azureSubscription: 'AIInfraBuildOnnxRuntimeOSS' - scriptType: 'bash' - scriptLocation: 'scriptPath' - scriptPath: 'tools/ci_build/github/apple/assemble_ios_packaging_artifacts.sh' - arguments: >- - "$(Build.BinariesDirectory)/staging" - "$(Build.ArtifactStagingDirectory)" - "$(ORT_POD_VERSION)" - "$(ORT_SHOULD_UPLOAD_ARCHIVES)" - displayName: "Assemble artifacts" - - - script: | - set -e -x - ls -R "$(Build.ArtifactStagingDirectory)" - displayName: "List staged artifacts" - - - script: | - set -e -x - shasum -a 256 "$(Build.ArtifactStagingDirectory)/pod-archive-onnxruntime-c-${ORT_POD_VERSION}.zip" - displayName: "Print ORT iOS Pod checksum" - - - # copy the pod archive to a path relative to Package.swift and set the env var required by Package.swift to use that. - # xcodebuild will implicitly use Package.swift and build/run the .testTarget (tests in swift/onnxTests). 
- # once that's done cleanup the copy of the pod zip file - - script: | - set -e -x - cp "$(Build.ArtifactStagingDirectory)/pod-archive-onnxruntime-c-${ORT_POD_VERSION}.zip" swift/ - export ORT_IOS_POD_LOCAL_PATH="swift/pod-archive-onnxruntime-c-${ORT_POD_VERSION}.zip" - xcodebuild test -scheme onnxruntime -destination 'platform=iOS Simulator,name=iPhone 14' - rm swift/pod-archive-onnxruntime-c-*.zip - displayName: "Test Package.swift usage" - - - publish: "$(Build.ArtifactStagingDirectory)" - artifact: ios_packaging_artifacts - displayName: "Publish artifacts" - - - template: templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' + skipComponentGovernanceDetection: true + + jobs: + - job: j + displayName: "Set common variables" + + pool: + vmImage: "macOS-13" + + timeoutInMinutes: 5 + + steps: + - bash: | + set -e + + BUILD_TYPE="${{ parameters.buildType }}" + BASE_VERSION="$(cat ./VERSION_NUMBER)" + SHORT_COMMIT_HASH="$(git rev-parse --short HEAD)" + DEV_VERSION="${BASE_VERSION}-dev+$(Build.BuildNumber).${SHORT_COMMIT_HASH}" + + case "${BUILD_TYPE}" in + ("release") + VERSION="${BASE_VERSION}" ;; + ("normal") + VERSION="${DEV_VERSION}" ;; + (*) + echo "Invalid build type: ${BUILD_TYPE}"; exit 1 ;; + esac + + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. + set +x + + set_var() { + local VAR_NAME=${1:?} + local VAR_VALUE=${2:?} + echo "##vso[task.setvariable variable=${VAR_NAME};isoutput=true;isreadonly=true]${VAR_VALUE}" + echo "${VAR_NAME}: ${VAR_VALUE}" + } + + set_var "ORT_POD_VERSION" "${VERSION}" + displayName: "Set common variables" + name: SetCommonVariables + +- template: templates/stages/mac-ios-packaging-build-stage.yml + parameters: + packageVariant: Mobile + +- template: templates/stages/mac-ios-packaging-build-stage.yml + parameters: + packageVariant: Full + +- template: templates/stages/mac-ios-packaging-build-stage.yml + parameters: + packageVariant: Training + +- stage: IosPackaging_TestPackageSwift_Full + dependsOn: + - IosPackaging_SetCommonVariables + - IosPackaging_Build_Full + + jobs: + - job: j + displayName: "Test Package.swift with full package" + + pool: + vmImage: "macOS-13" + + variables: + xcodeVersion: "14.3" + ortPodVersion: $[stageDependencies.IosPackaging_SetCommonVariables.j.outputs['SetCommonVariables.ORT_POD_VERSION']] + skipComponentGovernanceDetection: true + + timeoutInMinutes: 10 + + steps: + - template: templates/use-xcode-version.yml + parameters: + xcodeVersion: ${{ variables.xcodeVersion }} + + - download: current + artifact: ios_packaging_artifacts_full + displayName: "Download full build artifacts" + + - script: | + set -e -x + shasum -a 256 "$(Pipeline.Workspace)/ios_packaging_artifacts_full/pod-archive-onnxruntime-c-$(ortPodVersion).zip" + displayName: "Print ORT iOS Pod checksum" + + # copy the pod archive to a path relative to Package.swift and set the env var required by Package.swift to use that. + # xcodebuild will implicitly use Package.swift and build/run the .testTarget (tests in swift/onnxTests). 
+ # once that's done cleanup the copy of the pod zip file + - script: | + set -e -x + cp "$(Pipeline.Workspace)/ios_packaging_artifacts_full/pod-archive-onnxruntime-c-$(ortPodVersion).zip" swift/ + export ORT_IOS_POD_LOCAL_PATH="swift/pod-archive-onnxruntime-c-$(ortPodVersion).zip" + xcodebuild test -scheme onnxruntime -destination 'platform=iOS Simulator,name=iPhone 14' + rm swift/pod-archive-onnxruntime-c-$(ortPodVersion).zip + displayName: "Test Package.swift usage" diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml new file mode 100644 index 0000000000000..15254ce4d1d5b --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -0,0 +1,162 @@ +parameters: +- name: packageVariant + type: string + values: + - Mobile + - Full + - Training + +stages: +- stage: IosPackaging_Build_${{ parameters.packageVariant }} + dependsOn: + - IosPackaging_SetCommonVariables + + jobs: + - job: j + displayName: "Build iOS package for variant: ${{ parameters.packageVariant}}" + + pool: + vmImage: "macOS-13" + + variables: + xcodeVersion: "14.3" + ortPodVersion: $[stageDependencies.IosPackaging_SetCommonVariables.j.outputs['SetCommonVariables.ORT_POD_VERSION']] + + ${{ if eq(parameters.packageVariant, 'Mobile') }}: + buildSettingsFile: "tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json" + optionalIncludeOpsByConfigOption: "--include-ops-by-config tools/ci_build/github/android/mobile_package.required_operators.config" + cPodName: onnxruntime-mobile-c + objcPodName: onnxruntime-mobile-objc + + ${{ if eq(parameters.packageVariant, 'Full') }}: + buildSettingsFile: "tools/ci_build/github/apple/default_full_ios_framework_build_settings.json" + cPodName: onnxruntime-c + objcPodName: onnxruntime-objc + + ${{ if eq(parameters.packageVariant, 'Training') }}: + buildSettingsFile: "tools/ci_build/github/apple/default_training_ios_framework_build_settings.json" + cPodName: onnxruntime-training-c + objcPodName: onnxruntime-training-objc + + timeoutInMinutes: 120 + + steps: + - script: | + if [[ -z "$(ortPodVersion)" ]]; then + echo "ORT pod version is unspecified. Make sure that the IosPackaging_SetCommonVariables stage has run." 
+ exit 1 + fi + displayName: 'Ensure version is set' + + - task: InstallAppleCertificate@2 + inputs: + certSecureFile: '$(ios_signing_certificate_name)' + certPwd: '$(ios_signing_certificate_password)' + keychain: 'temp' + deleteCert: true + displayName: 'Install ORT Mobile Test Signing Certificate' + + - task: InstallAppleProvisioningProfile@1 + inputs: + provProfileSecureFile: '$(ios_provision_profile_name)' + removeProfile: true + displayName: 'Install ORT Mobile Test Provisioning Profile' + + - task: UsePythonVersion@0 + inputs: + versionSpec: "3.9" + addToPath: true + architecture: "x64" + + - template: ../use-xcode-version.yml + parameters: + xcodeVersion: ${{ variables.xcodeVersion }} + + - template: ../install-appcenter.yml + + - script: | + pip install -r tools/ci_build/github/apple/ios_packaging.requirements.txt + displayName: "Install Python requirements" + + - script: | + $(Build.SourcesDirectory)/tools/ci_build/github/linux/docker/inference/x64/python/cpu/scripts/install_protobuf.sh -p $(Build.BinariesDirectory)/protobuf_install -d $(Build.SourcesDirectory)/cmake/deps.txt + displayName: "Build Host Protoc" + + # create and test mobile pods + - script: | + python tools/ci_build/github/apple/build_and_assemble_ios_pods.py \ + --build-dir "$(Build.BinariesDirectory)/ios_framework" \ + --staging-dir "$(Build.BinariesDirectory)/staging" \ + --pod-version "$(ortPodVersion)" \ + --test \ + --variant ${{ parameters.packageVariant }} \ + --build-settings-file "${{ variables.buildSettingsFile }}" \ + ${{ variables.optionalIncludeOpsByConfigOption }} \ + -b="--path_to_protoc_exe=$(Build.BinariesDirectory)/protobuf_install/bin/protoc" + displayName: "Build iOS framework and assemble pod package files" + + - script: | + python tools/ci_build/github/apple/test_ios_packages.py \ + --fail_if_cocoapods_missing \ + --framework_info_file "$(Build.BinariesDirectory)/ios_framework/framework_info.json" \ + --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ + --variant ${{ parameters.packageVariant }} \ + --test_project_stage_dir "$(Build.BinariesDirectory)/app_center_test" \ + --prepare_test_project_only + displayName: "Assemble test project for App Center" + + - task: Xcode@5 + inputs: + actions: 'build-for-testing' + configuration: 'Debug' + xcWorkspacePath: '$(Build.BinariesDirectory)/app_center_test/ios_package_test/ios_package_test.xcworkspace' + sdk: 'iphoneos' + scheme: 'ios_package_test' + xcodeVersion: 'specifyPath' + xcodeDeveloperDir: '/Applications/Xcode_${{ variables.xcodeVersion }}.app/Contents/Developer' + signingOption: 'manual' + signingIdentity: '$(APPLE_CERTIFICATE_SIGNING_IDENTITY)' + provisioningProfileName: 'iOS Team Provisioning Profile' + args: '-derivedDataPath $(Build.BinariesDirectory)/app_center_test/ios_package_test/DerivedData' + workingDirectory: '$(Build.BinariesDirectory)/app_center_test/ios_package_test/' + displayName: 'Build App Center iPhone arm64 tests' + + - script: | + set -e -x + appcenter test run xcuitest \ + --app "AI-Frameworks/ORT-Mobile-iOS" \ + --devices $(app_center_test_devices) \ + --test-series "master" \ + --locale "en_US" \ + --build-dir $(Build.BinariesDirectory)/app_center_test/ios_package_test/DerivedData/Build/Products/Debug-iphoneos \ + --token $(app_center_api_token) + displayName: "Run E2E tests on App Center" + + - script: | + set -e -x + + for POD_NAME in "${{ variables.cPodName}}" "${{ variables.objcPodName }}"; + do + ./tools/ci_build/github/apple/assemble_ios_packaging_artifacts.sh \ + 
"$(Build.BinariesDirectory)/staging" \ + "$(Build.ArtifactStagingDirectory)" \ + "${POD_NAME}" \ + "$(ortPodVersion)" + done + + # copy over helper script for use in release pipeline + cp tools/ci_build/github/apple/upload_pod_archive_and_update_podspec.sh "$(Build.ArtifactStagingDirectory)" + displayName: "Assemble artifacts" + + - script: | + set -e -x + ls -R "$(Build.ArtifactStagingDirectory)" + displayName: "List staged artifacts" + + - publish: "$(Build.ArtifactStagingDirectory)" + artifact: "ios_packaging_artifacts_${{ lower(parameters.packageVariant) }}" + displayName: "Publish artifacts" + + - template: ../component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' From 8b866060f20d9cb2d66bf027ef03813928e6666e Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 20 Jul 2023 14:30:29 +1000 Subject: [PATCH 02/34] Comment out ORT-Nightly feed in test app NuGet.config (#16762) ### Description Comment out ORT-Nightly feed in NuGet.config to see if that makes the Secure Supply Chain Analysis CI step happy. Add info to readme on manually adding feed and using it. ### Motivation and Context --- csharp/tools/MauiModelTester/NuGet.config | 4 +++- csharp/tools/MauiModelTester/ReadMe.md | 13 ++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/csharp/tools/MauiModelTester/NuGet.config b/csharp/tools/MauiModelTester/NuGet.config index d6fbb88c27b40..24675ee3f58f2 100644 --- a/csharp/tools/MauiModelTester/NuGet.config +++ b/csharp/tools/MauiModelTester/NuGet.config @@ -6,9 +6,11 @@ + - \ No newline at end of file + diff --git a/csharp/tools/MauiModelTester/ReadMe.md b/csharp/tools/MauiModelTester/ReadMe.md index 24f3c265446e1..43628a68601cb 100644 --- a/csharp/tools/MauiModelTester/ReadMe.md +++ b/csharp/tools/MauiModelTester/ReadMe.md @@ -16,13 +16,24 @@ Resources\Raw\test_data\test_data_set_0 The MAUI application will read the model and test data from those locations and should need no other changes to be able to execute the model. -NOTE: The project uses builds from the nightly feed to keep things simple. +The project uses builds from the nightly feed by default to keep things simple. If it was part of the main ONNX Runtime C# solution we'd have to - add the ORT nightly feed to the top level nuget.config - this potentially adds confusion about where nuget packages come from in unit tests - keep updating the referenced nightly packages so they remain valid so the complete solution builds in the CI +You will need to manually add the ORT-Nightly feed to the packageSources section of the nuget.config in this directory. + - `` + - This feed isn't allowed in the checked in nuget.config + +If you need to update the ORT packages used by the app to the latest nightly: +- In Visual Studio, Tools -> Nuget Package Manager -> Manage NuGet Packages for Solution... +- Make sure 'Include prerelease' is checked +- Set Package Source to ORT-Nightly +- Update Microsoft.ML.OnnxRuntime, Microsoft.ML.OnnxRuntime.Managed and Microsoft.ML.OnnxRuntime.Extensions to the +latest build. + ## Testing C# or native code changes If you have new code to test the easiest way is to run the nuget packaging pipeline on From c314d7724f4f2e7e95b0d94e76dbacd9f7141531 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 20 Jul 2023 16:52:13 +0800 Subject: [PATCH 03/34] Update dml gpu pool to onnxruntime-Win2022-GPU-dml-A10 (#16765) ### Description onnxruntime-Win2022-GPU-dml-A10 is using VS2022. ### Motivation and Context 1. Upgrade VS2019 to VS2022 to fix prefast issue. 
--- .../azure-pipelines/templates/py-packaging-stage.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 7bd69ef4b1269..07218772c0ab8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -303,7 +303,7 @@ stages: - template: py-win-gpu.yml parameters: - MACHINE_POOL: 'onnxruntime-Win2019-GPU-dml-A10' + MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' PYTHON_VERSION: '3.8' EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos ENV_SETUP_SCRIPT: setup_env.bat @@ -311,7 +311,7 @@ stages: - template: py-win-gpu.yml parameters: - MACHINE_POOL: 'onnxruntime-Win2019-GPU-dml-A10' + MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' PYTHON_VERSION: '3.9' EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos ENV_SETUP_SCRIPT: setup_env.bat @@ -319,7 +319,7 @@ stages: - template: py-win-gpu.yml parameters: - MACHINE_POOL: 'onnxruntime-Win2019-GPU-dml-A10' + MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' PYTHON_VERSION: '3.10' EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos ENV_SETUP_SCRIPT: setup_env.bat @@ -327,7 +327,7 @@ stages: - template: py-win-gpu.yml parameters: - MACHINE_POOL: 'onnxruntime-Win2019-GPU-dml-A10' + MACHINE_POOL: 'onnxruntime-Win2022-GPU-dml-A10' PYTHON_VERSION: '3.11' EP_BUILD_FLAGS: --use_dml --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.18362.0 --enable_wcos ENV_SETUP_SCRIPT: setup_env.bat From 2bc9fbb62141dc8ab3c9355690cf2cb2888217eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Thu, 20 Jul 2023 16:02:22 +0200 Subject: [PATCH 04/34] Fix url in the code documentation (graph optimizations) (#16770) ### Description Fix a wrong url in the documentation as mentioned in issue #16678. ### Motivation and Context Better documentation. --- include/onnxruntime/core/session/onnxruntime_c_api.h | 2 +- objectivec/include/ort_enums.h | 2 +- onnxruntime/python/tools/transformers/optimizer.py | 2 +- tools/python/util/optimize_onnx_model.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 12e68630eaf48..75359602f8ea4 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -987,7 +987,7 @@ struct OrtApi { /** \brief Set the optimization level to apply when loading a graph * - * Please see https://onnxruntime.ai/docs/performance/graph-optimizations.html for an in-depth explanation + * Please see https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html for an in-depth explanation * \param[in,out] options The session options object * \param[in] graph_optimization_level The optimization level * diff --git a/objectivec/include/ort_enums.h b/objectivec/include/ort_enums.h index d505ce2c4e5d5..6bc29aaa00965 100644 --- a/objectivec/include/ort_enums.h +++ b/objectivec/include/ort_enums.h @@ -43,7 +43,7 @@ typedef NS_ENUM(int32_t, ORTTensorElementDataType) { /** * The ORT graph optimization levels. 
* See here for more details: - * https://onnxruntime.ai/docs/performance/graph-optimizations.html + * https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html */ typedef NS_ENUM(int32_t, ORTGraphOptimizationLevel) { ORTGraphOptimizationLevelNone, diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 8719380ff2163..99b48e501481e 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -207,7 +207,7 @@ def optimize_model( ): """Optimize Model by OnnxRuntime and/or python fusion logic. - ONNX Runtime has graph optimizations (https://onnxruntime.ai/docs/performance/graph-optimizations.html). + ONNX Runtime has graph optimizations (https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html). However, the coverage is limited. We also have graph fusions that implemented in Python to improve the coverage. They can combined: ONNX Runtime will run first when opt_level > 0, then graph fusions in Python will be applied. diff --git a/tools/python/util/optimize_onnx_model.py b/tools/python/util/optimize_onnx_model.py index 4cb9b862b37cb..b7ebb54b9c8fa 100644 --- a/tools/python/util/optimize_onnx_model.py +++ b/tools/python/util/optimize_onnx_model.py @@ -14,7 +14,7 @@ def optimize_model_helper(): f"{os.path.basename(__file__)}:{optimize_model_helper.__name__}", description=""" Optimize an ONNX model using ONNX Runtime to the specified level. - See https://onnxruntime.ai/docs/performance/graph-optimizations.html for more + See https://onnxruntime.ai/docs/performance/model-optimizations/graph-optimizations.html for more details of the optimization levels.""", ) From 0e40049eb277bcc468b66b46d0bf79542e424b93 Mon Sep 17 00:00:00 2001 From: zesongw Date: Thu, 20 Jul 2023 22:57:48 +0800 Subject: [PATCH 05/34] [WebNN EP] Add support for Op Pad. (#16732) ### Description Support Op Pad for WebNN EP. It aims to support three modes (constant, reflect and edge). For now, only constant can be tested with Chrome Canary. ### Motivation and Context Support more models like SD1.5-VAE-encode. 
--- .../core/providers/webnn/builders/helper.h | 38 ++++ .../webnn/builders/impl/pad_op_builder.cc | 171 ++++++++++++++++++ .../webnn/builders/op_builder_factory.cc | 4 + .../webnn/builders/op_builder_factory.h | 1 + 4 files changed, 214 insertions(+) create mode 100644 onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 0a8fb6bf1db1b..af4780323f7cb 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -84,6 +84,43 @@ bool ReadIntArrayFrom1DTensor(const onnx::TensorProto& tensor, std::vector& a return true; } +inline bool ReadScalarTensorData(const onnx::TensorProto& tensor, emscripten::val& scalar, const logging::Logger& logger) { + std::vector unpacked_tensor; + auto status = onnxruntime::utils::UnpackInitializerData(tensor, unpacked_tensor); + if (!status.IsOK()) { + LOGS(logger, ERROR) << "Error while unpacking tensor: " << status.ErrorMessage(); + return false; + } + switch (tensor.data_type()) { + case ONNX_NAMESPACE::TensorProto_DataType_BOOL: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: + scalar = emscripten::val{MLFloat16::FromBits(*reinterpret_cast(unpacked_tensor.data())).ToFloat()}; + break; + case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; + case ONNX_NAMESPACE::TensorProto_DataType_INT32: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; + case ONNX_NAMESPACE::TensorProto_DataType_INT64: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; + case ONNX_NAMESPACE::TensorProto_DataType_UINT32: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; + case ONNX_NAMESPACE::TensorProto_DataType_UINT64: + scalar = emscripten::val{*reinterpret_cast(unpacked_tensor.data())}; + break; + default: + LOGS(logger, ERROR) << "Unsupported data type : " << tensor.data_type(); + return false; + break; + } + return true; +} + bool IsInputSupported(const NodeArg& node_arg, const std::string& parent_name, const logging::Logger& logger); // Get a list of groups of supported nodes, each group represents a subgraph supported by WebNN EP. @@ -128,6 +165,7 @@ static const InlinedHashMap op_map = { {"Mul", "mul"}, {"Neg", "neg"}, {"Not", "logicalNot"}, + {"Pad", "pad"}, {"Pow", "pow"}, {"Reciprocal", "reciprocal"}, {"ReduceMax", "reduceMax"}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc new file mode 100644 index 0000000000000..76aa0194423c7 --- /dev/null +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -0,0 +1,171 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Intel Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/common/safeint.h" +#include "core/providers/common.h" +#include "core/providers/shared/utils/utils.h" +#include "core/providers/webnn/builders/helper.h" +#include "core/providers/webnn/builders/model_builder.h" +#include "core/providers/webnn/builders/op_builder_factory.h" + +#include "base_op_builder.h" +#include "builder_utils.h" + +namespace onnxruntime { +namespace webnn { + +class PadOpBuilder : public BaseOpBuilder { + // Add operator related. + public: + void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override; + + private: + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + // Operator support related. + private: + bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; +}; + +// Add operator related. + +// ONNX mode to WebNN mode mapping. +const InlinedHashMap supported_mode = { + {"constant", "constant"}, + {"reflect", "reflection"}, + {"edge", "edge"}, +}; + +// Skip for pads, constant value, and axes. +void PadOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { + for (size_t i = 1; i < node.InputDefs().size(); i++) { + model_builder.AddInitializerToSkip(node.InputDefs()[i]->Name()); + model_builder.AddInputToSkip(node.InputDefs()[i]->Name()); + } +} + +Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, + const Node& node, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + const auto& initializers = model_builder.GetInitializerTensors(); + ORT_RETURN_IF(input_defs.size() < 1, "Pad has no inputs"); + emscripten::val input = model_builder.GetOperand(input_defs[0]->Name()); + std::vector input_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get input shape"); + + emscripten::val options = emscripten::val::object(); + + NodeAttrHelper helper(node); + const auto pad_mode = helper.Get("mode", std::string("constant")); + std::vector start_padding; + std::vector end_padding; + ORT_RETURN_IF(supported_mode.find(pad_mode) == supported_mode.end(), "WebNN dose not support mode", pad_mode); + const auto webnn_mode = supported_mode.find(pad_mode)->second; + options.set("mode", emscripten::val(webnn_mode)); + + const auto opset = node.SinceVersion(); + // From opset 11, pads, constant value and axes are inputs. + if (opset >= 11) { + ORT_RETURN_IF(input_defs.size() < 2, "Pads is required at opset ", opset); + std::vector pads; + const auto& pads_tensor = *initializers.at(input_defs[1]->Name()); + ORT_RETURN_IF_NOT(ReadIntArrayFrom1DTensor(pads_tensor, pads, logger), "Error while read pads tensor"); + + // Constant value and axes are optional. 
+ if (input_defs.size() >= 3) { + const auto value_tensor = *initializers.at(input_defs[2]->Name()); + emscripten::val value = emscripten::val::object(); + ORT_RETURN_IF_NOT(ReadScalarTensorData(value_tensor, value, logger), "Cannot read constant value"); + options.set("value", value); + } + + if (input_defs.size() == 4) { + const auto input_rank = input_shape.size(); + std::vector axes; + const auto& axes_tensor = *initializers.at(input_defs[3]->Name()); + ORT_RETURN_IF_NOT(ReadIntArrayFrom1DTensor(axes_tensor, axes, logger), "Error while read axes tensor"); + std::vector axes_index; + std::transform( + axes.begin(), axes.end(), std::back_inserter(axes_index), + [input_rank](int64_t axis) -> int32_t { return SafeInt(HandleNegativeAxis(axis, input_rank)); }); + start_padding.resize(input_rank, 0); + end_padding.resize(input_rank, 0); + for (size_t i = 0; i < axes_index.size(); i++) { + size_t index = axes_index[i]; + start_padding[index] = SafeInt(pads[i]); + end_padding[index] = SafeInt(pads[i + pads.size() / 2]); + } + } else { + std::transform(pads.begin(), pads.begin() + pads.size() / 2, std::back_inserter(start_padding), + [](int64_t axis) -> int32_t { return SafeInt(axis); }); + + std::transform(pads.begin() + pads.size() / 2, pads.end(), std::back_inserter(end_padding), + [](int64_t axis) -> int32_t { return SafeInt(axis); }); + } + } else { + // Before opset 11, pads, constant value are attributes. + ORT_RETURN_IF_NOT(helper.HasAttr("pads"), "Pads is required as attribute in opset ", opset); + const auto pads = helper.Get("pads", std::vector()); + const auto value = helper.Get("value", 0.0f); + start_padding = std::vector(pads.begin(), pads.begin() + pads.size() / 2); + end_padding = std::vector(pads.begin() + pads.size() / 2, pads.end()); + options.set("value", value); + } + + emscripten::val output = model_builder.GetBuilder().call("pad", input, + emscripten::val::array(start_padding), + emscripten::val::array(end_padding), + options); + + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); + return Status::OK(); +} + +// Operator support related. 
+bool PadOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, + const Node& node, + const WebnnDeviceType /* device_type */, + const logging::Logger& logger) const { + const auto& op_type = node.OpType(); + const auto& input_defs = node.InputDefs(); + const auto opset = node.SinceVersion(); + + NodeAttrHelper helper(node); + const auto pad_mode = helper.Get("mode", "constant"); + if (supported_mode.find(pad_mode) == supported_mode.end()) { + LOGS(logger, VERBOSE) << op_type << " WebNN does not support mode " << pad_mode; + return false; + } + + if (input_defs.size() < 1) { + LOGS(logger, VERBOSE) << op_type << " requires at least one input (data)"; + return false; + } + + if (opset >= 11) { + if (input_defs.size() < 2) { + LOGS(logger, VERBOSE) << op_type << " at opset " << opset << " requires at least two inputs (data and pads)"; + return false; + } + for (size_t i = 1; i < input_defs.size(); i++) { + if (!Contains(initializers, input_defs[i]->Name())) { + LOGS(logger, VERBOSE) << "Input [" << input_defs[i]->Name() << "] must be known as initializer"; + return false; + } + } + } + + return true; +} // namespace webnn + +void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace webnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index 416c9e1bf9eaa..80f89d8f4495a 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -104,6 +104,10 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateNormalizationOpBuilder("LayerNormalization", op_registrations); } + { // Pad + CreatePadOpBuilder("Pad", op_registrations); + } + { // Pool CreatePoolOpBuilder("GlobalAveragePool", op_registrations); CreatePoolOpBuilder("GlobalMaxPool", op_registrations); diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.h b/onnxruntime/core/providers/webnn/builders/op_builder_factory.h index 2dde66dff4708..57124faddde02 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.h @@ -32,6 +32,7 @@ void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& o void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateLogicalOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateNormalizationOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePoolOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateReductionOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateReshapeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); From bb136f86c8a1d0bcbdc2a77cb16f1c26c9ebd817 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Thu, 20 Jul 2023 09:57:38 -0700 Subject: [PATCH 06/34] [ROCm][MIGraphX] for googletest dep, set OVERRIDE_FIND_PACKAGE (#16715) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Otherwise, an unsupported version of gtest/gmock will be found at /opt/conda/include for ROCm builds. Though this issue was initially found for ROCm builds, the issue is generic. onnxruntime requires a specific version of googletest and should not rely on locating googletest using find_package. The ROCm error was: ``` In file included from /opt/conda/include/gmock/gmock-spec-builders.h:75, from /opt/conda/include/gmock/gmock-generated-function-mockers.h:47, from /opt/conda/include/gmock/gmock-function-mocker.h:39, from /opt/conda/include/gmock/gmock.h:61, from /stage/onnxruntime/onnxruntime/test/util/test_utils.cc:17: /opt/conda/include/gmock/gmock-matchers.h: In instantiation of ‘bool testing::internal::PointwiseMatcher::Impl:: MatchAndExplain(LhsContainer, testing::MatchResultListener*) const [with LhsContainer = const gsl::span&; TupleMatcher = testing::internal:: FloatingEq2Matcher; RhsContainer = gsl::span]’: /opt/conda/include/gmock/gmock-matchers.h:2303:10: required from here /opt/conda/include/gmock/gmock-matchers.h:2312:48: error: no type named ‘const_iterator’ in ‘testing::internal::PointwiseMatcher, gsl::span >::Impl&>::LhsStlContainer’ {aka ‘class gsl::span’} ``` --- cmake/external/onnxruntime_external_deps.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index c89f9febe0ed8..4cb3182c6301e 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -46,8 +46,8 @@ if (onnxruntime_BUILD_UNIT_TESTS) FetchContent_Declare( googletest URL ${DEP_URL_googletest} - FIND_PACKAGE_ARGS NAMES GTest URL_HASH SHA1=${DEP_SHA1_googletest} + OVERRIDE_FIND_PACKAGE ) endif() From eaea34f8e29df9fb21fab675a3a895084407f306 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Fri, 21 Jul 2023 01:39:30 +0800 Subject: [PATCH 07/34] [WebNN EP] Support PRelu op (#16756) --- onnxruntime/core/providers/webnn/builders/helper.h | 1 + .../core/providers/webnn/builders/impl/binary_op_builder.cc | 3 +++ .../core/providers/webnn/builders/op_builder_factory.cc | 1 + 3 files changed, 5 insertions(+) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index af4780323f7cb..93d36e7761545 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -167,6 +167,7 @@ static const InlinedHashMap op_map = { {"Not", "logicalNot"}, {"Pad", "pad"}, {"Pow", "pow"}, + {"PRelu", "prelu"}, {"Reciprocal", "reciprocal"}, {"ReduceMax", "reduceMax"}, {"ReduceMean", "reduceMean"}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index ac08f7a3f2366..8ae9ad6b2f829 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -39,6 +39,8 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const output = model_builder.GetBuilder().call("div", input0, input1); } else if (op_type == "Pow") { output = model_builder.GetBuilder().call("pow", input0, input1); + } else if (op_type == "PRelu") { + output = model_builder.GetBuilder().call("prelu", input0, input1); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "BinaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); @@ -59,6 +61,7 @@ void 
CreateBinaryOpBuilder(const std::string& op_type, OpBuilderRegistrations& o "Mul", "Div", "Pow", + "PRelu", }; op_registrations.builders.push_back(std::make_unique()); diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index 80f89d8f4495a..2536ae0ae44d6 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -37,6 +37,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateBinaryOpBuilder("Mul", op_registrations); CreateBinaryOpBuilder("Div", op_registrations); CreateBinaryOpBuilder("Pow", op_registrations); + CreateBinaryOpBuilder("PRelu", op_registrations); } { // Ternary From a8c263f92c63e155e9c97c56ee561c8a81ec152e Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Thu, 20 Jul 2023 16:22:14 -0700 Subject: [PATCH 08/34] [QNN EP] Update QNN SDK to 2.12 (#16750) ### Description - Updates the default QNN SDK to 2.12 for CI pipelines - Adds a disabled InstanceNormalization test for regression on QNN SDK 2.12 - Cleans up logs for unsupported ops. ### Motivation and Context Test with the latest QNN SDK. --- .../providers/qnn/qnn_execution_provider.cc | 16 ++-- onnxruntime/test/optimizer/qdq_test_utils.h | 43 --------- .../providers/qnn/instance_norm_htp_test.cc | 91 +++++++++++++++++-- .../test/providers/qnn/qnn_test_utils.cc | 29 +++++- ...arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 2 +- .../azure-pipelines/linux-qnn-ci-pipeline.yml | 12 +-- .../qnn-ep-nuget-packaging-pipeline.yml | 4 +- .../win-qnn-arm64-ci-pipeline.yml | 8 +- .../azure-pipelines/win-qnn-ci-pipeline.yml | 6 +- 9 files changed, 137 insertions(+), 74 deletions(-) diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 86bf879a26612..43998084618c0 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -173,29 +173,33 @@ bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapp // For Cast, need to call IsOpSupported (below) to validate input and output types. // For other single non-qdq nodes, immediately return not supported. if (node_unit.OpType() != "Cast") { - LOGS(logger, VERBOSE) << "Non-QDQ single node is not supported for NPU backend. Node name: " << node_unit.Name() - << " Op type: " << node_unit.OpType(); + LOGS(logger, WARNING) << "Non-QDQ " << node_unit.OpType() + << " operators are not supported on HTP or DSP backends. " << node_unit.OpType() + << " node `" << node_unit.Name() << " will not be assigned to QNN EP."; return false; } } // Non-NPU backend, quantized model not supported, but a QDQ node encountered if (!is_npu_backend && IsQdqNode(node_unit)) { - LOGS(logger, ERROR) << "There's no reason to run a QDQ model on non HTP/DSP backend!"; + LOGS(logger, ERROR) << "QDQ models are only supported on HTP or DSP backends. " + << node_unit.OpType() << " node `" << node_unit.Name() << "` will not be assigned to QNN EP."; return false; } bool supported = false; const auto* op_builder = qnn::GetOpBuilder(node_unit.OpType()); if (op_builder == nullptr) { - LOGS(logger, VERBOSE) << "Op not implemented in QNN EP. Op type: " << node_unit.OpType(); + LOGS(logger, WARNING) << "Operators of type `" << node_unit.OpType() << "` are not supported by QNN EP." 
+ << node_unit.OpType() << " node `" << node_unit.Name() + << "` will not be assigned to QNN EP."; } else { auto status = op_builder->IsOpSupported(qnn_model_wrapper, node_unit, logger, is_npu_backend); if (Status::OK() != status) { - LOGS(logger, VERBOSE) << "Op type: " << node_unit.OpType() - << ", not supported: " << status.ErrorMessage(); + LOGS(logger, WARNING) << node_unit.OpType() << " node `" << node_unit.Name() + << "` is not supported: " << status.ErrorMessage(); } supported = (Status::OK() == status); } diff --git a/onnxruntime/test/optimizer/qdq_test_utils.h b/onnxruntime/test/optimizer/qdq_test_utils.h index 262d29396f747..62dd322f292f2 100644 --- a/onnxruntime/test/optimizer/qdq_test_utils.h +++ b/onnxruntime/test/optimizer/qdq_test_utils.h @@ -91,49 +91,6 @@ GetQDQTestCaseFn BuildQDQConvTransposeTestCase(const std::vector& input }; } -// Creates the graph: -// _______________________ -// input_u8 -> DQ -> | | -> Q -> output_u8 -// scale_u8 (initializer) -> DQ -> | InstanceNormalization | -// bias_u8 (initializer) -> DQ -> |_______________________| -// -// Currently used to test QNN EP. -template -GetQDQTestCaseFn BuildQDQInstanceNormTestCase(const std::vector& input_shape, float epsilon) { - return [input_shape, epsilon](ModelTestBuilder& builder) { - const int64_t num_channels = input_shape[1]; - const InputQType quant_zero_point = 0; - const float quant_scale = 1.0f; - - auto* dq_scale_output = builder.MakeIntermediate(); - auto* scale = builder.MakeInitializer({num_channels}, static_cast(0), - static_cast(127)); - builder.AddDequantizeLinearNode(scale, quant_scale, quant_zero_point, dq_scale_output); - - // Add bias (initializer) -> DQ -> - auto* dq_bias_output = builder.MakeIntermediate(); - auto* bias = builder.MakeInitializer({num_channels}, static_cast(0), - static_cast(4)); - builder.AddDequantizeLinearNode(bias, 1.0f, 0, dq_bias_output); - - // Add input_u8 -> DQ -> - auto* input_u8 = builder.MakeInput(input_shape, static_cast(0), - static_cast(10)); - auto* dq_input_output = builder.MakeIntermediate(); - builder.AddDequantizeLinearNode(input_u8, quant_scale, quant_zero_point, dq_input_output); - - // Add dq_input_output -> InstanceNormalization -> - auto* instance_norm_output = builder.MakeIntermediate(); - Node& inst_norm_node = builder.AddNode("InstanceNormalization", {dq_input_output, dq_scale_output, dq_bias_output}, - {instance_norm_output}); - inst_norm_node.AddAttribute("epsilon", epsilon); - - // Add instance_norm_output -> Q -> output_u8 - auto* output_u8 = builder.MakeOutput(); - builder.AddQuantizeLinearNode(instance_norm_output, quant_scale, quant_zero_point, output_u8); - }; -} - // Creates the following graph: // _______________________ // input (f32) -> Q -> DQ -> | | -> Q -> DQ -> output (f32) diff --git a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc index b3a8ca78a27e0..3846a2868a895 100644 --- a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc @@ -5,6 +5,7 @@ #include #include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" #include "test/optimizer/qdq_test_utils.h" #include "test/providers/qnn/qnn_test_utils.h" @@ -15,15 +16,65 @@ namespace onnxruntime { namespace test { #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) +// Creates the graph: +// _______________________ +// input_u8 -> DQ -> | | -> Q -> output_u8 +// scale_u8 (initializer) -> DQ -> | InstanceNormalization | 
+// bias_u8 (initializer) -> DQ -> |_______________________| +// +// Currently used to test QNN EP. +template +GetQDQTestCaseFn BuildQDQInstanceNormTestCase(const TestInputDef& input_def, + const TestInputDef& scale_def, + const TestInputDef& bias_def, + const std::vector& attrs) { + return [input_def, scale_def, bias_def, attrs](ModelTestBuilder& builder) { + const QuantType quant_zero_point = 0; + const float quant_scale = 1.0f; + + auto* dq_scale_output = builder.MakeIntermediate(); + auto* scale = MakeTestInput(builder, scale_def); + builder.AddDequantizeLinearNode(scale, quant_scale, quant_zero_point, dq_scale_output); + + // Add bias (initializer) -> DQ -> + auto* dq_bias_output = builder.MakeIntermediate(); + auto* bias = MakeTestInput(builder, bias_def); + builder.AddDequantizeLinearNode(bias, 1.0f, 0, dq_bias_output); + + // Add input_u8 -> DQ -> + auto* input_u8 = MakeTestInput(builder, input_def); + auto* dq_input_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_u8, quant_scale, quant_zero_point, dq_input_output); + + // Add dq_input_output -> InstanceNormalization -> + auto* instance_norm_output = builder.MakeIntermediate(); + Node& inst_norm_node = builder.AddNode("InstanceNormalization", {dq_input_output, dq_scale_output, dq_bias_output}, + {instance_norm_output}); + for (const auto& attr : attrs) { + inst_norm_node.AddAttributeProto(attr); + } + + // Add instance_norm_output -> Q -> output_u8 + auto* output_u8 = builder.MakeOutput(); + builder.AddQuantizeLinearNode(instance_norm_output, quant_scale, quant_zero_point, output_u8); + }; +} + /** * Runs an InstanceNormalization model on the QNN HTP backend. Checks the graph node assignment, and that inference * outputs for QNN and CPU match. * - * \param input_shape The input's shape. - * \param epsilon The epsilon attribute. + * \param input_def The test input's definition (shape, is_initializer, data). + * \param scale_def The scale input's definition. Correct shapes must be 1D [num_input_channels]. + * \param bias_def The bias input's definition. Correct shapes must be 1D [num_input_channels]. + * \param attrs The node's attributes. The only valid attribute for InstanceNormalization is 'epsilon'. * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None). */ -static void RunInstanceNormQDQTest(const std::vector& input_shape, float epsilon, +template +static void RunInstanceNormQDQTest(const TestInputDef& input_def, + const TestInputDef& scale_def, + const TestInputDef& bias_def, + const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment) { ProviderOptions provider_options; #if defined(_WIN32) @@ -33,7 +84,7 @@ static void RunInstanceNormQDQTest(const std::vector& input_shape, floa #endif // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. - RunQnnModelTest(BuildQDQInstanceNormTestCase(input_shape, epsilon), + RunQnnModelTest(BuildQDQInstanceNormTestCase(input_def, scale_def, bias_def, attrs), provider_options, 18, expected_ep_assignment); @@ -42,19 +93,43 @@ static void RunInstanceNormQDQTest(const std::vector& input_shape, floa // Check that QNN compiles DQ -> InstanceNormalization -> Q as a single unit. // Use an input of rank 4. 
TEST_F(QnnHTPBackendTests, TestQDQInstanceNormU8) { - RunInstanceNormQDQTest({1, 2, 3, 3}, 1e-05f, ExpectedEPNodeAssignment::All); + RunInstanceNormQDQTest(TestInputDef({1, 2, 3, 3}, false, 0, 255), + TestInputDef({2}, true, 0, 127), + TestInputDef({2}, true, 0, 10), + {}, + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> InstanceNormalization -> Q as a single unit. // Use an input of rank 3. TEST_F(QnnHTPBackendTests, TestQDQInstanceNormU8Rank3) { - RunInstanceNormQDQTest({1, 2, 3}, 1e-05f, ExpectedEPNodeAssignment::All); + RunInstanceNormQDQTest(TestInputDef({1, 2, 3}, false, {6, 4, 2, 6, 8, 2}), + TestInputDef({2}, true, {1, 2}), + TestInputDef({2}, true, {1, 3}), + {}, + ExpectedEPNodeAssignment::All); +} + +// TODO: This test now fails in QNN SDK version 2.12.0 (windows arm64 and linux x86_64). +// This worked in QNN SDK version 2.10.0. Need to determine the severity of this inaccuracy. +// +// Expected output: 2 6 2 42 42 0 +// Actual output: 2 6 2 43 43 0 +TEST_F(QnnHTPBackendTests, DISABLED_TestQDQInstanceNormU8Rank3_QnnSdk_2_12_Regression) { + RunInstanceNormQDQTest(TestInputDef({1, 2, 3}, false, {3, 4, 3, 9, 9, 8}), + TestInputDef({2}, true, {2, 57}), + TestInputDef({2}, true, {3, 2}), + {}, + ExpectedEPNodeAssignment::All); } // Check that QNN InstanceNorm operator does not handle inputs with rank > 4. TEST_F(QnnHTPBackendTests, TestQDQInstanceNormU8Rank5) { - // No nodes should be assigned to QNN EP, and graph should have 5 (non-fused) nodes. - RunInstanceNormQDQTest({1, 2, 3, 3, 3}, 1e-05f, ExpectedEPNodeAssignment::None); + RunInstanceNormQDQTest(TestInputDef({1, 2, 3, 3, 3}, false, 0, 255), + TestInputDef({2}, true, 0, 127), + TestInputDef({2}, true, 0, 10), + {}, + ExpectedEPNodeAssignment::None); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index 7379e7a9dabe1..14f2a351d414c 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -63,7 +63,34 @@ static BackendSupport GetHTPSupport(const onnxruntime::logging::Logger& logger) ModelTestBuilder helper(graph); // Build simple QDQ graph: DQ -> InstanceNormalization -> Q - GetQDQTestCaseFn build_test_case = BuildQDQInstanceNormTestCase({1, 2, 3, 3}, 1e-05f); + GetQDQTestCaseFn build_test_case = [](ModelTestBuilder& builder) { + const uint8_t quant_zero_point = 0; + const float quant_scale = 1.0f; + + auto* dq_scale_output = builder.MakeIntermediate(); + auto* scale = builder.MakeInitializer({2}, std::vector{1, 2}); + builder.AddDequantizeLinearNode(scale, quant_scale, quant_zero_point, dq_scale_output); + + // Add bias (initializer) -> DQ -> + auto* dq_bias_output = builder.MakeIntermediate(); + auto* bias = builder.MakeInitializer({2}, std::vector{1, 1}); + builder.AddDequantizeLinearNode(bias, 1.0f, 0, dq_bias_output); + + // Add input_u8 -> DQ -> + auto* input_u8 = builder.MakeInput({1, 2, 3}, std::vector{1, 2, 3, 4, 5, 6}); + auto* dq_input_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input_u8, quant_scale, quant_zero_point, dq_input_output); + + // Add dq_input_output -> InstanceNormalization -> + auto* instance_norm_output = builder.MakeIntermediate(); + builder.AddNode("InstanceNormalization", {dq_input_output, dq_scale_output, dq_bias_output}, + {instance_norm_output}); + + // Add instance_norm_output -> Q -> output_u8 + auto* output_u8 = builder.MakeOutput(); + 
builder.AddQuantizeLinearNode(instance_norm_output, quant_scale, quant_zero_point, output_u8); + }; + build_test_case(helper); helper.SetGraphOutputs(); auto status = model.MainGraph().Resolve(); diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index df0b9584d1f0f..582a23b30ecca 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -3,7 +3,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.10.0.230425122932_54038 + default: qnn-v2.12.0.230626 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index b024163bbadde..e9312d2b16241 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -3,7 +3,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.10.0.230425122932_54038 + default: qnn-v2.12.0.230626 jobs: - job: Build_QNN_EP @@ -57,7 +57,7 @@ jobs: inputs: script: | ./build/Release/onnx_test_runner -e qnn \ - -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/target/x86_64-linux-clang/lib/libQnnCpu.so" \ + -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnCpu.so" \ cmake/external/onnx/onnx/backend/test/data/node - task: CmdLine@2 @@ -65,7 +65,7 @@ jobs: inputs: script: | ./build/Release/onnx_test_runner -e qnn \ - -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/target/x86_64-linux-clang/lib/libQnnCpu.so" \ + -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnCpu.so" \ /data/float32_models - task: CmdLine@2 @@ -73,7 +73,7 @@ jobs: inputs: script: | ./build/Release/onnx_test_runner -e qnn \ - -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/target/x86_64-linux-clang/lib/libQnnHtp.so" \ + -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnHtp.so" \ /data/qdq_models - task: CmdLine@2 @@ -81,7 +81,7 @@ jobs: inputs: script: | ./build/Release/onnx_test_runner -e qnn \ - -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/target/x86_64-linux-clang/lib/libQnnHtp.so qnn_context_cache_enable|1 qnn_context_cache_path|./build/Release/mobilenet_qdq.bin" \ + -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnHtp.so qnn_context_cache_enable|1 qnn_context_cache_path|./build/Release/mobilenet_qdq.bin" \ /data/qdq_models/mobilenetv2-1.0_add_transpose_quant - task: CmdLine@2 @@ -89,5 +89,5 @@ jobs: inputs: script: | ./build/Release/onnx_test_runner -e qnn \ - -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/target/x86_64-linux-clang/lib/libQnnHtp.so qnn_context_cache_enable|1 qnn_context_cache_path|./build/Release/mobilenet_qdq.bin" \ + -v -j 1 -c 1 -i "backend_path|$(QNN_SDK_ROOT)/lib/x86_64-linux-clang/libQnnHtp.so qnn_context_cache_enable|1 qnn_context_cache_path|./build/Release/mobilenet_qdq.bin" \ /data/qdq_models/mobilenetv2-1.0_add_transpose_quant diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 67a0668cd3502..89851ee6b9a13 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,12 +2,12 @@ parameters: - name: qnn_sdk_path_win displayName: QNN Windows SDK path type: string - default: C:\data\qnnsdk\qnn-v2.10.0.230425122932_54038_win + default: C:\data\qnnsdk\qnn-v2.12.1.230626_win - name: qnn_sdk_info displayName: QNN SDK Version Information type: string - default: qnn-v2.10.0.230425122932_54038 + default: qnn-v2.12.1.230626_win - name: ort_package_version displayName: OnnxRuntime Nuget package version diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 663ebcbbc9e9c..aedd516a528ca 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -3,7 +3,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.10.0.230425122932_54038_win + default: qnn-v2.12.1.230626_win jobs: - job: 'build' @@ -55,17 +55,17 @@ jobs: displayName: 'Run unit tests' - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\target\aarch64-windows-msvc\lib\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node + .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\lib\aarch64-windows-msvc\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' displayName: 'Run ONNX Tests' - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\target\aarch64-windows-msvc\lib\QnnCpu.dll" C:\data\float32_models + .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\lib\aarch64-windows-msvc\QnnCpu.dll" C:\data\float32_models workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' displayName: 'Run float32 model tests' - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\target\aarch64-windows-msvc\lib\QnnHtp.dll" C:\data\qdq_models + .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\lib\aarch64-windows-msvc\QnnHtp.dll" C:\data\qdq_models workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' displayName: 'Run QDQ model tests' enabled: false diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index e3db5158b216d..cb11f67a961ec 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -3,7 +3,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: qnn-v2.10.0.230425122932_54038_win + default: qnn-v2.12.1.230626_win jobs: - job: 'build' @@ -68,12 +68,12 @@ jobs: displayName: 'Run unit tests' - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\target\x86_64-windows-msvc\lib\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node + .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\lib\x86_64-windows-msvc\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' displayName: 'Run ONNX Tests' - script: | - 
.\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\target\x86_64-windows-msvc\lib\QnnCpu.dll" C:\data\float32_models + .\$(BuildConfig)\onnx_test_runner -j 1 -c 1 -v -e qnn -i "backend_path|$(QNN_SDK_ROOT)\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' displayName: 'Run float32 model tests' From 538d2412ef8498b0a90fd73bb74e085844fed721 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Thu, 20 Jul 2023 17:39:29 -0700 Subject: [PATCH 09/34] Objective-C Add Support to Create and Query String ORTValues (#16764) This pull request contains a few changes: 1. Adds support for string ort values. 2. Fixes the training minimal build (that was broken with #16601) by putting custom op registration behind #ifdefs 3. Fixes the iOS pod package generation (that was again broken with #16601) by explicitly providing paths to be copied during pod creation. --- Package.swift | 2 +- objectivec/include/ort_enums.h | 1 + objectivec/include/ort_value.h | 28 +++++++ objectivec/ort_enums.mm | 14 ++-- objectivec/ort_value.mm | 75 ++++++++++++++++++ objectivec/test/ort_session_test.mm | 36 +++++++++ objectivec/test/ort_value_test.mm | 13 +++ objectivec/test/testdata/identity_string.ort | Bin 0 -> 1288 bytes .../orttraining/training_api/module.cc | 6 +- .../orttraining/training_api/optimizer.cc | 4 +- .../objectivec/assemble_objc_pod_package.py | 10 ++- 11 files changed, 179 insertions(+), 10 deletions(-) create mode 100644 objectivec/test/testdata/identity_string.ort diff --git a/Package.swift b/Package.swift index e053732811745..f8bf33001ea24 100644 --- a/Package.swift +++ b/Package.swift @@ -21,7 +21,7 @@ import class Foundation.ProcessInfo let package = Package( name: "onnxruntime", - platforms: [.iOS(.v11)], + platforms: [.iOS(.v12)], products: [ .library(name: "onnxruntime", type: .static, diff --git a/objectivec/include/ort_enums.h b/objectivec/include/ort_enums.h index 6bc29aaa00965..78de233972ccf 100644 --- a/objectivec/include/ort_enums.h +++ b/objectivec/include/ort_enums.h @@ -38,6 +38,7 @@ typedef NS_ENUM(int32_t, ORTTensorElementDataType) { ORTTensorElementDataTypeUInt32, ORTTensorElementDataTypeInt64, ORTTensorElementDataTypeUInt64, + ORTTensorElementDataTypeString, }; /** diff --git a/objectivec/include/ort_value.h b/objectivec/include/ort_value.h index 60a9bbed0e683..641c15ba69d2d 100644 --- a/objectivec/include/ort_value.h +++ b/objectivec/include/ort_value.h @@ -32,6 +32,21 @@ NS_ASSUME_NONNULL_BEGIN shape:(NSArray*)shape error:(NSError**)error; +/** + * Creates a value that is a string tensor. + * The string data will be copied into a buffer owned by this ORTValue instance. + * + * Available since 1.16. + * + * @param tensorStringData The tensor string data. + * @param shape The tensor shape. + * @param error Optional error information set if an error occurs. + * @return The instance, or nil if an error occurs. + */ +- (nullable instancetype)initWithTensorStringData:(NSArray*)tensorStringData + shape:(NSArray*)shape + error:(NSError**)error; + /** * Gets the type information. * @@ -63,6 +78,19 @@ NS_ASSUME_NONNULL_BEGIN */ - (nullable NSMutableData*)tensorDataWithError:(NSError**)error; +/** + * Gets the tensor string data. + * This assumes that the value is a string tensor. + * + * This returns a copy of the value's underlying string data. + * + * Available since 1.16. + * + * @param error Optional error information set if an error occurs. 
+ * @return The copy of the tensor string data, or nil if an error occurs. + */ +- (nullable NSArray*)tensorStringDataWithError:(NSError**)error; + @end /** diff --git a/objectivec/ort_enums.mm b/objectivec/ort_enums.mm index 0144a333d1dc6..60939812df531 100644 --- a/objectivec/ort_enums.mm +++ b/objectivec/ort_enums.mm @@ -4,6 +4,7 @@ #import "ort_enums_internal.h" #include +#include #import "cxx_api.h" @@ -39,13 +40,13 @@ struct TensorElementTypeInfo { ORTTensorElementDataType type; ONNXTensorElementDataType capi_type; - size_t element_size; + std::optional element_size; }; // supported ORT tensor element data types // define the mapping from ORTTensorElementDataType to C API ONNXTensorElementDataType here constexpr TensorElementTypeInfo kElementTypeInfos[]{ - {ORTTensorElementDataTypeUndefined, ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED, 0}, + {ORTTensorElementDataTypeUndefined, ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED, std::nullopt}, {ORTTensorElementDataTypeFloat, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, sizeof(float)}, {ORTTensorElementDataTypeInt8, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, sizeof(int8_t)}, {ORTTensorElementDataTypeUInt8, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, sizeof(uint8_t)}, @@ -53,6 +54,7 @@ {ORTTensorElementDataTypeUInt32, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32, sizeof(uint32_t)}, {ORTTensorElementDataTypeInt64, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, sizeof(int64_t)}, {ORTTensorElementDataTypeUInt64, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64, sizeof(uint64_t)}, + {ORTTensorElementDataTypeString, ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING, std::nullopt}, }; struct GraphOptimizationLevelInfo { @@ -119,9 +121,11 @@ ORTTensorElementDataType CAPIToPublicTensorElementType(ONNXTensorElementDataType size_t SizeOfCAPITensorElementType(ONNXTensorElementDataType capi_type) { return SelectAndTransform( kElementTypeInfos, - [capi_type](const auto& type_info) { return type_info.capi_type == capi_type; }, - [](const auto& type_info) { return type_info.element_size; }, - "unsupported tensor element type"); + [capi_type](const auto& type_info) { + return type_info.element_size.has_value() && type_info.capi_type == capi_type; + }, + [](const auto& type_info) { return *type_info.element_size; }, + "unsupported tensor element type or tensor element type does not have a known size"); } GraphOptimizationLevel PublicToCAPIGraphOptimizationLevel(ORTGraphOptimizationLevel opt_level) { diff --git a/objectivec/ort_value.mm b/objectivec/ort_value.mm index f6ea674e1b3c3..b9dc1a9885c61 100644 --- a/objectivec/ort_value.mm +++ b/objectivec/ort_value.mm @@ -71,6 +71,12 @@ - (nullable instancetype)initWithTensorData:(NSMutableData*)tensorData shape:(NSArray*)shape error:(NSError**)error { try { + if (elementType == ORTTensorElementDataTypeString) { + ORT_CXX_API_THROW( + "ORTTensorElementDataTypeString element type provided. 
" + "Please call initWithTensorStringData:shape:error: instead to create an ORTValue with string data.", + ORT_INVALID_ARGUMENT); + } const auto memoryInfo = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); const auto ONNXElementType = PublicToCAPITensorElementType(elementType); const auto shapeVector = [shape]() { @@ -92,6 +98,46 @@ - (nullable instancetype)initWithTensorData:(NSMutableData*)tensorData ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) } +- (nullable instancetype)initWithTensorStringData:(NSArray*)tensorStringData + shape:(NSArray*)shape + error:(NSError**)error { + try { + Ort::AllocatorWithDefaultOptions allocator; + size_t tensorSize = 1U; + const auto shapeVector = [&tensorSize, shape]() { + std::vector result{}; + result.reserve(shape.count); + for (NSNumber* dim in shape) { + const auto dimValue = dim.longLongValue; + if (dimValue < 0 || !SafeMultiply(static_cast(dimValue), tensorSize, tensorSize)) { + ORT_CXX_API_THROW("Failed to compute the tensor size.", ORT_RUNTIME_EXCEPTION); + } + result.push_back(dimValue); + } + return result; + }(); + + if (tensorSize != [tensorStringData count]) { + ORT_CXX_API_THROW( + "Computed tensor size does not equal the length of the provided tensor string data.", + ORT_INVALID_ARGUMENT); + } + + Ort::Value ortValue = Ort::Value::CreateTensor( + allocator, shapeVector.data(), shapeVector.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING); + + size_t index = 0; + for (NSString* stringData in tensorStringData) { + ortValue.FillStringTensorElement([stringData UTF8String], index++); + } + + return [self initWithCXXAPIOrtValue:std::move(ortValue) + externalTensorData:nil + error:error]; + } + ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) +} + - (nullable ORTValueTypeInfo*)typeInfoWithError:(NSError**)error { try { return CXXAPIToPublicValueTypeInfo(*_typeInfo); @@ -110,6 +156,12 @@ - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError* - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (tensorTypeAndShapeInfo.GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { + ORT_CXX_API_THROW( + "This ORTValue holds string data. Please call tensorStringDataWithError: " + "instead to retrieve the string data from this ORTValue.", + ORT_RUNTIME_EXCEPTION); + } const size_t elementCount = tensorTypeAndShapeInfo.GetElementCount(); const size_t elementSize = SizeOfCAPITensorElementType(tensorTypeAndShapeInfo.GetElementType()); size_t rawDataLength; @@ -127,6 +179,29 @@ - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) } +- (nullable NSArray*)tensorStringDataWithError:(NSError**)error { + try { + const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + const size_t elementCount = tensorTypeAndShapeInfo.GetElementCount(); + const size_t tensorStringDataLength = _value->GetStringTensorDataLength(); + std::vector tensorStringData(tensorStringDataLength, '\0'); + std::vector offsets(elementCount); + _value->GetStringTensorContent(tensorStringData.data(), tensorStringDataLength, + offsets.data(), offsets.size()); + + NSMutableArray* result = [NSMutableArray arrayWithCapacity:elementCount]; + for (size_t idx = 0; idx < elementCount; ++idx) { + const size_t strLength = (idx == elementCount - 1) ? 
tensorStringDataLength - offsets[idx] + : offsets[idx + 1] - offsets[idx]; + [result addObject:[[NSString alloc] initWithBytes:tensorStringData.data() + offsets[idx] + length:strLength + encoding:NSUTF8StringEncoding]]; + } + return result; + } + ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) +} + #pragma mark - Internal - (nullable instancetype)initWithCXXAPIOrtValue:(Ort::Value&&)existingCXXAPIOrtValue diff --git a/objectivec/test/ort_session_test.mm b/objectivec/test/ort_session_test.mm index 57b92fdb0b711..f00f5db2f995f 100644 --- a/objectivec/test/ort_session_test.mm +++ b/objectivec/test/ort_session_test.mm @@ -50,6 +50,13 @@ + (NSString*)getAddModelPath { return path; } ++ (NSString*)getStringModelPath { + NSBundle* bundle = [NSBundle bundleForClass:[ORTSessionTest class]]; + NSString* path = [bundle pathForResource:@"identity_string" + ofType:@"ort"]; + return path; +} + + (NSMutableData*)dataWithScalarFloat:(float)value { NSMutableData* data = [[NSMutableData alloc] initWithBytes:&value length:sizeof(value)]; return data; @@ -259,6 +266,35 @@ - (void)testRegisterCustomOpsUsingFunctionPointer { XCTAssertEqual(gDummyRegisterCustomOpsFnCalled, true); } +- (void)testStringInputs { + NSError* err = nil; + NSArray* stringData = @[ @"ONNX Runtime", @"is the", @"best", @"AI Framework" ]; + ORTValue* stringValue = [[ORTValue alloc] initWithTensorStringData:stringData shape:@[ @2, @2 ] error:&err]; + ORTAssertNullableResultSuccessful(stringValue, err); + + ORTSession* session = [[ORTSession alloc] initWithEnv:self.ortEnv + modelPath:[ORTSessionTest getStringModelPath] + sessionOptions:[ORTSessionTest makeSessionOptions] + error:&err]; + ORTAssertNullableResultSuccessful(session, err); + + NSDictionary* outputs = + [session runWithInputs:@{@"input:0" : stringValue} + outputNames:[NSSet setWithArray:@[ @"output:0" ]] + runOptions:[ORTSessionTest makeRunOptions] + error:&err]; + ORTAssertNullableResultSuccessful(outputs, err); + + ORTValue* outputStringValue = outputs[@"output:0"]; + XCTAssertNotNil(outputStringValue); + + NSArray* outputStringData = [outputStringValue tensorStringDataWithError:&err]; + ORTAssertNullableResultSuccessful(outputStringData, err); + + XCTAssertEqual([stringData count], [outputStringData count]); + XCTAssertTrue([stringData isEqualToArray:outputStringData]); +} + @end NS_ASSUME_NONNULL_END diff --git a/objectivec/test/ort_value_test.mm b/objectivec/test/ort_value_test.mm index 734ad39095de9..b22d73bbd9948 100644 --- a/objectivec/test/ort_value_test.mm +++ b/objectivec/test/ort_value_test.mm @@ -74,6 +74,19 @@ - (void)testInitTensorFailsWithDataSmallerThanShape { ORTAssertNullableResultUnsuccessful(ortValue, err); } +- (void)testInitTensorWithStringDataSucceeds { + NSArray* stringData = @[ @"ONNX Runtime", @"is", @"the", @"best", @"AI", @"Framework" ]; + NSError* err = nil; + ORTValue* stringValue = [[ORTValue alloc] initWithTensorStringData:stringData shape:@[ @3, @2 ] error:&err]; + ORTAssertNullableResultSuccessful(stringValue, err); + + NSArray* returnedStringData = [stringValue tensorStringDataWithError:&err]; + ORTAssertNullableResultSuccessful(returnedStringData, err); + + XCTAssertEqual([stringData count], [returnedStringData count]); + XCTAssertTrue([stringData isEqualToArray:returnedStringData]); +} + @end NS_ASSUME_NONNULL_END diff --git a/objectivec/test/testdata/identity_string.ort b/objectivec/test/testdata/identity_string.ort new file mode 100644 index 0000000000000000000000000000000000000000..96ba0f37c69490e973e26263853a16c3300a8b58 GIT binary 
patch literal 1288 zcmZux&1w@-6h5Pa9os5FLn$Jqi!8Dzmsl#4;HoOeE^p}f@^W%6NpP6p&O0gcki882M>I6e(w3Z=iHWj*=htioCXqW8IfLE^GHp#@3Ftl^Z`&209<5oUVjmq@5LR2&zf}5ZQIoobjm|`qz&^rOGz_-A+fgZqm12o+S ze+g7Xdxh5!D1X3*J>Ty=e>lyviPeSamBpbR-vRf@9r(L3 zD#k5+d>b|1W45${qPIcDcx4ANImRUG0Oc3K)&qwc@r-1G{j_Fh&FkTPu_o_x#(*&A zrO=D-Poe}9=kD2$?^NVV$jw@uE&KQm8+Vh)4ZJAt4|U#;xVOd^=69H20{{1e1Bmlx zUe@%p>$23`z5rPpj1=apkl!KT5zqq8p??KVfmcA)NZB&gMw>mC7c*k6 zir53^?{bfoHa0I!dY}xGgA(_SIemX`+DMGM2K>_#^b_Q-->>TDVjrL2SN%?zi|?TF z8y?{PwLG`#wNySG<)t3tUnYe=cj;e=V5goqTjsX;7_a^>@w}rtYEyNbuif3`yxd;z G;_MH{>baQ! literal 0 HcmV?d00001 diff --git a/orttraining/orttraining/training_api/module.cc b/orttraining/orttraining/training_api/module.cc index bdf77b86527bb..29300bbb7e8ec 100644 --- a/orttraining/orttraining/training_api/module.cc +++ b/orttraining/orttraining/training_api/module.cc @@ -156,7 +156,7 @@ Module::Module(const std::string& train_model_path_or_bytes, const Environment& env, const std::vector>& providers, const std::optional& eval_model_path_or_bytes, - gsl::span op_domains) + [[maybe_unused]] gsl::span op_domains) : state_{state} { // Enforce weight prepacking is disabled // If the user explicitly enabled weight prepacking then return an error. @@ -170,9 +170,11 @@ Module::Module(const std::string& train_model_path_or_bytes, } train_sess_ = std::make_unique(session_options, env); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) if (!op_domains.empty()) { ORT_THROW_IF_ERROR(train_sess_->AddCustomOpDomains(op_domains)); } +#endif ORT_THROW_IF_ERROR(train_sess_->Load(train_model_path_or_bytes)); for (const auto& provider : providers) { @@ -278,9 +280,11 @@ Module::Module(const std::string& train_model_path_or_bytes, if (eval_model_path_or_bytes.has_value()) { eval_sess_ = std::make_unique(session_options, env); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) if (!op_domains.empty()) { ORT_THROW_IF_ERROR(eval_sess_->AddCustomOpDomains(op_domains)); } +#endif ORT_THROW_IF_ERROR(eval_sess_->Load(eval_model_path_or_bytes.value())); for (const auto& provider : providers) { diff --git a/orttraining/orttraining/training_api/optimizer.cc b/orttraining/orttraining/training_api/optimizer.cc index d76f6d93f8278..26565fdd98024 100644 --- a/orttraining/orttraining/training_api/optimizer.cc +++ b/orttraining/orttraining/training_api/optimizer.cc @@ -225,10 +225,12 @@ Optimizer::Optimizer(const std::string& optim_path_or_bytes, void Optimizer::Initialize(const std::string& optim_path_or_bytes, const std::vector>& providers, - gsl::span op_domains) { + [[maybe_unused]] gsl::span op_domains) { +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) if (!op_domains.empty()) { ORT_THROW_IF_ERROR(optim_sess_->AddCustomOpDomains(op_domains)); } +#endif for (const auto& execution_provider : providers) { ORT_THROW_IF_ERROR(optim_sess_->RegisterExecutionProvider(execution_provider)); diff --git a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py index 7d1005a34c76a..135a55165beda 100755 --- a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py +++ b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py @@ -47,7 +47,10 @@ ], "test_resource_files": [ "objectivec/test/testdata/*.ort", - "onnxruntime/test/testdata/training_api/*", + "onnxruntime/test/testdata/training_api/*.onnx", + "onnxruntime/test/testdata/training_api/*.ckpt", + 
"onnxruntime/test/testdata/training_api/*.in", + "onnxruntime/test/testdata/training_api/*.out", ], } @@ -72,7 +75,10 @@ "objectivec/test/ort_training_utils_test.mm", ], "test_resource_files": [ - "onnxruntime/test/testdata/training_api/*", + "onnxruntime/test/testdata/training_api/*.onnx", + "onnxruntime/test/testdata/training_api/*.ckpt", + "onnxruntime/test/testdata/training_api/*.in", + "onnxruntime/test/testdata/training_api/*.out", ], } From 0f9883f804e4f7847e7215950a1eabfe047543d7 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 20 Jul 2023 18:24:57 -0700 Subject: [PATCH 10/34] Fix Mac M1 build (#16763) - Add ifndef `__APPLE__` to skip lines which cause EXC_BAD_INSTRUCTION error. - Fix floatToHalf/doubleToHalf conversion issue and add tests. --- onnxruntime/core/mlas/lib/platform.cpp | 5 ++++- onnxruntime/core/util/math_cpu.cc | 21 +++++++++++++++++++-- onnxruntime/test/framework/math_test.cc | 18 ++++++++++++++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 6446007610a15..d7c5f9fc67aca 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -449,10 +449,13 @@ Return Value: #if defined(_WIN32) HasDotProductInstructions = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); -#else +#elif !defined(__APPLE__) // The next few lines result in an EXC_BAD_INSTRUCTION runtime error on a M1 Mac so we + // disable it there. uint64_t isar0_el1; asm("mrs %[reg], ID_AA64ISAR0_EL1\n" : [reg] "=r"(isar0_el1) : :); HasDotProductInstructions = ((isar0_el1 >> 44) & 0xfu) == 0x1u; +#else + HasDotProductInstructions = MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeonDot(); #endif if (HasDotProductInstructions) { diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index f11fe5a701b3a..983321593a92b 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -20,6 +20,7 @@ #include "core/framework/float16.h" #include +#include #include "core/common/narrow.h" #include "core/mlas/inc/mlas.h" #if defined(__GNUC__) @@ -855,12 +856,28 @@ void Col2imNd(const float* data_col, con SPECIALIZED_COPYVECTOR(float) #undef SPECIALIZED_COPYVECTOR +// like C++20's std::bit_cast +// adapted from the example implementation here: https://en.cppreference.com/w/cpp/numeric/bit_cast +// TODO replace this with std::bit_cast when we move to C++20 +template +static std::enable_if_t< + sizeof(Src) == sizeof(Dst) && + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + std::is_trivially_constructible_v, + Dst> +BitCast(const Src& src) { + Dst dst; + std::memcpy(&dst, &src, sizeof(dst)); + return dst; +} + uint16_t floatToHalf(float f) { - return Eigen::half_impl::float_to_half_rtne(f).x; + return BitCast(Eigen::half_impl::float_to_half_rtne(f).x); } uint16_t doubleToHalf(double f) { - return Eigen::half_impl::float_to_half_rtne(static_cast(f)).x; + return BitCast(Eigen::half_impl::float_to_half_rtne(static_cast(f)).x); } float halfToFloat(uint16_t h) { diff --git a/onnxruntime/test/framework/math_test.cc b/onnxruntime/test/framework/math_test.cc index 41f8d8669986e..a133bbd50326d 100644 --- a/onnxruntime/test/framework/math_test.cc +++ b/onnxruntime/test/framework/math_test.cc @@ -202,4 +202,22 @@ TEST(MathTest, GemvTrans) { } } +TEST(MathTest, HalfFloatConversion) { + constexpr float original_values[] = {-4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 
1.0f, 2.0f, 4.0f}; + for (const auto original_value : original_values) { + const auto half_value = math::floatToHalf(original_value); + const auto round_trip_value = math::halfToFloat(half_value); + EXPECT_EQ(round_trip_value, original_value); + } +} + +TEST(MathTest, HalfDoubleConversion) { + constexpr double original_values[] = {-4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f}; + for (const auto original_value : original_values) { + const auto half_value = math::doubleToHalf(original_value); + const auto round_trip_value = static_cast(math::halfToFloat(half_value)); + EXPECT_EQ(round_trip_value, original_value); + } +} + } // namespace onnxruntime From b7176f98266924053204fe25c3f2e701615c2585 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Thu, 20 Jul 2023 18:44:28 -0700 Subject: [PATCH 11/34] Fix bug with saving model optimized by inference session (#16716) ### Description A [previous PR](https://github.com/microsoft/onnxruntime/pull/16531) added a temporary directory to save the model optimizations after loading a model into an `InferenceSession`. Many models that have an external data file, however, require the data file to be in the same directory as the ONNX model file. Because the model is saved in a temporary directory and the data is saved in another directory, this causes a `FileNotFoundError` error when trying to load the model in the temporary directory. This PR fixes this error by saving the external data file in the same directory that the optimized model is located in. ### Motivation and Context This PR fixes a bug with using a temporary directory while running the optimizer for models that have an external data file. --- include/onnxruntime/core/graph/graph.h | 1 + onnxruntime/core/graph/graph.cc | 11 +++++- onnxruntime/core/graph/model.cc | 7 ++-- onnxruntime/core/graph/model.h | 2 ++ .../python/tools/transformers/onnx_model.py | 4 +-- .../python/tools/transformers/optimizer.py | 35 +++++++++++++++---- .../save_model_with_external_initializers.cc | 8 +++-- .../test/python/onnxruntime_test_python.py | 24 +++++++++++++ 8 files changed, 79 insertions(+), 13 deletions(-) diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index dc6c7d271343c..9f2ff2d4095cb 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1121,6 +1121,7 @@ class Graph { @returns GraphProto serialization of the graph. */ ONNX_NAMESPACE::GraphProto ToGraphProtoWithExternalInitializers(const std::string& external_file_name, + const PathString& file_path, size_t initializer_size_threshold) const; /** Gets the ISchemaRegistry instances being used with this Graph. 
*/ diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 3c45e910ae1af..d75a7a519254e 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3381,11 +3381,20 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { } ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(const std::string& external_file_name, + const PathString& destination_file_path, size_t initializer_size_threshold) const { GraphProto result; ToGraphProtoInternal(result); - std::ofstream external_stream(external_file_name, std::ofstream::out | std::ofstream::binary); + Path parent_path = Path::Parse(destination_file_path).ParentPath(); + Path external_file_path = Path::Parse(ToPathString(external_file_name)); + // Check if parent_path is relative path (length = 0) + if (parent_path.ToPathString().length()) { + // Save external data file in same directory as model + external_file_path = parent_path.Append(external_file_path); + } + + std::ofstream external_stream(external_file_path.ToPathString(), std::ofstream::out | std::ofstream::binary); ORT_ENFORCE(external_stream.is_open()); int64_t external_offset = 0; diff --git a/onnxruntime/core/graph/model.cc b/onnxruntime/core/graph/model.cc index feec7d5407051..b8e8836da3bee 100644 --- a/onnxruntime/core/graph/model.cc +++ b/onnxruntime/core/graph/model.cc @@ -344,10 +344,12 @@ ModelProto Model::ToProto() { } ModelProto Model::ToGraphProtoWithExternalInitializers(const std::string& external_file_name, + const PathString& file_path, size_t initializer_size_threshold) { ModelProto result(model_proto_); const auto& graph = *graph_; *(result.mutable_graph()) = graph.ToGraphProtoWithExternalInitializers(external_file_name, + file_path, initializer_size_threshold); return result; } @@ -572,7 +574,7 @@ static Status SaveModelWithExternalInitializers(Model& model, ORT_RETURN_IF_ERROR(status); ORT_TRY { - status = Model::SaveWithExternalInitializers(model, fd, external_file_name, + status = Model::SaveWithExternalInitializers(model, fd, file_path, external_file_name, initializer_size_threshold); } ORT_CATCH(const std::exception& ex) { @@ -722,6 +724,7 @@ Status Model::Save(Model& model, int p_fd) { Status Model::SaveWithExternalInitializers(Model& model, int fd, + const PathString& file_path, const std::string& external_file_name, size_t initializer_size_threshold) { if (fd < 0) { @@ -730,7 +733,7 @@ Status Model::SaveWithExternalInitializers(Model& model, ORT_RETURN_IF_ERROR(model.MainGraph().Resolve()); - auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, initializer_size_threshold); + auto model_proto = model.ToGraphProtoWithExternalInitializers(external_file_name, file_path, initializer_size_threshold); google::protobuf::io::FileOutputStream output(fd); const bool result = model_proto.SerializeToZeroCopyStream(&output) && output.Flush(); if (result) { diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index bd1f53b43ca2d..5337211ae79d4 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -191,6 +191,7 @@ class Model { // Save initializer larger than the given threshold (in bytes) into an external binary file // with the given name. This function is useful to avoid hitting the size limit of protobuf files. 
ONNX_NAMESPACE::ModelProto ToGraphProtoWithExternalInitializers(const std::string& external_file_name, + const PathString& file_path, size_t initializer_size_threshold); #ifdef _WIN32 @@ -217,6 +218,7 @@ class Model { static common::Status SaveWithExternalInitializers(Model& model, int fd, + const PathString& file_path, const std::string& external_file_name, size_t initializer_size_threshold); diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 8e1e21e9f8661..ab2ae5ceac946 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -1021,13 +1021,13 @@ def save( location = Path(external_data_path).name if all_tensors_to_one_file else None if os.path.exists(output_path): - logger.info(f"Delete the existed onnx file: {output_path}") + logger.info(f"Delete the existing onnx file: {output_path}") os.remove(output_path) if all_tensors_to_one_file: if os.path.exists(external_data_path): # Delete the external data file. Otherwise, data will be appended to existing file. - logger.info(f"Delete the existed external data file: {external_data_path}") + logger.info(f"Delete the existing external data file: {external_data_path}") os.remove(external_data_path) else: if os.listdir(output_dir): diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 99b48e501481e..a7c4cb4971084 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -21,7 +21,7 @@ import logging import os import tempfile -from typing import Dict, Optional +from typing import Dict, List, Optional import coloredlogs from fusion_options import FusionOptions @@ -64,8 +64,11 @@ def optimize_by_onnxruntime( use_gpu: bool = False, optimized_model_path: Optional[str] = None, opt_level: Optional[int] = 99, - disabled_optimizers=[], # noqa: B006 - verbose=False, + disabled_optimizers: List[str] = [], # noqa: B006 + verbose: bool = False, + save_as_external_data: bool = False, + external_data_filename: str = "", + external_data_file_threshold: int = 1024, ) -> str: """ Use onnxruntime to optimize model. @@ -76,6 +79,9 @@ def optimize_by_onnxruntime( optimized_model_path (str or None): the path of optimized model. opt_level (int): graph optimization level. disabled_optimizers (List[str]): a list of names of disabled optimizers + save_as_external_data (bool): whether to save external data outside of ONNX model + external_data_filename (str): name of external data file. If not provided, name is automatically created from ONNX model. 
+ external_data_file_threshold (int): threshold to decide whether to save tensor in ONNX model or in external data file Returns: optimized_model_path (str): the path of optimized model """ @@ -112,6 +118,16 @@ def optimize_by_onnxruntime( optimized_model_path = "{}_o{}_{}.onnx".format(path_prefix, opt_level, "gpu" if use_gpu else "cpu") sess_options.optimized_model_filepath = optimized_model_path + if save_as_external_data: + if len(external_data_filename) == 0: + # Set external data filename to model_name.onnx.data + external_data_filename = os.path.basename(optimized_model_path) + ".data" + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", external_data_filename + ) + sess_options.add_session_config_entry( + "session.optimized_model_external_initializers_min_size_in_bytes", str(external_data_file_threshold) + ) if verbose: print("Using onnxruntime to optimize model - Debug level Set to verbose") @@ -203,7 +219,8 @@ def optimize_model( opt_level: Optional[int] = None, use_gpu: bool = False, only_onnxruntime: bool = False, - verbose=False, + verbose: bool = False, + use_external_data_format: bool = False, ): """Optimize Model by OnnxRuntime and/or python fusion logic. @@ -241,6 +258,8 @@ def optimize_model( use_gpu (bool, optional): use gpu or not for onnxruntime. Defaults to False. only_onnxruntime (bool, optional): only use onnxruntime to optimize model, and no python fusion. Defaults to False. + use_external_data_format (bool, optional): use external data format when saving optimized model. + Defaults to False. Returns: object of an optimizer class. @@ -260,6 +279,7 @@ def optimize_model( temp_dir = tempfile.TemporaryDirectory() optimized_model_name = "model_o{}_{}.onnx".format(opt_level, "gpu" if use_gpu else "cpu") optimized_model_path = os.path.join(temp_dir.name, optimized_model_name) + if opt_level > 1: # Disable some optimizers that might cause failure in symbolic shape inference or attention fusion. disabled_optimizers += ( @@ -276,10 +296,11 @@ def optimize_model( temp_model_path = optimize_by_onnxruntime( input, use_gpu=use_gpu, + optimized_model_path=optimized_model_path, opt_level=opt_level, disabled_optimizers=disabled_optimizers, verbose=verbose, - optimized_model_path=optimized_model_path, + save_as_external_data=use_external_data_format, ) elif opt_level == 1: # basic optimizations (like constant folding and cast elimination) are not specified to execution provider. @@ -289,10 +310,11 @@ def optimize_model( temp_model_path = optimize_by_onnxruntime( input, use_gpu=use_gpu, + optimized_model_path=optimized_model_path, opt_level=1, disabled_optimizers=disabled_optimizers, verbose=verbose, - optimized_model_path=optimized_model_path, + save_as_external_data=use_external_data_format, ) if only_onnxruntime and not temp_model_path: @@ -474,6 +496,7 @@ def main(): optimization_options=optimization_options, use_gpu=args.use_gpu, only_onnxruntime=args.only_onnxruntime, + use_external_data_format=args.use_external_data_format, ) if args.float16: diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc b/onnxruntime/test/framework/save_model_with_external_initializers.cc index b1cb65a82b129..bafec520d18a6 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+#include "core/common/path_string.h" #include "core/framework/data_types.h" #include "core/graph/model.h" #include "core/framework/tensorprotoutils.h" @@ -41,6 +42,7 @@ void LoadSaveAndCompareModel(const std::string& input_onnx, ASSERT_EQ(initializers.size(), initializers_from_external.size()); // Compare the initializers of the two versions. + Path external_data_path{}; for (auto i : initializers) { const std::string kInitName = i.first; const ONNX_NAMESPACE::TensorProto* tensor_proto = i.second; @@ -51,7 +53,9 @@ void LoadSaveAndCompareModel(const std::string& input_onnx, size_t tensor_proto_size = tensor_proto_data.size(); std::vector from_external_tensor_proto_data; - ORT_THROW_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, Path(), from_external_tensor_proto_data)); + Path model_path = Path::Parse(ToPathString(output_onnx)); + external_data_path = model_path.ParentPath().Append(Path::Parse(ToPathString(external_init_file))); + ORT_THROW_IF_ERROR(utils::UnpackInitializerData(*from_external_tensor_proto, model_path, from_external_tensor_proto_data)); size_t from_external_tensor_proto_size = from_external_tensor_proto_data.size(); if (from_external_tensor_proto_size < initializer_size_threshold) { @@ -67,7 +71,7 @@ void LoadSaveAndCompareModel(const std::string& input_onnx, } // Cleanup. ASSERT_EQ(std::remove(output_onnx.c_str()), 0); - ASSERT_EQ(std::remove(external_init_file.c_str()), 0); + ASSERT_EQ(std::remove(PathToUTF8String(external_data_path.ToPathString()).c_str()), 0); } TEST(SaveWithExternalInitializers, Mnist) { diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e18a6276cd2c7..e2e2aa8d850f8 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -121,6 +121,30 @@ def testModelSerializationWithExternalInitializers(self): # noqa: N802 else: raise onnxruntime_error + def testModelSerializationWithExternalInitializersToDirectory(self): # noqa: N802 + try: + so = onnxrt.SessionOptions() + so.log_severity_level = 1 + so.logid = "TestModelSerializationWithExternalInitializersToDirectory" + directory = "./testdata/" + so.optimized_model_filepath = os.path.join(directory, "model_with_external_initializers_in_dir.onnx") + external_initializers_file = "external_initializers_in_dir.bin" + so.add_session_config_entry( + "session.optimized_model_external_initializers_file_name", external_initializers_file + ) + so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100") + onnxrt.InferenceSession(get_name("mnist.onnx"), sess_options=so, providers=["CPUExecutionProvider"]) + self.assertTrue(os.path.isfile(so.optimized_model_filepath)) + self.assertTrue(os.path.isfile(os.path.join(directory, external_initializers_file))) + except Fail as onnxruntime_error: + if ( + str(onnxruntime_error) == "[ONNXRuntimeError] : 1 : FAIL : Unable to serialize model as it contains" + " compiled nodes. Please disable any execution providers which generate compiled nodes." + ): + pass + else: + raise onnxruntime_error + def testGetProviders(self): # noqa: N802 self.assertTrue("CPUExecutionProvider" in onnxrt.get_available_providers()) # get_all_providers() returns the default EP order from highest to lowest. 
From b508c7236f98d1fc200cec6a6df97dfd49c558d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Fri, 21 Jul 2023 04:52:19 +0200 Subject: [PATCH 12/34] Replace call to deprecated torch.norm (#16758) ### Description torch.norm is deprecated as mentioned in issue #16751. This PR replaces the call to torch.norm by the options suggested by torch documentation. --- .../orttraining/python/training/optim/_modifier.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/orttraining/orttraining/python/training/optim/_modifier.py b/orttraining/orttraining/python/training/optim/_modifier.py index e9296bc63d560..30178bf4845b2 100644 --- a/orttraining/orttraining/python/training/optim/_modifier.py +++ b/orttraining/orttraining/python/training/optim/_modifier.py @@ -137,7 +137,13 @@ def param_is_not_tensor_parallel_duplicate(param): else: for grad in grads_for_norm: - grad_norm = torch.norm(grad, norm_type) + # torch.norm is deprecated and moved to torch.linalg.norm + # with a different signature + # see https://pytorch.org/docs/stable/generated/torch.norm.html + if norm_type in {"fro", "nuc"}: + grad_norm = torch.linalg.matrix_norm(grad, norm_type) + else: + grad_norm = torch.linalg.norm(grad, norm_type) total_norm += grad_norm**norm_type if horizontal_model_parallel_grad_norm_aggregation: From 4d569f6586d109983e9059b64cd6c2dcaea643e2 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 20 Jul 2023 20:57:48 -0700 Subject: [PATCH 13/34] [QNN EP] Op support: LayerNorm, Asin, Sign (#16740) ### Description Add op support for LayerNorm, Asin, Sign. Enable QDQ node unit support for Sin Op --------- Co-authored-by: Adrian Lizarraga --- .../selectors_actions/qdq_selectors.cc | 18 +- .../selectors_actions/qdq_selectors.h | 4 +- .../selectors_actions/shared/utils.cc | 16 +- .../qnn/builder/op_builder_factory.cc | 6 + .../qnn/builder/op_builder_factory.h | 2 + .../qnn/builder/opbuilder/base_op_builder.h | 166 +++++++++--------- .../opbuilder/layer_norm_op_builder.cc | 112 ++++++++++++ .../core/providers/qnn/builder/qnn_def.h | 1 + .../test/providers/cpu/math/sign_test.cc | 3 +- .../test/providers/qnn/layer_norm_test.cc | 139 +++++++++++++++ .../test/providers/qnn/simple_op_htp_test.cc | 21 +++ 11 files changed, 392 insertions(+), 96 deletions(-) create mode 100644 onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc create mode 100644 onnxruntime/test/providers/qnn/layer_norm_test.cc diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 88302dbd33269..565afcc67e7df 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -330,23 +330,29 @@ bool WhereNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& dt_input_1 == dt_output; } -bool InstanceNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, - const Node& node, - const std::vector& dq_nodes, - const std::vector& q_nodes) const { +bool InstanceAndLayerNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, + const Node& node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes)) { return false; } int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); int32_t dt_scale = 
dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); - int32_t dt_bias = dq_nodes[2]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + int32_t dt_bias = 0; + bool has_bias = false; + // bias is optional for LayerNorm + if (dq_nodes.size() > 2) { + has_bias = true; + dt_bias = dq_nodes[2]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + } int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); // Input, output, and scale need to be the same type. The bias is int32. return (dt_input == dt_output) && (dt_input == dt_scale) && - (dt_bias == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32); + (has_bias ? dt_bias == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32 : true); } bool BatchNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 1c165d1787b22..ab9ad45697dfa 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -139,7 +139,7 @@ class GemmNodeGroupSelector : public NodeGroupSelector { // Input: DQ nodes for input, scale, and B // Output: Q node for output -class InstanceNormalizationNodeGroupSelector : public NodeGroupSelector { +class InstanceAndLayerNormalizationNodeGroupSelector : public NodeGroupSelector { private: bool Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, @@ -264,7 +264,7 @@ class GemmSelector : public BaseSelector { class InstanceNormalizationSelector : public BaseSelector { public: InstanceNormalizationSelector() - : BaseSelector(std::make_unique()) {} + : BaseSelector(std::make_unique()) {} }; // DQ nodes for X, W and optionally B, (mean, var not required) -> node -> Q diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc index 2cf726f8ad90b..4f24fa26d8896 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc @@ -62,6 +62,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() { {"Softmax", {}}, {"Sqrt", {}}, {"Atan", {}}, + {"Asin", {}}, + {"Sin", {}}, + {"Sign", {}}, {"Tanh", {}}, {"Exp", {}}, {"LRN", {}}}; @@ -88,8 +91,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetMatMulOpVersionsMap() { static const OpVersionsAndSelector::OpVersionsMap GetGemmOpVersionsMap() { return {{"Gemm", {}}}; } -static const OpVersionsAndSelector::OpVersionsMap GetInstanceNormalizationOpVersionsMap() { - return {{"InstanceNormalization", {}}}; +static const OpVersionsAndSelector::OpVersionsMap GetInstanceAndLayerNormalizationOpVersionsMap() { + return {{"InstanceNormalization", {}}, + {"LayerNormalization", {}}}; } static const OpVersionsAndSelector::OpVersionsMap GetBatchNormalizationOpVersionsMap() { return {{"BatchNormalization", {}}}; @@ -167,10 +171,10 @@ void RegisterGemmSelector(Selectors& qdq_selectors) { std::move(selector)); } -void RegisterInstanceNormalizationSelector(Selectors& qdq_selectors) { +void RegisterInstanceAndLayerNormalizationSelector(Selectors& qdq_selectors) { /* register selector for InstanceNormalization op */ - std::unique_ptr selector = 
std::make_unique(); - qdq_selectors.RegisterSelector(GetInstanceNormalizationOpVersionsMap(), + std::unique_ptr selector = std::make_unique(); + qdq_selectors.RegisterSelector(GetInstanceAndLayerNormalizationOpVersionsMap(), std::move(selector)); } @@ -198,7 +202,7 @@ void SelectorManager::CreateSelectors() { RegisterConvTransposeSelector(qdq_selectors_); RegisterMatMulSelector(qdq_selectors_); RegisterGemmSelector(qdq_selectors_); - RegisterInstanceNormalizationSelector(qdq_selectors_); + RegisterInstanceAndLayerNormalizationSelector(qdq_selectors_); RegisterBatchNormalizationSelector(qdq_selectors_); RegisterLogicalComparisonSelectors(qdq_selectors_); } diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 05dc2e696302c..eb658f58cd686 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -15,12 +15,14 @@ namespace qnn { OpBuilderRegistrations::OpBuilderRegistrations() { { CreateSimpleOpBuilder("Add", *this); + CreateSimpleOpBuilder("Asin", *this); CreateSimpleOpBuilder("Atan", *this); CreateSimpleOpBuilder("Mul", *this); CreateSimpleOpBuilder("Abs", *this); CreateSimpleOpBuilder("And", *this); CreateSimpleOpBuilder("Ceil", *this); CreateSimpleOpBuilder("Cos", *this); + CreateSimpleOpBuilder("Sign", *this); CreateSimpleOpBuilder("Div", *this); CreateSimpleOpBuilder("Equal", *this); CreateSimpleOpBuilder("Exp", *this); @@ -136,6 +138,10 @@ OpBuilderRegistrations::OpBuilderRegistrations() { CreateBatchNormOpBuilder("BatchNormalization", *this); } + { + CreateLayerNormOpBuilder("LayerNormalization", *this); + } + { CreateLRNOpBuilder("LRN", *this); } diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h index 8f66df7bdcefe..694cfb5ce0046 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h @@ -82,6 +82,8 @@ void CreateReduceOpBuilder(const std::string& op_type, OpBuilderRegistrations& o void CreateBatchNormOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateLayerNormOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); } // namespace qnn diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 4bc6d5ce0327b..df1d0ac83d0c7 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -81,86 +81,89 @@ class BaseOpBuilder : public IOpBuilder { static const std::string& GetQnnOpType(const std::string& onnx_op_type) { // TODO: Use QNN operator names defined in "QnnOpDef.h" static const std::unordered_map onnx_op_type_to_qnn_op_type = { - {"Add", "ElementWiseAdd"}, - {"Mul", "ElementWiseMultiply"}, - {"Abs", "ElementWiseAbs"}, - {"And", "ElementWiseAnd"}, - {"Atan", "ElementWiseAtan"}, - {"Ceil", "ElementWiseCeil"}, - {"Cast", "Cast"}, - {"Clip", "ReluMinMax"}, - {"Cos", "ElementWiseCos"}, - {"Div", "ElementWiseDivide"}, - {"Equal", "ElementWiseEqual"}, - {"Exp", "ElementWiseExp"}, - {"Floor", "ElementWiseFloor"}, - {"Gather", "Gather"}, - {"Greater", "ElementWiseGreater"}, - {"GreaterOrEqual", 
"ElementWiseGreaterEqual"}, - {"Less", "ElementWiseLess"}, - {"LessOrEqual", "ElementWiseLessEqual"}, - {"Log", "ElementWiseLog"}, - {"Max", "ElementWiseMaximum"}, - {"Min", "ElementWiseMinimum"}, - {"Neg", "ElementWiseNeg"}, - {"Not", "ElementWiseNot"}, - {"Or", "ElementWiseOr"}, - {"Pow", "ElementWisePower"}, - {"PRelu", "Prelu"}, - {"LeakyRelu", "Prelu"}, - {"ReduceMax", "ReduceMax"}, - {"ReduceMean", "ReduceMean"}, - {"ReduceMin", "ReduceMin"}, - {"ReduceProd", "ReduceProd"}, - {"ReduceSum", "ReduceSum"}, - {"Round", "ElementWiseRound"}, - {"Where", "ElementWiseSelect"}, - {"Sigmoid", "Sigmoid"}, - {"Sin", "ElementWiseSin"}, - {"Slice", "StridedSlice"}, - {"Split", "Split"}, - {"Softmax", "Softmax"}, - {"Sqrt", "ElementWiseSquareRoot"}, - {"Sub", "ElementWiseSubtract"}, - {"Tanh", "Tanh"}, - {"Transpose", "Transpose"}, - - {"DequantizeLinear", "Dequantize"}, - {"QuantizeLinear", "Quantize"}, - - {"MatMul", "MatMul"}, - - {"Elu", "Elu"}, - {"Relu", "Relu"}, - {"Gelu", "Gelu"}, - {"Sigmoid", "Sigmoid"}, - - {"HardSwish", "HardSwish"}, - - {"Conv", "Conv2d"}, - {"ConvTranspose", "TransposeConv2d"}, - - {"GlobalAveragePool", "PoolAvg2d"}, - {"AveragePool", "PoolAvg2d"}, - {"MaxPool", "PoolMax2d"}, - - {"Reshape", "Reshape"}, - {"Resize", "Resize"}, - {"Flatten", "Reshape"}, - {"Squeeze", "Reshape"}, - {"Unsqueeze", "Reshape"}, - - {"LogSoftmax", "LogSoftmax"}, - {"Concat", "Concat"}, - - {"Gemm", "FullyConnected"}, - - {"ArgMax", "Argmax"}, - {"ArgMin", "Argmin"}, - {"Tile", "Tile"}, - {"TopK", "TopK"}, - {"InstanceNormalization", "InstanceNorm"}, - {"BatchNormalization", "Batchnorm"}, + {"Add", QNN_OP_ELEMENT_WISE_ADD}, + {"Mul", QNN_OP_ELEMENT_WISE_MULTIPLY}, + {"Abs", QNN_OP_ELEMENT_WISE_ABS}, + {"And", QNN_OP_ELEMENT_WISE_AND}, + {"Asin", QNN_OP_ELEMENT_WISE_ASIN}, + {"Atan", QNN_OP_ELEMENT_WISE_ATAN}, + {"Ceil", QNN_OP_ELEMENT_WISE_CEIL}, + {"Sign", QNN_OP_ELEMENT_WISE_SIGN}, + {"Cast", QNN_OP_CAST}, + {"Clip", QNN_OP_RELU_MIN_MAX}, + {"Cos", QNN_OP_ELEMENT_WISE_COS}, + {"Div", QNN_OP_ELEMENT_WISE_DIVIDE}, + {"Equal", QNN_OP_ELEMENT_WISE_EQUAL}, + {"Exp", QNN_OP_ELEMENT_WISE_EXP}, + {"Floor", QNN_OP_ELEMENT_WISE_FLOOR}, + {"Gather", QNN_OP_GATHER}, + {"Greater", QNN_OP_ELEMENT_WISE_GREATER}, + {"GreaterOrEqual", QNN_OP_ELEMENT_WISE_GREATER_EQUAL}, + {"Less", QNN_OP_ELEMENT_WISE_LESS}, + {"LessOrEqual", QNN_OP_ELEMENT_WISE_LESS_EQUAL}, + {"Log", QNN_OP_ELEMENT_WISE_LOG}, + {"Max", QNN_OP_ELEMENT_WISE_MAXIMUM}, + {"Min", QNN_OP_ELEMENT_WISE_MINIMUM}, + {"Neg", QNN_OP_ELEMENT_WISE_NEG}, + {"Not", QNN_OP_ELEMENT_WISE_NOT}, + {"Or", QNN_OP_ELEMENT_WISE_OR}, + {"Pow", QNN_OP_ELEMENT_WISE_POWER}, + {"PRelu", QNN_OP_PRELU}, + {"LeakyRelu", QNN_OP_PRELU}, + {"ReduceMax", QNN_OP_REDUCE_MAX}, + {"ReduceMean", QNN_OP_REDUCE_MEAN}, + {"ReduceMin", QNN_OP_REDUCE_MIN}, + {"ReduceProd", QNN_OP_REDUCE_PROD}, + {"ReduceSum", QNN_OP_REDUCE_SUM}, + {"Round", QNN_OP_ELEMENT_WISE_ROUND}, + {"Where", QNN_OP_ELEMENT_WISE_SELECT}, + {"Sigmoid", QNN_OP_SIGMOID}, + {"Sin", QNN_OP_ELEMENT_WISE_SIN}, + {"Slice", QNN_OP_STRIDED_SLICE}, + {"Split", QNN_OP_SPLIT}, + {"Softmax", QNN_OP_SOFTMAX}, + {"Sqrt", QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, + {"Sub", QNN_OP_ELEMENT_WISE_SUBTRACT}, + {"Tanh", QNN_OP_TANH}, + {"Transpose", QNN_OP_TRANSPOSE}, + + {"DequantizeLinear", QNN_OP_DEQUANTIZE}, + {"QuantizeLinear", QNN_OP_QUANTIZE}, + + {"MatMul", QNN_OP_MAT_MUL}, + + {"Elu", QNN_OP_ELU}, + {"Relu", QNN_OP_RELU}, + {"Gelu", QNN_OP_GELU}, + {"Sigmoid", QNN_OP_SIGMOID}, + + {"HardSwish", QNN_OP_HARD_SWISH}, + + {"Conv", 
QNN_OP_CONV_2D}, + {"ConvTranspose", QNN_OP_TRANSPOSE_CONV_2D}, + + {"GlobalAveragePool", QNN_OP_POOL_AVG_2D}, + {"AveragePool", QNN_OP_POOL_AVG_2D}, + {"MaxPool", QNN_OP_POOL_MAX_2D}, + + {"Reshape", QNN_OP_RESHAPE}, + {"Resize", QNN_OP_RESIZE}, + {"Flatten", QNN_OP_RESHAPE}, + {"Squeeze", QNN_OP_RESHAPE}, + {"Unsqueeze", QNN_OP_RESHAPE}, + + {"LogSoftmax", QNN_OP_LOG_SOFTMAX}, + {"Concat", QNN_OP_CONCAT}, + + {"Gemm", QNN_OP_FULLY_CONNECTED}, + + {"ArgMax", QNN_OP_ARGMAX}, + {"ArgMin", QNN_OP_ARGMIN}, + {"Tile", QNN_OP_TILE}, + {"TopK", QNN_OP_TOP_K}, + {"InstanceNormalization", QNN_OP_INSTANCE_NORM}, + {"BatchNormalization", QNN_OP_BATCHNORM}, + {"LayerNormalization", QNN_OP_LAYER_NORM}, {"LRN", QNN_OP_LRN}}; auto it = onnx_op_type_to_qnn_op_type.find(onnx_op_type); @@ -262,7 +265,8 @@ class BaseOpBuilder : public IOpBuilder { static const std::unordered_map> input_output_count_qnn_required = { {"GlobalAveragePool", {0, 1}}, {"MaxPool", {0, 1}}, - {"BatchNormalization", {3, 1}}}; + {"BatchNormalization", {3, 1}}, + {"LayerNormalization", {0, 1}}}; auto pos = input_output_count_qnn_required.find(onnx_op_type); if (pos == input_output_count_qnn_required.end()) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc new file mode 100644 index 0000000000000..a6bbb3b872845 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -0,0 +1,112 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/common.h" +#include "core/providers/shared/utils/utils.h" +#include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/common/safeint.h" +#include "onnx/defs/data_type_utils.h" + +#include "base_op_builder.h" + +namespace onnxruntime { +namespace qnn { + +class LayerNormOpBuilder : public BaseOpBuilder { + public: + LayerNormOpBuilder() : BaseOpBuilder("LayerNormOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(LayerNormOpBuilder); + + Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + bool is_quantized_model) const override final ORT_MUST_USE_RESULT; + + protected: + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool is_quantized_model, + bool do_op_validation) const override ORT_MUST_USE_RESULT; +}; + +// Instance normalization op is sensitive to data layout. +// The nodes from 1st call of GetCapability do not get layout transformer applied, so their shapes are still NCHW. +// The nodes from 2nd call of GetCapability get their layout transformed to NHWC. +// Therefore, we need to check the node domain to determine if the layout has been transformed. +Status LayerNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + bool is_quantized_model) const { + const auto float_elem_type = ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float"); + + // Check input type is float for CPU. 
+ const auto& inputs = node_unit.Inputs(); + ONNX_NAMESPACE::DataType input_data_type = inputs[0].node_arg.Type(); + ORT_RETURN_IF(!is_quantized_model && input_data_type != float_elem_type, "QNN LayerNorm data type ", input_data_type->c_str(), " is not supported in CPU backend."); + + // Also check output type is float for CPU. + const auto& outputs = node_unit.Outputs(); + ONNX_NAMESPACE::DataType output_data_type = outputs[0].node_arg.Type(); + ORT_RETURN_IF(!is_quantized_model && output_data_type != float_elem_type, "QNN LayerNorm data type ", output_data_type->c_str(), " is not supported in CPU backend."); + ORT_RETURN_IF(outputs.size() > 1, "QNN LayerNorm only support 1 output."); + + return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, is_quantized_model, true); +} + +Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool is_quantized_model, + bool do_op_validation) const { + NodeAttrHelper node_helper(node_unit); + std::vector param_tensor_names; + + const float epsilon = node_helper.Get("epsilon", 1e-05f); // Default is 1e-05 according to ONNX spec. + Qnn_Scalar_t epsilon_param = QNN_SCALAR_INIT; + epsilon_param.dataType = QNN_DATATYPE_FLOAT_32; + epsilon_param.floatValue = epsilon; + QnnParamWrapper epsilon_param_wrapper(node_unit.Index(), + node_unit.Name(), + QNN_OP_LAYER_NORM_PARAM_EPSILON, + epsilon_param); + param_tensor_names.push_back(epsilon_param_wrapper.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(epsilon_param_wrapper)); + + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(node_unit.Inputs()[0].node_arg, input_shape), "Cannot get shape of input 0"); + const size_t input_rank = input_shape.size(); + int32_t default_axis = -1; + Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT; + ORT_RETURN_IF_ERROR(ProcessAxisAttribute(qnn_model_wrapper, node_unit, axis_qnn_scalar, default_axis)); + size_t axes_rank = input_rank - static_cast(default_axis); + std::vector axes(axes_rank, 0); + std::vector axes_shape{SafeInt(axes_rank)}; + axes[0] = static_cast(default_axis); + for (size_t i = 1; i < axes.size(); ++i) { + axes[i] = axes[i - 1] + 1; + } + + QnnParamWrapper axes_param(node_unit.Index(), node_unit.Name(), QNN_OP_LAYER_NORM_PARAM_AXES, + std::move(axes_shape), std::move(axes)); + param_tensor_names.push_back(axes_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(axes_param)); + + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, + std::move(input_names), + std::move(param_tensor_names), + logger, is_quantized_model, do_op_validation, GetQnnOpType(node_unit.OpType()))); + + return Status::OK(); +} + +void CreateLayerNormOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h index 733683f8aa6a8..c096d5d88972d 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_def.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h @@ -439,6 +439,7 @@ typedef struct GraphConfigInfo { namespace qnn_def { const std::string package_name = "qti.aisw"; +// TODO: remove these parameter name, re-use from QnnOpDef.h const std::string dilation = "dilation"; const std::string pad_amount = "pad_amount"; const 
std::string stride = "stride"; diff --git a/onnxruntime/test/providers/cpu/math/sign_test.cc b/onnxruntime/test/providers/cpu/math/sign_test.cc index 1a657637b9e4c..12844068c47d2 100644 --- a/onnxruntime/test/providers/cpu/math/sign_test.cc +++ b/onnxruntime/test/providers/cpu/math/sign_test.cc @@ -140,7 +140,8 @@ TEST(MathOpTest, Sign_int64) { std::vector output; TestImpl(input.cbegin(), input.cend(), std::back_inserter(output)); test.AddOutput("output", input_dims, output); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); + // TODO: QNN execute error, need further investigation + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kQnnExecutionProvider}); } TEST(MathOpTest, Sign_float) { diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc new file mode 100644 index 0000000000000..97917a2816c3a --- /dev/null +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include "core/graph/graph.h" + +#include "test/optimizer/qdq_test_utils.h" +#include "test/providers/qnn/qnn_test_utils.h" + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +static void RunLayerNormCpuTest(const std::vector& shape) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnCpu.dll"; +#else + provider_options["backend_path"] = "libQnnCpu.so"; +#endif + + auto BuildLayerNormTestCase = [](const std::vector& shape) -> GetTestModelFn { + return [shape](ModelTestBuilder& builder) { + // Random input data + auto input = builder.MakeInput(shape, 0.0f, 10.0f); + auto scale = builder.MakeInput(shape, 0.0f, 10.0f); + + auto* output = builder.MakeOutput(); + Node& layer_norm_node = builder.AddNode("LayerNormalization", {input, scale}, {output}); + + layer_norm_node.AddAttribute("axis", static_cast(0)); + }; + }; + + constexpr int expected_nodes_in_partition = 1; + RunQnnModelTest(BuildLayerNormTestCase(shape), + provider_options, + 13, + ExpectedEPNodeAssignment::All, + expected_nodes_in_partition); +} + +TEST_F(QnnCPUBackendTests, TestLayerNorm) { + RunLayerNormCpuTest({2, 3}); +} + +TEST_F(QnnCPUBackendTests, TestLayerNorm1D) { + RunLayerNormCpuTest({1, 2, 3}); +} + +TEST_F(QnnCPUBackendTests, TestLayerNorm2D) { + RunLayerNormCpuTest({1, 2, 3, 3}); +} + +TEST_F(QnnCPUBackendTests, TestLayerNorm3D) { + RunLayerNormCpuTest({1, 2, 3, 3, 4}); +} + +template +GetQDQTestCaseFn BuildQDQLayerNormTestCase(const std::vector& input_shape, + const std::vector& scale_shape, + int64_t axis_value = 0) { + return [input_shape, scale_shape, axis_value](ModelTestBuilder& builder) { + const InputQType quant_zero_point = 0; + // const float quant_scale = 1.0f; + + auto* input = builder.MakeInput(input_shape, std::numeric_limits::min(), + std::numeric_limits::max()); + auto* dq_input = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input, 0.0039f, quant_zero_point, dq_input); + + auto* dq_scale_output = builder.MakeIntermediate(); + auto* scale = builder.MakeInitializer(scale_shape, static_cast(1), static_cast(127)); + builder.AddDequantizeLinearNode(scale, 0.0028f, quant_zero_point, dq_scale_output); + + auto* layernorm_output = builder.MakeIntermediate(); + Node& layer_norm_node = 
builder.AddNode("LayerNormalization", {dq_input, dq_scale_output}, {layernorm_output}); + layer_norm_node.AddAttribute("axis", axis_value); + + auto* q_output = builder.MakeIntermediate(); + builder.AddQuantizeLinearNode(layernorm_output, 0.00377f, quant_zero_point, q_output); + + auto* final_output = builder.MakeOutput(); + builder.AddDequantizeLinearNode(q_output, 0.00377f, + quant_zero_point, + final_output); + }; +} + +/** + * Runs an LayerNormalization model on the QNN HTP backend. Checks the graph node assignment, and that inference + * outputs for QNN and CPU match. + * + * \param input_shape The input's shape. + * \param scale_shape The scale's shape. + * \param expected_ep_assignment How many nodes are expected to be assigned to QNN (All, Some, or None). + * \param num_modes_in_graph The number of expected nodes in the graph. + * \param axis_value The axis value. + */ +static void RunLayerNormQDQTest(const std::vector& input_shape, + const std::vector& scale_shape, + ExpectedEPNodeAssignment expected_ep_assignment, + int64_t axis_value = 0) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + + // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. + RunQnnModelTest(BuildQDQLayerNormTestCase(input_shape, scale_shape, axis_value), + provider_options, + 11, + expected_ep_assignment); +} + +// Check that QNN compiles DQ -> LayerNormalization -> Q as a single unit. +// Use an input of rank 3. +// Failed QNN op validation: QnnDsp Param[0] has incorrect Value 3 +TEST_F(QnnHTPBackendTests, DISABLED_TestQDQLayerNorm1DAxis0) { + RunLayerNormQDQTest({1, 2, 3}, {1, 2, 3}, ExpectedEPNodeAssignment::All); +} + +// Failed QNN FinalizeGraphs: QnnDsp Failed to finalize graph (id: 1) with err 1002 +TEST_F(QnnHTPBackendTests, DISABLED_TestQDQLayerNorm1DAxis2) { + RunLayerNormQDQTest({1, 2, 3}, {3}, ExpectedEPNodeAssignment::All, -1); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif \ No newline at end of file diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index fd6ab0011db30..93bd96e9549e8 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -154,6 +154,27 @@ TEST_F(QnnHTPBackendTests, TestQDQAtanTest) { "Atan", {}, 11, ExpectedEPNodeAssignment::All); } +// Check that QNN compiles DQ -> Asin -> Q as a single unit. +// Use an input of rank 3. +TEST_F(QnnHTPBackendTests, TestQDQAsinTest) { + RunQDQSingleInputOpTest(TestInputDef({1, 2, 3}, false, 0, 1), // input range 0 ~ 1 + "Asin", {}, 11, ExpectedEPNodeAssignment::All); +} + +// Check that QNN compiles DQ -> Sign -> Q as a single unit. +// Use an input of rank 3. +TEST_F(QnnHTPBackendTests, TestQDQSignTest) { + RunQDQSingleInputOpTest(TestInputDef({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()), + "Sign", {}, 11, ExpectedEPNodeAssignment::All); +} + +// Check that QNN compiles DQ -> Sign -> Q as a single unit. +// Use an input of rank 3. +TEST_F(QnnHTPBackendTests, TestQDQSinTest) { + RunQDQSingleInputOpTest(TestInputDef({1, 2, 3}, false, UInt8Limits::min(), UInt8Limits::max()), + "Sin", {}, 11, ExpectedEPNodeAssignment::All); +} + // Check that QNN compiles DQ -> Softmax -> Q as a single unit. 
// Test that the default axis (-1) for SoftMax opset 13 works. TEST_F(QnnHTPBackendTests, TestQDQSoftmax13_DefaultAxis) { From 1e18efade5d293033900ea1c7ff1ea746fa9f17f Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 20 Jul 2023 21:58:29 -0700 Subject: [PATCH 14/34] [C#] Add ML Sequences and Maps Create and Process APIs (#16648) ### Description 1) Added Sequence And Maps convenience APIs to create input Sequences and Maps and also visit the outputs. 2) Address OrtValue design issue when the values are created on top of the managed memory and the ortValues are used for sequence and maps creation. We should retain the original managed instances that keep the memory pinned. We opt to keep track of those and dispose of them within an instance of OrtValue that represents a Map or a Sequence. 3) Set `LangVersion` to default per [MS Versioning Docs.](https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/configure-language-version) ### Motivation and Context 1) When writing code examples, use of Map and Sequences API proved to be cumbersome. 2) It is a BUG, that we should address, as the managed memory can move by the GC and lead to intermittent crashes. 3) Make use of the most feature of the C#. --- .../DisposableNamedOnnxValue.shared.cs | 7 + .../FixedBufferOnnxValue.shared.cs | 2 +- .../ManagedProjections.shared.cs | 56 +-- .../Microsoft.ML.OnnxRuntime.csproj | 2 +- .../OrtValue.shared.cs | 377 ++++++++++++++++-- ...rosoft.ML.OnnxRuntime.EndToEndTests.csproj | 11 +- ...crosoft.ML.OnnxRuntime.Tests.Common.csproj | 2 +- .../OrtValueTests.cs | 134 ++++--- .../TestDataLoader.cs | 217 +++++----- ...oft.ML.OnnxRuntime.Tests.NetCoreApp.csproj | 2 +- 10 files changed, 581 insertions(+), 229 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/DisposableNamedOnnxValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/DisposableNamedOnnxValue.shared.cs index 691aa59927953..6d69f58d20413 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/DisposableNamedOnnxValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/DisposableNamedOnnxValue.shared.cs @@ -21,6 +21,7 @@ public interface IDisposableReadOnlyCollection : IReadOnlyCollection, IRea internal class DisposableList : List, IDisposableReadOnlyCollection where T : IDisposable { + private bool _disposed; public DisposableList() { } public DisposableList(int count) : base(count) { } @@ -30,6 +31,11 @@ public DisposableList(IEnumerable collection) : base(collection) { } protected virtual void Dispose(bool disposing) { + if (_disposed) + { + return; + } + if (disposing) { // Dispose in the reverse order. 
@@ -43,6 +49,7 @@ protected virtual void Dispose(bool disposing) this[i]?.Dispose(); } this.Clear(); + _disposed = true; } } diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.shared.cs index 56e0106e9e96a..3a29eea1bdae8 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.shared.cs @@ -89,7 +89,7 @@ public static FixedBufferOnnxValue CreateFromTensor(Tensor value) /// \endcode /// public static FixedBufferOnnxValue CreateFromMemory(OrtMemoryInfo memoryInfo, Memory memory, - TensorElementType elementType, long[] shape, long bytesSize) + TensorElementType elementType, long[] shape, long bytesSize) where T : unmanaged { if(elementType == TensorElementType.String) { diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs index 517fcb56836cc..e512a8c2612ae 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/ManagedProjections.shared.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.Linq; namespace Microsoft.ML.OnnxRuntime { @@ -24,8 +25,7 @@ internal class ManagedTypeProjection /// /// /// - /// - /// + /// OrtValye created accoding to the metadata internal static OrtValue CreateProjection(NamedOnnxValue namedOnnxValue, NodeMetadata metadata) { OrtValue result; @@ -67,8 +67,7 @@ internal static OrtValue CreateProjection(NamedOnnxValue namedOnnxValue, NodeMet /// /// NamedOnnxValue containing a IEnumerable /// sequence metadata - /// cleanup list - /// + /// OrtValue that represents a sequence /// private static OrtValue CreateSequenceProjection(NamedOnnxValue namedOnnxValue, NodeMetadata metadata) { @@ -84,8 +83,8 @@ private static OrtValue CreateSequenceProjection(NamedOnnxValue namedOnnxValue, capacity = collection.Count; } - // Record all the ortValues belonging to the sequence locally - using (var sequenceOrtValues = new DisposableList(capacity)) + DisposableList sequenceOrtValues = new(capacity); + try { foreach (var element in seqContainer) { @@ -97,7 +96,12 @@ private static OrtValue CreateSequenceProjection(NamedOnnxValue namedOnnxValue, sequenceOrtValues.Add(CreateProjection(element, elementMeta)); } - return OrtValue.CreateSequence(sequenceOrtValues); + return OrtValue.CreateSequence(ref sequenceOrtValues); + } + catch(Exception) + { + sequenceOrtValues?.Dispose(); + throw; } } @@ -107,7 +111,6 @@ private static OrtValue CreateSequenceProjection(NamedOnnxValue namedOnnxValue, /// /// /// - /// /// OrtValue /// private static OrtValue CreateMapProjection(NamedOnnxValue node, NodeMetadata elementMeta) @@ -123,9 +126,13 @@ private static OrtValue CreateMapProjection(NamedOnnxValue node, NodeMetadata el $"Node: {node.Name} onnxruntime only supports maps with primitive types values"); } - TensorBase keys = node.GetDictionaryKeys(); - using (OrtValue ortValueKeys = OrtValue.CreateFromTensorObject(keys, out TensorElementType elementTypeKeys)) + Span ortValues = new OrtValue[2]; + var disposableGuard = new DisposableArray(ortValues); + try { + TensorBase keys = node.GetDictionaryKeys(); + ortValues[0] = OrtValue.CreateFromTensorObject(keys, out TensorElementType elementTypeKeys); + if (elementTypeKeys != mapMeta.KeyDataType) { throw new OnnxRuntimeException(ErrorCode.InvalidArgument, @@ -133,39 +140,40 @@ 
private static OrtValue CreateMapProjection(NamedOnnxValue node, NodeMetadata el } TensorBase values = node.GetDictionaryValues(); - using (OrtValue ortValueValues = OrtValue.CreateFromTensorObject(values, out TensorElementType elementTypeValues)) + ortValues[1] = OrtValue.CreateFromTensorObject(values, out TensorElementType elementTypeValues); + if (elementTypeValues != mapValuesMeta.ElementDataType) { - if (elementTypeValues != mapValuesMeta.ElementDataType) - { - throw new OnnxRuntimeException(ErrorCode.InvalidArgument, - $"Map value data type supplied: {elementTypeValues} metadata expected: {mapValuesMeta.ElementDataType}"); - } - - // Create Map OrtValue - return OrtValue.CreateMap(ortValueKeys, ortValueValues); + throw new OnnxRuntimeException(ErrorCode.InvalidArgument, + $"Map value data type supplied: {elementTypeValues} metadata expected: {mapValuesMeta.ElementDataType}"); } + + // Create Map OrtValue + return OrtValue.CreateMap(ref ortValues[0], ref ortValues[1]); + } + catch (Exception) + { + disposableGuard.Dispose(); + throw; } } - /// /// This pins memory that is contained within DenseTensor. /// /// NodeOnnxValue containing DenseTensor /// - /// cleanup list /// /// private static OrtValue CreateTensorProjection(NamedOnnxValue node, NodeMetadata elementMeta) { - if (!(node.Value is TensorBase)) + if (node.Value is not TensorBase) { throw new OnnxRuntimeException(ErrorCode.InvalidArgument, $"NamedOnnxValue contains: {node.Value.GetType()}, expecting a Tensor"); } OrtValue ortValue = OrtValue.CreateFromTensorObject(node.Value as TensorBase, out TensorElementType elementType); - try + try { if (elementType != elementMeta.ElementDataType) { @@ -173,7 +181,7 @@ private static OrtValue CreateTensorProjection(NamedOnnxValue node, NodeMetadata $"Tensor element data type discovered: {elementType} metadata expected: {elementMeta.ElementDataType}"); } } - catch(Exception) + catch (Exception) { ortValue.Dispose(); throw; diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 981d802f47f8f..3c9f5cf6743a7 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -66,7 +66,7 @@ AnyCPU;x86 - 7.3 + default true true ..\..\OnnxRuntime.snk diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs index 23239594562bc..b3f3ee517d004 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs @@ -5,6 +5,7 @@ using System; using System.Buffers; using System.Collections.Generic; +using System.Collections.ObjectModel; using System.Diagnostics; using System.Linq; using System.Runtime.InteropServices; @@ -39,25 +40,60 @@ public enum OnnxValueType /// public class OrtValue : IOrtValueOwner, IDisposable { + // OrtValues that are members of Sequences or Maps that map. They potentially map managed memory and we need to keep them around. + // this exists only when we deal with compose ML types. + private DisposableList _compositeMembers; private IntPtr _handle; private MemoryHandle? _memHandle; // Present when the OrtValue is created on top of managed memory private bool _disposed; + internal OrtValue(IntPtr handle) + { + _handle = handle; + InitOnnxType(); + } + /// /// Constructor. 
The newly constructed OrtValue takes ownership of the native OrtValue instance - /// and disposes of it when the OrtValue instance is disposed. /// - /// Pointer to a native instance of OrtValue - /// OnnxValue type if known, otherwise the constructor would interrogate - /// the handle - internal OrtValue(IntPtr handle, OnnxValueType onnxValueType = OnnxValueType.ONNX_TYPE_UNKNOWN) + /// + /// + /// thrown when onnxValue type is not known + internal OrtValue(IntPtr handle, OnnxValueType onnxValueType) { + if (onnxValueType == OnnxValueType.ONNX_TYPE_UNKNOWN) + { + throw new ArgumentException("onnxValueType argument is passed as unknown"); + } + _handle = handle; OnnxType = onnxValueType; - if (OnnxType == OnnxValueType.ONNX_TYPE_UNKNOWN) + } + + /// + /// Constructor. The newly constructed OrtValue takes ownership of the native OrtValue instance + /// and disposes of it when the OrtValue instance is disposed. The instance will take ownership and will + /// dispose of compositeMembers instances. + /// + /// This constructor can only throw if OnnxType is not specified. + /// + /// native ortValue handle + /// must one of the valid types + /// For composite types this contains dependent ortValues such as members of a sequence + /// or keys/values for the map, that may have been created on top of the managed memory and must be disposed + /// with the new ortValue. This container will be taken the ownership of and the argument will be set to null. + /// throws when onnxValueType is not specified + internal OrtValue(IntPtr handle, OnnxValueType onnxValueType, ref DisposableList compositeMembers) + { + if (onnxValueType == OnnxValueType.ONNX_TYPE_UNKNOWN) { - InitOnnxType(); + throw new ArgumentException("onnxValueType argument is passed as unknown"); } + + _handle = handle; + OnnxType = onnxValueType; + _compositeMembers = compositeMembers; + compositeMembers = null; } /// @@ -165,7 +201,7 @@ public OrtValue GetValue(int index, OrtAllocator allocator) /// /// ReadOnlySpan /// - public ReadOnlySpan GetTensorDataAsSpan() where T : struct + public ReadOnlySpan GetTensorDataAsSpan() where T : unmanaged { var byteSpan = GetTensorBufferRawData(typeof(T)); return MemoryMarshal.Cast(byteSpan); @@ -185,7 +221,7 @@ public ReadOnlySpan GetTensorDataAsSpan() where T : struct /// /// /// Typed Span over the native buffer - public Span GetTensorMutableDataAsSpan() where T : struct + public Span GetTensorMutableDataAsSpan() where T : unmanaged { var byteSpan = GetTensorBufferRawData(typeof(T)); return MemoryMarshal.Cast(byteSpan); @@ -505,6 +541,7 @@ out IntPtr ortValueHandle /// A disposable OrtValue instance /// public static OrtValue CreateTensorValueFromMemory(OrtMemoryInfo memoryInfo, Memory memory, long[] shape) + where T : unmanaged { var typeInfo = TensorBase.GetTypeInfo(typeof(T)) ?? throw new OnnxRuntimeException(ErrorCode.InvalidArgument, $"Tensor of type: {typeof(T)} is not supported"); @@ -561,7 +598,7 @@ out IntPtr ortValueHandle /// managed data buffer /// shape that describes the buffer /// A disposable OrtValue instance - public static OrtValue CreateTensorValueFromMemory(T[] data, long[] shape) + public static OrtValue CreateTensorValueFromMemory(T[] data, long[] shape) where T : unmanaged { return OrtValue.CreateTensorValueFromMemory(OrtMemoryInfo.DefaultInstance, new Memory(data), shape); } @@ -847,56 +884,340 @@ public static OrtValue CreateFromStringTensor(Tensor tensor) /// All OrtValues in the collection must be of the same Onnx type. /// I.e. (Tensor, SparseTensor, Map, Sequence, etc.) 
/// - /// All OrtValues are internally ref-counted and stored within the sequence OrtValue - /// so the input OrtValues can be disposed of after this call. + /// The ortValues that are passed as argument are taken possession of by the newly + /// created OrtValue. The caller should not dispose them, unless this call fails. + /// + /// The ortValues would be empty on successful return. /// - /// a collection of OrtValues + /// a collection of OrtValues. On success the ortValues contained in the list + /// are taken ownership of and the list is cleared. /// A disposable instance of OrtValues /// - public static OrtValue CreateSequence(IReadOnlyCollection ortValues) + public static OrtValue CreateSequence(ICollection ortValues) { if (ortValues is null) { throw new ArgumentNullException(nameof(ortValues)); } - var handles = new IntPtr[ortValues.Count]; - for (int i = 0; i < ortValues.Count; i++) + if (ortValues.IsReadOnly) + { + throw new ArgumentException("ortValues argument can not be a readonly collection"); + } + + var compositeMembers = new DisposableList(ortValues); + try { - handles[i] = ortValues.ElementAt(i).Handle; + var result = CreateSequence(ref compositeMembers); + Debug.Assert(compositeMembers is null, "Must be null on success"); + ortValues.Clear(); + return result; + } + catch (Exception) + { + // The caller is responsible for disposing the ortValues + compositeMembers?.Clear(); + throw; + } + } + + /// + /// Creates a sequence from the values in compositeMembers + /// The argument is taken possession of and is nullified on successful return. + /// + /// sequence ortValues + /// OrtValue instance representing a Sequence + internal static OrtValue CreateSequence(ref DisposableList compositeMembers) + { + var handles = new IntPtr[compositeMembers.Count]; + for (int i = 0; i < compositeMembers.Count; i++) + { + handles[i] = compositeMembers[i].Handle; } NativeApiStatus.VerifySuccess(NativeMethods.OrtCreateValue(handles, - (UIntPtr)ortValues.Count, (IntPtr)OnnxValueType.ONNX_TYPE_SEQUENCE, - out IntPtr sequenceHandle)); - return new OrtValue(sequenceHandle, OnnxValueType.ONNX_TYPE_SEQUENCE); + (UIntPtr)handles.Length, (IntPtr)OnnxValueType.ONNX_TYPE_SEQUENCE, + out IntPtr sequenceHandle)); + + return new OrtValue(sequenceHandle, OnnxValueType.ONNX_TYPE_SEQUENCE, ref compositeMembers); } + /// + /// A delegate type that is expected to process each OrtValue in a sequence. + /// + /// OrtValue that holds sequence element + /// ordinal of the value + public delegate void SequenceElementVisitor(OrtValue ortValue, int index); + + /// + /// Feeds each OrtValue in a sequence to the visitor delegate. + /// This helps users to avoid dealing each value life-span + /// + /// visitor delegate + /// allocator to use for intermediate ort values + /// + public void ProcessSequence(SequenceElementVisitor visitor, OrtAllocator allocator) + { + if (OnnxType != OnnxValueType.ONNX_TYPE_SEQUENCE) + { + throw new OnnxRuntimeException(ErrorCode.InvalidArgument, + $"OrtValue.OnnxType of {OnnxType} is not a sequence"); + } + + int count = GetValueCount(); + for (int i = 0; i < count; i++) + { + using var ortValue = GetValue(i, allocator); + visitor(ortValue, i); + } + } /// /// Creates a map OrtValue with keys and values. - /// ORT supports only a subset of types for keys and values. - /// We are not restricting them here. + /// On a high level the Onnxruntime representation of the map always consists of two + /// OrtValues, keys and values. 
+ /// + /// According to ONNX standard map keys can be unmanaged types only (or strings). + /// Those keys are contained in a single tensor within OrtValue keys. + /// + /// Map values, on the other hand, can be composite types. The values parameter + /// can either contain a single tensor with unmanaged map values with the same number of + /// elements as the keys, or it can be a sequence of OrtValues, + /// each of those can be a composite type (tensor, sequence, map). If it is a sequence, + /// then the number of elements must match the number of elements in keys. /// - /// All OrtValues are internally ref-counted and stored within the map OrtValue - /// so the input OrtValues can be disposed of after this call. + /// Keys and values must be in the same order. + /// + /// ORT supports only a subset of types for keys and values, however, this API does not + /// restrict it. + /// + /// The ortValues that are passed as argument are taken possession of by the newly + /// created OrtValue. The caller should not dispose them, unless this call fails. + /// + /// Keys and values arguments will be set to null on success. /// /// Contains keys /// Contains values /// A disposable OrtValue /// - public static OrtValue CreateMap(OrtValue keys, OrtValue values) + public static OrtValue CreateMap(ref OrtValue keys, ref OrtValue values) { if (keys is null || values is null) { - throw new ArgumentNullException($"keys or/and values are null"); + throw new ArgumentNullException("keys or/and values are null"); } IntPtr[] handles = { keys.Handle, values.Handle }; NativeApiStatus.VerifySuccess( NativeMethods.OrtCreateValue(handles, (UIntPtr)handles.Length, (IntPtr)OnnxValueType.ONNX_TYPE_MAP, out IntPtr mapHandle)); - return new OrtValue(mapHandle, OnnxValueType.ONNX_TYPE_MAP); + + var compositeMembers = new DisposableList + { + keys, + values + }; + + keys = null; + values = null; + + // This constructor will not throw. + return new OrtValue(mapHandle, OnnxValueType.ONNX_TYPE_MAP, ref compositeMembers); + } + + /// + /// This API helps to quickly creates a map OrtValue with unmanaged (primitive) keys and values specified as arrays. + /// This helps the user not to create OrtValues for keys and values separately and deal only with the final result. + /// The map would consist of two tensors, one for keys and one for values. + /// + /// The OrtValues would be created on top of the managed memory arrays and use it directly. + /// The number of elements in keys and values must be the same and they must be in order. + /// + /// The types must be unmanaged. + /// + /// keys type + /// values type + /// array of keys of K type + /// array of values of V type + /// OrtValue instance + /// + /// + public static OrtValue CreateMap(K[] keys, V[] values) where K : unmanaged where V : unmanaged + { + if (keys is null || values is null) + { + throw new ArgumentNullException("Keys or/and values are null"); + } + + if (keys.Length != values.Length) + { + throw new ArgumentException("Expecting keys and values same len. 
" + + $"Received keys: {keys.Length}, Values: {values.Length}"); + } + + long[] shape = { keys.Length }; + Span ortValues = new OrtValue[2]; + var disposableGuard = new DisposableArray(ortValues); + try + { + ortValues[0] = CreateTensorValueFromMemory(keys, shape); + ortValues[1] = CreateTensorValueFromMemory(values, shape); + return CreateMap(ref ortValues[0], ref ortValues[1]); + } + catch (Exception) + { + disposableGuard.Dispose(); + throw; + } + } + + /// + /// Creates a map OrtValue with string keys and non-string values. + /// This helps the user not to create OrtValues for keys and values separately. + /// The number of elements in keys and values must be the same and they must be in order. + /// The map would consist of two tensors, one for keys and one for values. + /// + /// string keys would be converted to UTF-8 encoding and copied to an allocated native memory. + /// The OrtValue for values would be created on top of the managed memory using it directly. + /// + /// The values type must be unmanaged. + /// + /// + /// Collection of strings + /// + /// OrtValue instance + /// + /// + public static OrtValue CreateMapWithStringKeys(IReadOnlyCollection keys, V[] values) where V : unmanaged + { + if (keys is null || values is null) + { + throw new ArgumentNullException("Keys or/and values are null"); + } + + if (keys.Count != values.Length) + { + throw new ArgumentException("Expecting keys and values same len. " + + $"Received keys: {keys.Count}, Values: {values.Length}"); + } + + long[] shape = { keys.Count }; + + Span ortValues = new OrtValue[2]; + var disposableGuard = new DisposableArray(ortValues); + try + { + ortValues[0] = CreateTensorWithEmptyStrings(OrtAllocator.DefaultInstance, shape); + int count = 0; + foreach (var key in keys) + { + ortValues[0].FillStringTensorElement(key.AsSpan(), count++); + } + + ortValues[1] = CreateTensorValueFromMemory(values, shape); + return CreateMap(ref ortValues[0], ref ortValues[1]); + } + catch (Exception) + { + disposableGuard.Dispose(); + throw; + } + } + + /// + /// Creates a map OrtValue with non-string keys and string values. + /// + /// This helps the user not to create OrtValues for keys and values separately. + /// The number of elements in keys and values must be the same and they must be in order. + /// + /// The OrtValue for keys would be created on top of the managed memory using it directly. + /// string values would be converted to UTF-8 encoding and copied to an allocated native memory. + /// + /// + /// unmanaged type of keys + /// + /// collection of string values + /// Instance of OrtValue + /// + /// + public static OrtValue CreateMapWithStringValues(K[] keys, IReadOnlyCollection values) where K : unmanaged + { + if (keys is null || values is null) + { + throw new ArgumentNullException("Keys or/and values are null"); + } + + if (keys.Length != values.Count) + { + throw new ArgumentException("Expecting keys and values same len. 
" + + $"Received keys: {keys.Length}, Values: {values.Count}"); + } + + long[] shape = { keys.Length }; + Span ortValues = new OrtValue[2]; + var disposableGuard = new DisposableArray(ortValues); + try + { + ortValues[0] = CreateTensorValueFromMemory(keys, shape); + ortValues[1] = CreateTensorWithEmptyStrings(OrtAllocator.DefaultInstance, shape); + int count = 0; + foreach (var value in values) + { + ortValues[1].FillStringTensorElement(value.AsSpan(), count++); + } + return CreateMap(ref ortValues[0], ref ortValues[1]); + } + catch (Exception) + { + disposableGuard.Dispose(); + throw; + } + } + + /// + /// A public delegate that will be invoked once with map keys and values. + /// The delegate helps not to deal with the lifespan of intermediate OrtValues. + /// Typically, when one uses GetValue() API, it creates a copy of OrtValue + /// that points to the same buffer as keys or values. This API helps to deal with those + /// temporary instances and avoid leaks. + /// + /// According to ONNX standard map keys can be unmanaged types only (or strings). + /// Those keys are contained in a single tensor within OrtValue keys. So you can query those + /// directly from keys argument. + /// + /// Map values, on the other hand, can be composite types. The values parameter + /// can either contain a single tensor with unmanaged map values with the same number of + /// elements as the keys, or it can be a sequence of OrtValues, + /// each of those can be a composite type (tensor, sequence, map). If it is a sequence, + /// then the number of elements must match the number of elements in keys. + /// + /// Depending on the structure of the values, one will either directly query a single tensor + /// from values, or will have to iterate over the sequence of OrtValues and visit each of those + /// resulting in a recursive visitation. + /// + /// This would always represent a tensor + /// Can be any of the Onnx types, but they would all reduce to tensors eventually + public delegate void MapVisitor(OrtValue keys, OrtValue values); + + /// + /// This API helps the user to process a map OrtValue without + /// having to deal with the lifespan of intermediate OrtValues. + /// + /// each API value is fed to the vistor functor. 
+ /// + /// visitor function + /// Allocator to use for intermediate values + /// + public void ProcessMap(MapVisitor visitor, OrtAllocator allocator) + { + if (OnnxType != OnnxValueType.ONNX_TYPE_MAP) + { + throw new OnnxRuntimeException(ErrorCode.InvalidArgument, "This OrtValue does not represent a map"); + } + + using var keys = GetValue(0, allocator); + using var values = GetValue(1, allocator); + visitor(keys, values); } private unsafe void FillStringTensorElement(char* strPtr, int strLength, int index) @@ -973,6 +1294,8 @@ protected virtual void Dispose(bool disposing) { _memHandle?.Dispose(); _memHandle = null; + _compositeMembers?.Dispose(); + _compositeMembers = null; } Debug.Assert(_handle != IntPtr.Zero); diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj index f44db30afdc98..1c9827c5bac62 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.EndToEndTests/Microsoft.ML.OnnxRuntime.EndToEndTests.csproj @@ -6,11 +6,12 @@ AnyCPU bin\$(Configuration)\ - 1.9.0 + 1.15.0 Microsoft.ML.OnnxRuntime true true true + default True true ..\..\OnnxRuntime.snk @@ -47,9 +48,10 @@ - - - + + + + @@ -102,4 +104,5 @@ + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj index 12fee9b5db52e..ee81ab77432d1 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/Microsoft.ML.OnnxRuntime.Tests.Common.csproj @@ -14,7 +14,7 @@ $(OnnxRuntimeCsharpRoot)\..\cmake\external\onnx - 7.2 + default True true ..\..\OnnxRuntime.snk diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtValueTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtValueTests.cs index 94996ed3c8c78..1b621e2b8e60b 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtValueTests.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtValueTests.cs @@ -120,7 +120,7 @@ public void PopulateAndReadStringTensorViaTensor() } } static void VerifyTensorCreateWithData(OrtValue tensor, TensorElementType dataType, long[] shape, - ReadOnlySpan originalData) where T : struct + ReadOnlySpan originalData) where T : unmanaged { // Verify invocation var dataTypeInfo = TensorBase.GetTypeInfo(typeof(T)); @@ -172,7 +172,7 @@ public void CreateTensorOverManagedBuffer() // The tensor will be created on top of the managed memory. No copy is made. // The memory should stay pinned until the OrtValue instance is disposed. This means // stayed pinned until the end of Run() method when you are actually running inference. 
- using(var tensor = OrtValue.CreateTensorValueFromMemory(data, shape)) + using (var tensor = OrtValue.CreateTensorValueFromMemory(data, shape)) { VerifyTensorCreateWithData(tensor, TensorElementType.Int32, shape, data); } @@ -215,7 +215,7 @@ public void CreateTensorOverUnmangedBuffer() } } - private static void PopulateAndCheck(T[] data) where T : struct + private static void PopulateAndCheck(T[] data) where T : unmanaged { var typeInfo = TensorBase.GetTypeInfo(typeof(T)); Assert.NotNull(typeInfo); @@ -255,80 +255,92 @@ public void CreateAllocatedTensor() private static readonly long[] ml_data_2 = { 3, 4 }; // Use this utility method to create two tensors for Map and Sequence tests - private static Tuple CreateTwoTensors(IList cleanup) + private static void CreateTwoTensors(out OrtValue val1, out OrtValue val2) { const int ml_data_dim = 2; // For map tensors they must be single dimensional long[] shape = { ml_data_dim }; - unsafe - { - var ortValue_1 = OrtValue.CreateTensorValueFromMemory(ml_data_1, shape); - cleanup.Add(ortValue_1); - var ortValue_2 = OrtValue.CreateTensorValueFromMemory(ml_data_2, shape); - cleanup.Add(ortValue_2); - return Tuple.Create(ortValue_1, ortValue_2); - } + val1 = OrtValue.CreateTensorValueFromMemory(ml_data_1, shape); + val2 = OrtValue.CreateTensorValueFromMemory(ml_data_2, shape); } - [Fact(DisplayName = "CreateMap")] - public void CreateMap() + [Fact(DisplayName = "CreateMapFromValues")] + public void CreateMapFromValues() { - using (var cleanUp = new DisposableListTest()) - { - var valTuple = CreateTwoTensors(cleanUp); - using (var map = OrtValue.CreateMap(valTuple.Item1, valTuple.Item2)) - { - Assert.Equal(OnnxValueType.ONNX_TYPE_MAP, map.OnnxType); - var typeInfo = map.GetTypeInfo(); - var mapInfo = typeInfo.MapTypeInfo; - Assert.Equal(TensorElementType.Int64, mapInfo.KeyType); - Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, mapInfo.ValueType.OnnxType); - - // Must return always 2 for map since we have two ort values - Assert.Equal(2, map.GetValueCount()); - - var keys = map.GetValue(0, OrtAllocator.DefaultInstance); - cleanUp.Add(keys); - Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, keys.OnnxType); - Assert.Equal(ml_data_1, keys.GetTensorDataAsSpan().ToArray()); - - var vals = map.GetValue(1, OrtAllocator.DefaultInstance); - cleanUp.Add(vals); - Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, vals.OnnxType); - Assert.Equal(ml_data_2, vals.GetTensorDataAsSpan().ToArray()); - } - } + CreateTwoTensors(out OrtValue keys, out OrtValue values); + using var map = OrtValue.CreateMap(ref keys, ref values); + Assert.Equal(OnnxValueType.ONNX_TYPE_MAP, map.OnnxType); + var typeInfo = map.GetTypeInfo(); + var mapInfo = typeInfo.MapTypeInfo; + Assert.Equal(TensorElementType.Int64, mapInfo.KeyType); + Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, mapInfo.ValueType.OnnxType); + + // Must return always 2 for map since we have two ort values + Assert.Equal(2, map.GetValueCount()); + + map.ProcessMap((keys, values) => { + Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, keys.OnnxType); + Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, values.OnnxType); + Assert.Equal(ml_data_1, keys.GetTensorDataAsSpan().ToArray()); + Assert.Equal(ml_data_2, values.GetTensorDataAsSpan().ToArray()); + + }, OrtAllocator.DefaultInstance); + } + + [Fact(DisplayName = "CreateMapFromArraysUnmanaged")] + public void CreateMapFromArraysUnmanaged() + { + long[] keys = { 1, 2, 3 }; + float[] vals = { 1, 2, 3 }; + using var map = OrtValue.CreateMap(keys, vals); + } + + [Fact(DisplayName = 
"CreateMapWithStringKeys")] + public void CreateMapWithStringKeys() + { + string[] keys = { "one", "two", "three" }; + float[] vals = { 1, 2, 3 }; + using var map = OrtValue.CreateMapWithStringKeys(keys, vals); + } + + [Fact(DisplayName = "CreateMapWithStringValues")] + public void CreateMapWithStringValues() + { + long[] keys = { 1, 2, 3 }; + string[] values = { "one", "two", "three" }; + using var map = OrtValue.CreateMapWithStringValues(keys, values); } [Fact(DisplayName = "CreateSequence")] public void CreateSequence() { - using (var cleanUp = new DisposableListTest()) + CreateTwoTensors(out OrtValue val1, out OrtValue val2); + using var seqVals = new DisposableListTest { val1, val2 }; + using var seq = OrtValue.CreateSequence(seqVals); + + Assert.Equal(OnnxValueType.ONNX_TYPE_SEQUENCE, seq.OnnxType); + var typeInfo = seq.GetTypeInfo(); + var seqInfo = typeInfo.SequenceTypeInfo; + Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, seqInfo.ElementType.OnnxType); + + // Will return 2 because we put 2 values in the sequence + Assert.Equal(2, seq.GetValueCount()); + + // Visit each element in the sequence + seq.ProcessSequence((ortValue, index) => { - var valTuple = CreateTwoTensors(cleanUp); - OrtValue[] seqVals = { valTuple.Item1, valTuple.Item2 }; - using (var seq = OrtValue.CreateSequence(seqVals)) + // We know both elements are tensors of long + Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, ortValue.OnnxType); + if (index == 0) { - Assert.Equal(OnnxValueType.ONNX_TYPE_SEQUENCE, seq.OnnxType); - var typeInfo = seq.GetTypeInfo(); - var seqInfo = typeInfo.SequenceTypeInfo; - Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, seqInfo.ElementType.OnnxType); - - // Will return 2 because we put 2 values in the sequence - Assert.Equal(2, seq.GetValueCount()); - - var item_0 = seq.GetValue(0, OrtAllocator.DefaultInstance); - cleanUp.Add(item_0); - Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, item_0.OnnxType); - Assert.Equal(ml_data_1, item_0.GetTensorDataAsSpan().ToArray()); - - var item_1 = seq.GetValue(1, OrtAllocator.DefaultInstance); - cleanUp.Add(item_1); - Assert.Equal(OnnxValueType.ONNX_TYPE_TENSOR, item_1.OnnxType); - Assert.Equal(ml_data_2, item_1.GetTensorDataAsSpan().ToArray()); + Assert.Equal(ml_data_1, ortValue.GetTensorDataAsSpan().ToArray()); } - } + else + { + Assert.Equal(ml_data_2, ortValue.GetTensorDataAsSpan().ToArray()); + } + }, OrtAllocator.DefaultInstance); } } } \ No newline at end of file diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TestDataLoader.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TestDataLoader.cs index 548f7cf238cf5..d9843f1788294 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TestDataLoader.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/TestDataLoader.cs @@ -16,12 +16,16 @@ public class DisposableListTest : List, IDisposableReadOnlyCollection where T : IDisposable { public DisposableListTest() - {} + { } + + public DisposableListTest(IEnumerable enumerable) : base(enumerable) + { } + public DisposableListTest(int count) : base(count) - {} + { } -#region IDisposable Support + #region IDisposable Support private bool disposedValue = false; // To detect redundant calls protected virtual void Dispose(bool disposing) @@ -54,7 +58,7 @@ public void Dispose() Dispose(true); GC.SuppressFinalize(this); } -#endregion + #endregion } internal struct DisposableTestPair : IDisposable @@ -218,26 +222,26 @@ internal static NamedOnnxValue LoadOnnxValueFromFilePb(string fullFilename, stri switch (nodeMeta.OnnxValueType) { case 
OnnxValueType.ONNX_TYPE_TENSOR: - { - var tensor = Onnx.TensorProto.Parser.ParseFrom(file); - return LoadTensorPb(tensor, nodeName, nodeMeta); - } + { + var tensor = Onnx.TensorProto.Parser.ParseFrom(file); + return LoadTensorPb(tensor, nodeName, nodeMeta); + } case OnnxValueType.ONNX_TYPE_SEQUENCE: - { - var sequence = Onnx.SequenceProto.Parser.ParseFrom(file); - return CreateNamedOnnxValueFromSequence(sequence, nodeName, nodeMeta); - } + { + var sequence = Onnx.SequenceProto.Parser.ParseFrom(file); + return CreateNamedOnnxValueFromSequence(sequence, nodeName, nodeMeta); + } case OnnxValueType.ONNX_TYPE_MAP: - { - throw new NotImplementedException( - "Map test data format requires clarification: https://github.com/onnx/onnx/issues/5072"); - } + { + throw new NotImplementedException( + "Map test data format requires clarification: https://github.com/onnx/onnx/issues/5072"); + } case OnnxValueType.ONNX_TYPE_OPTIONAL: - { - var opt = Onnx.OptionalProto.Parser.ParseFrom(file); - return CreateNamedOnnxValueFromOptional(opt, nodeName, nodeMeta); - } + { + var opt = Onnx.OptionalProto.Parser.ParseFrom(file); + return CreateNamedOnnxValueFromOptional(opt, nodeName, nodeMeta); + } default: throw new NotImplementedException($"Unable to load value type: {nodeMeta.OnnxValueType} not implemented"); } @@ -254,26 +258,26 @@ internal static DisposableTestPair LoadOrtValueFromFilePb(string fullF switch (nodeMeta.OnnxValueType) { case OnnxValueType.ONNX_TYPE_TENSOR: - { - var tensor = Onnx.TensorProto.Parser.ParseFrom(file); - return new DisposableTestPair(nodeName, LoadOrValueTensorPb(tensor, nodeName, nodeMeta)); - } + { + var tensor = Onnx.TensorProto.Parser.ParseFrom(file); + return new DisposableTestPair(nodeName, LoadOrValueTensorPb(tensor, nodeName, nodeMeta)); + } case OnnxValueType.ONNX_TYPE_SEQUENCE: - { - var sequence = Onnx.SequenceProto.Parser.ParseFrom(file); - return new DisposableTestPair(nodeName, CreateOrtValueFromSequence(sequence, nodeName, nodeMeta)); - } + { + var sequence = Onnx.SequenceProto.Parser.ParseFrom(file); + return new DisposableTestPair(nodeName, CreateOrtValueFromSequence(sequence, nodeName, nodeMeta)); + } case OnnxValueType.ONNX_TYPE_MAP: - { - throw new NotImplementedException( - "Map test data format requires clarification: https://github.com/onnx/onnx/issues/5072"); - } + { + throw new NotImplementedException( + "Map test data format requires clarification: https://github.com/onnx/onnx/issues/5072"); + } case OnnxValueType.ONNX_TYPE_OPTIONAL: - { - var opt = Onnx.OptionalProto.Parser.ParseFrom(file); - return new DisposableTestPair(nodeName, CreateOrtValueFromOptional(opt, nodeName, nodeMeta)); - } + { + var opt = Onnx.OptionalProto.Parser.ParseFrom(file); + return new DisposableTestPair(nodeName, CreateOrtValueFromOptional(opt, nodeName, nodeMeta)); + } default: throw new NotImplementedException($"Unable to load value type: {nodeMeta.OnnxValueType} not implemented"); } @@ -309,50 +313,50 @@ internal static NamedOnnxValue CreateNamedOnnxValueFromSequence(Onnx.SequencePro switch (seqElemType) { case Onnx.SequenceProto.Types.DataType.Tensor: - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_TENSOR); - var sequenceOfTensors = new List(sequence.TensorValues.Count); - foreach (var tensor in sequence.TensorValues) { - var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); - var namedOnnxValue = LoadTensorPb(tensor, elemName, elemMeta); - sequenceOfTensors.Add(namedOnnxValue); + 
SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_TENSOR); + var sequenceOfTensors = new List(sequence.TensorValues.Count); + foreach (var tensor in sequence.TensorValues) + { + var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); + var namedOnnxValue = LoadTensorPb(tensor, elemName, elemMeta); + sequenceOfTensors.Add(namedOnnxValue); + } + return NamedOnnxValue.CreateFromSequence(nodeName, sequenceOfTensors); } - return NamedOnnxValue.CreateFromSequence(nodeName, sequenceOfTensors); - } case Onnx.SequenceProto.Types.DataType.Sequence: - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_SEQUENCE); - var seqOfSequences = new List(sequence.SequenceValues.Count); - foreach (var s in sequence.SequenceValues) { - var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); - seqOfSequences.Add(CreateNamedOnnxValueFromSequence(s, elemName, elemMeta)); + SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_SEQUENCE); + var seqOfSequences = new List(sequence.SequenceValues.Count); + foreach (var s in sequence.SequenceValues) + { + var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); + seqOfSequences.Add(CreateNamedOnnxValueFromSequence(s, elemName, elemMeta)); + } + return NamedOnnxValue.CreateFromSequence(nodeName, seqOfSequences); } - return NamedOnnxValue.CreateFromSequence(nodeName, seqOfSequences); - } case Onnx.SequenceProto.Types.DataType.Map: - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_MAP); - var seqOfMaps = new List(sequence.MapValues.Count); - foreach (var m in sequence.MapValues) { - var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); - seqOfMaps.Add(CreateNamedOnnxValueFromMap(m, elemName, elemMeta)); + SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_MAP); + var seqOfMaps = new List(sequence.MapValues.Count); + foreach (var m in sequence.MapValues) + { + var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); + seqOfMaps.Add(CreateNamedOnnxValueFromMap(m, elemName, elemMeta)); + } + return NamedOnnxValue.CreateFromSequence(nodeName, seqOfMaps); } - return NamedOnnxValue.CreateFromSequence(nodeName, seqOfMaps); - } case Onnx.SequenceProto.Types.DataType.Optional: - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_OPTIONAL); - var seqOfOpts = new List(sequence.OptionalValues.Count); - foreach (var opt in sequence.OptionalValues) { - var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); - seqOfOpts.Add(CreateNamedOnnxValueFromOptional(opt, elemName, elemMeta)); + SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_OPTIONAL); + var seqOfOpts = new List(sequence.OptionalValues.Count); + foreach (var opt in sequence.OptionalValues) + { + var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); + seqOfOpts.Add(CreateNamedOnnxValueFromOptional(opt, elemName, elemMeta)); + } + return NamedOnnxValue.CreateFromSequence(nodeName, seqOfOpts); } - return NamedOnnxValue.CreateFromSequence(nodeName, seqOfOpts); - } default: throw new NotImplementedException($"Sequence test data loading does not support element type: " + $"'{seqElemType}'"); @@ -370,20 +374,20 @@ internal static NamedOnnxValue CreateNamedOnnxValueFromOptional(Onnx.OptionalPro switch ((Onnx.OptionalProto.Types.DataType)optional.ElemType) { case Onnx.OptionalProto.Types.DataType.Tensor: - { - var tensor = 
optional.TensorValue; - return LoadTensorPb(tensor, nodeName, meta); - } + { + var tensor = optional.TensorValue; + return LoadTensorPb(tensor, nodeName, meta); + } case Onnx.OptionalProto.Types.DataType.Sequence: - { - var sequence = optional.SequenceValue; - return CreateNamedOnnxValueFromSequence(sequence, nodeName, meta); - } + { + var sequence = optional.SequenceValue; + return CreateNamedOnnxValueFromSequence(sequence, nodeName, meta); + } case Onnx.OptionalProto.Types.DataType.Map: - { - var map = optional.MapValue; - return CreateNamedOnnxValueFromMap(map, nodeName, meta); - } + { + var map = optional.MapValue; + return CreateNamedOnnxValueFromMap(map, nodeName, meta); + } case Onnx.OptionalProto.Types.DataType.Optional: throw new NotImplementedException($"Unable to load '{nodeName}' optional contained within optional"); default: @@ -454,23 +458,21 @@ internal static OrtValue CreateOrtValueFromSequence(Onnx.SequenceProto sequence, switch (seqElemType) { case Onnx.SequenceProto.Types.DataType.Tensor: - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_TENSOR); - using (var sequenceOfTensors = new DisposableListTest(sequence.TensorValues.Count)) { + SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_TENSOR); + using DisposableListTest sequenceOfTensors = new(sequence.TensorValues.Count); foreach (var tensor in sequence.TensorValues) { var element = LoadOrValueTensorPb(tensor, sequence.Name, elemMeta); sequenceOfTensors.Add(element); } + // Will take possession of ortValues in the sequence and will clear this container return OrtValue.CreateSequence(sequenceOfTensors); } - } case Onnx.SequenceProto.Types.DataType.Sequence: // Sequence of sequences - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_SEQUENCE); - using (var seqOfSequences = new DisposableListTest(sequence.TensorValues.Count)) { + SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_SEQUENCE); + using DisposableListTest seqOfSequences = new(sequence.TensorValues.Count); foreach (var s in sequence.SequenceValues) { var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); @@ -479,17 +481,15 @@ internal static OrtValue CreateOrtValueFromSequence(Onnx.SequenceProto sequence, } return OrtValue.CreateSequence(seqOfSequences); } - } case Onnx.SequenceProto.Types.DataType.Map: - { - throw new NotImplementedException( - "Test data format for maps is under investigation"); - } + { + throw new NotImplementedException( + "Test data format for maps is under investigation"); + } case Onnx.SequenceProto.Types.DataType.Optional: - { - SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_OPTIONAL); - using (var seqOfSequences = new DisposableListTest(sequence.TensorValues.Count)) { + SequenceCheckMatchOnnxType(nodeName, sequenceMeta, OnnxValueType.ONNX_TYPE_OPTIONAL); + using DisposableListTest seqOfSequences = new(sequence.TensorValues.Count); foreach (var opt in sequence.OptionalValues) { var elemName = MakeSequenceElementName(nodeName, sequence.Name, seqNum++); @@ -498,7 +498,6 @@ internal static OrtValue CreateOrtValueFromSequence(Onnx.SequenceProto sequence, } return OrtValue.CreateSequence(seqOfSequences); } - } default: throw new NotImplementedException($"Sequence test data loading does not support element type: " + $"'{seqElemType}'"); @@ -511,20 +510,20 @@ internal static OrtValue CreateOrtValueFromOptional(Onnx.OptionalProto optional, switch 
((Onnx.OptionalProto.Types.DataType)optional.ElemType) { case Onnx.OptionalProto.Types.DataType.Tensor: - { - var tensor = optional.TensorValue; - return LoadOrValueTensorPb(tensor, nodeName, meta); - } + { + var tensor = optional.TensorValue; + return LoadOrValueTensorPb(tensor, nodeName, meta); + } case Onnx.OptionalProto.Types.DataType.Sequence: - { - var sequence = optional.SequenceValue; - return CreateOrtValueFromSequence(sequence, nodeName, meta); - } + { + var sequence = optional.SequenceValue; + return CreateOrtValueFromSequence(sequence, nodeName, meta); + } case Onnx.OptionalProto.Types.DataType.Map: - { - throw new NotImplementedException( - "Test data format for maps is under investigation"); - } + { + throw new NotImplementedException( + "Test data format for maps is under investigation"); + } case Onnx.OptionalProto.Types.DataType.Optional: throw new NotImplementedException($"Unable to load '{nodeName}' optional contained within optional"); default: diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj index 61c3b1079f470..9886f050fbd6b 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj @@ -11,7 +11,7 @@ true $(OnnxSourceDirectory)\onnx - 7.2 + default True true ..\..\OnnxRuntime.snk From a8e776b78bfa0d0b1fec8b34b4545d91c2a9d175 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Fri, 21 Jul 2023 07:14:41 -0400 Subject: [PATCH 15/34] [java] Adds support for fp16 and bf16 tensors (#16703) ### Description The Java API currently only supports fp16 output tensors which it automatically casts to floats on the way out. This PR adds support for creating fp16 and bf16 tensors (from `java.nio.Buffer` objects or as the output of models, creation from Java short arrays is not supported), along with efficient methods for casting `FloatBuffer` into `ShortBuffer` filled with fp16 or bf16 values and vice versa. The fp16 conversions use a trick to pull in the efficient conversion methods added to Java 20, falling back to ports of the MLAS methods otherwise. The Java 20 methods can be special cased by the C2 JIT compiler to emit the single instruction on x86 and ARM which converts fp32<->fp16, or the vectorized versions thereof, so they should be quite a bit faster than the MLAS ported one. ### Motivation and Context fp16 and bf16 are increasingly popular formats and we've had several requests for this functionality. Fixes #7003. 
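For illustration, a minimal sketch of how the new conversion helpers and `OnnxJavaType.FLOAT16` could be used together. The class name and sample values below are hypothetical; the helper names (`OrtUtil.convertFloatBufferToFp16Buffer`, `OnnxTensor.getFloatBuffer`) and the tensor-creation call mirror the additions and the new `OnnxTensorTest` cases in this PR.

```java
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.nio.ShortBuffer;

import ai.onnxruntime.OnnxJavaType;
import ai.onnxruntime.OnnxTensor;
import ai.onnxruntime.OrtEnvironment;
import ai.onnxruntime.OrtException;
import ai.onnxruntime.OrtUtil;

public final class Fp16Example {
  public static void main(String[] args) throws OrtException {
    OrtEnvironment env = OrtEnvironment.getEnvironment();

    // Illustrative fp32 data destined for a model with an fp16 input.
    FloatBuffer floats = FloatBuffer.wrap(new float[] {1.0f, 0.5f, -2.0f, 3.25f});

    // Round the fp32 values to fp16 values stored as shorts using the new helper.
    ShortBuffer fp16 = OrtUtil.convertFloatBufferToFp16Buffer(floats);

    // Copy the shorts into a direct ByteBuffer and build a FLOAT16 tensor from it,
    // the same pattern used by the new unit tests.
    ByteBuffer raw =
        ByteBuffer.allocateDirect(fp16.remaining() * 2).order(ByteOrder.nativeOrder());
    raw.asShortBuffer().put(fp16);
    try (OnnxTensor t =
        OnnxTensor.createTensor(env, raw, new long[] {4}, OnnxJavaType.FLOAT16)) {
      // getFloatBuffer() widens fp16 (and bf16) tensors back to fp32 on the way out.
      FloatBuffer roundTripped = t.getFloatBuffer();
      System.out.println(roundTripped.get(0)); // 1.0
    }
  }
}
```

The bf16 path is analogous via `OrtUtil.convertFloatBufferToBf16Buffer` / `OrtUtil.convertBf16BufferToFloatBuffer` and `OnnxJavaType.BFLOAT16`.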
cc @yuslepukhin @cassiebreviu --------- Co-authored-by: Scott McKay --- .../java/ai/onnxruntime/OnnxJavaType.java | 18 +- .../java/ai/onnxruntime/OnnxSparseTensor.java | 29 +- .../main/java/ai/onnxruntime/OnnxTensor.java | 64 ++-- .../src/main/java/ai/onnxruntime/OrtUtil.java | 273 ++++++++++++++ .../main/java/ai/onnxruntime/TensorInfo.java | 37 +- java/src/main/native/OrtJniUtil.c | 52 ++- java/src/main/native/OrtJniUtil.h | 2 + .../main/native/ai_onnxruntime_OnnxTensor.c | 14 +- .../java/ai/onnxruntime/ModelGenerators.java | 70 ++++ .../java/ai/onnxruntime/OnnxTensorTest.java | 345 ++++++++++++++++++ .../ai/onnxruntime/TensorCreationTest.java | 147 -------- .../src/test/resources/java-bf16-to-fp32.onnx | 14 + .../src/test/resources/java-fp16-to-fp32.onnx | 15 + .../src/test/resources/java-fp32-to-bf16.onnx | 14 + .../src/test/resources/java-fp32-to-fp16.onnx | 16 + 15 files changed, 879 insertions(+), 231 deletions(-) create mode 100644 java/src/test/java/ai/onnxruntime/OnnxTensorTest.java delete mode 100644 java/src/test/java/ai/onnxruntime/TensorCreationTest.java create mode 100644 java/src/test/resources/java-bf16-to-fp32.onnx create mode 100644 java/src/test/resources/java-fp16-to-fp32.onnx create mode 100644 java/src/test/resources/java-fp32-to-bf16.onnx create mode 100644 java/src/test/resources/java-fp32-to-fp16.onnx diff --git a/java/src/main/java/ai/onnxruntime/OnnxJavaType.java b/java/src/main/java/ai/onnxruntime/OnnxJavaType.java index 12b720327f3ea..8c80eb7b6753f 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxJavaType.java +++ b/java/src/main/java/ai/onnxruntime/OnnxJavaType.java @@ -1,12 +1,12 @@ /* - * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. * Licensed under the MIT License. */ package ai.onnxruntime; import ai.onnxruntime.TensorInfo.OnnxTensorType; -/** An enum representing onnxruntime supported Java primitive types (and String). */ +/** An enum representing ONNX Runtime supported Java primitive types (and String). */ public enum OnnxJavaType { FLOAT(1, float.class, 4), DOUBLE(2, double.class, 8), @@ -17,12 +17,18 @@ public enum OnnxJavaType { BOOL(7, boolean.class, 1), STRING(8, String.class, 4), UINT8(9, byte.class, 1), + /** A IEEE 16-bit floating point value. */ + FLOAT16(10, short.class, 2), + /** A non-IEEE 16-bit floating point value, with 8 exponent bits and 7 mantissa bits. 
*/ + BFLOAT16(11, short.class, 2), UNKNOWN(0, Object.class, 0); - private static final OnnxJavaType[] values = new OnnxJavaType[10]; + private static final OnnxJavaType[] values; static { - for (OnnxJavaType ot : OnnxJavaType.values()) { + OnnxJavaType[] tmpValues = OnnxJavaType.values(); + values = new OnnxJavaType[tmpValues.length]; + for (OnnxJavaType ot : tmpValues) { values[ot.value] = ot; } } @@ -76,6 +82,9 @@ public static OnnxJavaType mapFromOnnxTensorType(OnnxTensorType onnxValue) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: return OnnxJavaType.INT64; case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + return OnnxJavaType.FLOAT16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: + return OnnxJavaType.BFLOAT16; case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: return OnnxJavaType.FLOAT; case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: @@ -87,7 +96,6 @@ public static OnnxJavaType mapFromOnnxTensorType(OnnxTensorType onnxValue) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED: case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64: case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128: - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: default: return OnnxJavaType.UNKNOWN; } diff --git a/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java b/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java index 668e6e07ceccd..061738a1bab96 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java +++ b/java/src/main/java/ai/onnxruntime/OnnxSparseTensor.java @@ -1,11 +1,9 @@ /* - * Copyright (c) 2022 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, 2023 Oracle and/or its affiliates. All rights reserved. * Licensed under the MIT License. */ package ai.onnxruntime; -import static ai.onnxruntime.OnnxTensor.fp16ToFloat; - import java.nio.Buffer; import java.nio.ByteBuffer; import java.nio.ByteOrder; @@ -315,26 +313,23 @@ public Buffer getValuesBuffer() { getValuesBuffer(OnnxRuntime.ortApiHandle, nativeHandle).order(ByteOrder.nativeOrder()); switch (info.type) { case FLOAT: - if (info.onnxType == TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { - ShortBuffer shortBuffer = buffer.asShortBuffer(); - int bufferCap = shortBuffer.capacity(); - FloatBuffer output = FloatBuffer.allocate(bufferCap); - for (int i = 0; i < bufferCap; i++) { - output.put(fp16ToFloat(shortBuffer.get(i))); - } - output.rewind(); - return output; - } else if (info.onnxType - == TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16) { - throw new IllegalArgumentException("BFloat16 is not supported."); - } else { - // regular fp32 + { FloatBuffer floatBuf = buffer.asFloatBuffer(); FloatBuffer output = FloatBuffer.allocate(floatBuf.capacity()); output.put(floatBuf); output.rewind(); return output; } + case FLOAT16: + { + ShortBuffer shortBuffer = buffer.asShortBuffer(); + return OrtUtil.convertFp16BufferToFloatBuffer(shortBuffer); + } + case BFLOAT16: + { + ShortBuffer shortBuffer = buffer.asShortBuffer(); + return OrtUtil.convertBf16BufferToFloatBuffer(shortBuffer); + } case DOUBLE: { DoubleBuffer doubleBuf = buffer.asDoubleBuffer(); diff --git a/java/src/main/java/ai/onnxruntime/OnnxTensor.java b/java/src/main/java/ai/onnxruntime/OnnxTensor.java index 5703fb9c48495..0dec29b59a860 100644 --- a/java/src/main/java/ai/onnxruntime/OnnxTensor.java +++ b/java/src/main/java/ai/onnxruntime/OnnxTensor.java @@ -18,6 +18,7 @@ * returned as outputs. 
*/ public class OnnxTensor extends OnnxTensorLike { + /** * This reference is held for OnnxTensors backed by a Java nio buffer to ensure the buffer does * not go out of scope while the OnnxTensor exists. @@ -70,6 +71,12 @@ public Object getValue() throws OrtException { return getBool(OnnxRuntime.ortApiHandle, nativeHandle); case STRING: return getString(OnnxRuntime.ortApiHandle, nativeHandle); + case FLOAT16: + return OrtUtil.fp16ToFloat( + getShort(OnnxRuntime.ortApiHandle, nativeHandle, info.onnxType.value)); + case BFLOAT16: + return OrtUtil.bf16ToFloat( + getShort(OnnxRuntime.ortApiHandle, nativeHandle, info.onnxType.value)); case UNKNOWN: default: throw new OrtException("Extracting the value of an invalid Tensor."); @@ -126,30 +133,28 @@ public ByteBuffer getByteBuffer() { /** * Returns a copy of the underlying OnnxTensor as a FloatBuffer if it can be losslessly converted - * into a float (i.e. it's a float or fp16), otherwise it returns null. + * into a float (i.e. it's a float, fp16 or bf16), otherwise it returns null. * * @return A FloatBuffer copy of the OnnxTensor. */ public FloatBuffer getFloatBuffer() { if (info.type == OnnxJavaType.FLOAT) { - if (info.onnxType == TensorInfo.OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { - // if it's fp16 we need to copy it out by hand. - ShortBuffer buffer = getBuffer().asShortBuffer(); - int bufferCap = buffer.capacity(); - FloatBuffer output = FloatBuffer.allocate(bufferCap); - for (int i = 0; i < bufferCap; i++) { - output.put(fp16ToFloat(buffer.get(i))); - } - output.rewind(); - return output; - } else { - // if it's fp32 use the efficient copy. - FloatBuffer buffer = getBuffer().asFloatBuffer(); - FloatBuffer output = FloatBuffer.allocate(buffer.capacity()); - output.put(buffer); - output.rewind(); - return output; - } + // if it's fp32 use the efficient copy. + FloatBuffer buffer = getBuffer().asFloatBuffer(); + FloatBuffer output = FloatBuffer.allocate(buffer.capacity()); + output.put(buffer); + output.rewind(); + return output; + } else if (info.type == OnnxJavaType.FLOAT16) { + // if it's fp16 we need to copy it out by hand. + ByteBuffer buf = getBuffer(); + ShortBuffer buffer = buf.asShortBuffer(); + return OrtUtil.convertFp16BufferToFloatBuffer(buffer); + } else if (info.type == OnnxJavaType.BFLOAT16) { + // if it's bf16 we need to copy it out by hand. + ByteBuffer buf = getBuffer(); + ShortBuffer buffer = buf.asShortBuffer(); + return OrtUtil.convertBf16BufferToFloatBuffer(buffer); } else { return null; } @@ -174,13 +179,15 @@ public DoubleBuffer getDoubleBuffer() { } /** - * Returns a copy of the underlying OnnxTensor as a ShortBuffer if the underlying type is int16 or - * uint16, otherwise it returns null. + * Returns a copy of the underlying OnnxTensor as a ShortBuffer if the underlying type is int16, + * uint16, fp16 or bf16, otherwise it returns null. * * @return A ShortBuffer copy of the OnnxTensor. */ public ShortBuffer getShortBuffer() { - if (info.type == OnnxJavaType.INT16) { + if ((info.type == OnnxJavaType.INT16) + || (info.type == OnnxJavaType.FLOAT16) + || (info.type == OnnxJavaType.BFLOAT16)) { ShortBuffer buffer = getBuffer().asShortBuffer(); ShortBuffer output = ShortBuffer.allocate(buffer.capacity()); output.put(buffer); @@ -270,19 +277,6 @@ private native void getArray(long apiHandle, long nativeHandle, Object carrier) private native void close(long apiHandle, long nativeHandle); - /** - * Mirrors the conversion in the C code. 
It's not precise if there are subnormal values, nor does - * it preserve all the different kinds of NaNs (which aren't representable in Java anyway). - * - * @param input A uint16_t representing an IEEE half precision float. - * @return A float. - */ - static float fp16ToFloat(short input) { - int output = - ((input & 0x8000) << 16) | (((input & 0x7c00) + 0x1C000) << 13) | ((input & 0x03FF) << 13); - return Float.intBitsToFloat(output); - } - /** * Create a Tensor from a Java primitive, primitive multidimensional array or String * multidimensional array. The shape is inferred from the object using reflection. The default diff --git a/java/src/main/java/ai/onnxruntime/OrtUtil.java b/java/src/main/java/ai/onnxruntime/OrtUtil.java index e6f5bc1f5dd79..d8ade62f620a5 100644 --- a/java/src/main/java/ai/onnxruntime/OrtUtil.java +++ b/java/src/main/java/ai/onnxruntime/OrtUtil.java @@ -1,9 +1,13 @@ /* * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. */ package ai.onnxruntime; +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.invoke.MethodType; import java.lang.reflect.Array; import java.nio.Buffer; import java.nio.ByteBuffer; @@ -15,9 +19,45 @@ import java.nio.ShortBuffer; import java.util.ArrayList; import java.util.Arrays; +import java.util.logging.Level; +import java.util.logging.Logger; /** Util code for interacting with Java arrays. */ public final class OrtUtil { + private static final Logger logger = Logger.getLogger(OrtUtil.class.getName()); + + private static final MethodHandle fp16ToFp32; + private static final MethodHandle fp32ToFp16; + + static { + MethodHandle tmp16 = null; + MethodHandle tmp32 = null; + MethodHandles.Lookup lookup = MethodHandles.lookup(); + try { + // Attempt to lookup the Java 20 fp16 conversion methods which can use SIMD intrinsics. + tmp16 = + lookup.findStatic( + Float.class, "float16ToFloat", MethodType.methodType(float.class, short.class)); + tmp32 = + lookup.findStatic( + Float.class, "floatToFloat16", MethodType.methodType(short.class, float.class)); + } catch (IllegalAccessException | NoSuchMethodException e) { + // Must be on Java 19 or earlier, create handles for our methods. + try { + tmp16 = + lookup.findStatic( + OrtUtil.class, "mlasFp16ToFloat", MethodType.methodType(float.class, short.class)); + tmp32 = + lookup.findStatic( + OrtUtil.class, "mlasFloatToFp16", MethodType.methodType(short.class, float.class)); + } catch (IllegalAccessException | NoSuchMethodException ex) { + // Should not happen + logger.log(Level.SEVERE, "Failed to find fp16 conversion methods on OnnxTensor", e); + } + } + fp16ToFp32 = tmp16; + fp32ToFp16 = tmp32; + } /** Private constructor for static util class. */ private OrtUtil() {} @@ -532,6 +572,8 @@ static BufferTuple prepareBuffer(Buffer data, OnnxJavaType type) { tmp = buffer.put((ByteBuffer) data); break; case INT16: + case FLOAT16: + case BFLOAT16: tmp = buffer.asShortBuffer().put((ShortBuffer) data); break; case INT32: @@ -553,6 +595,237 @@ static BufferTuple prepareBuffer(Buffer data, OnnxJavaType type) { return new BufferTuple(tmp, bufferPos, bufferSize, data.remaining(), tmp != data); } + /** + * Rounds a buffer of floats into a buffer containing fp16 values (stored as shorts in Java). + * + *

Respects the position and limit of the input buffer. + * + * @param buf The buffer of floats. + * @return A buffer of fp16 values stored as shorts. + */ + public static ShortBuffer convertFloatBufferToFp16Buffer(FloatBuffer buf) { + int pos = buf.position(); + int remaining = buf.remaining(); + ShortBuffer output = + ByteBuffer.allocateDirect(remaining * 2).order(ByteOrder.nativeOrder()).asShortBuffer(); + for (int i = 0; i < remaining; i++) { + output.put(i, floatToFp16(buf.get(i + pos))); + } + return output; + } + + /** + * Casts a buffer of fp16 values stored as shorts into a buffer of floats. + * + *

Respects the position and limit of the input buffer. + * + * @param buf The buffer of fp16 values stored as shorts. + * @return A buffer of float values. + */ + public static FloatBuffer convertFp16BufferToFloatBuffer(ShortBuffer buf) { + int pos = buf.position(); + int remaining = buf.remaining(); + FloatBuffer output = + ByteBuffer.allocateDirect(remaining * 4).order(ByteOrder.nativeOrder()).asFloatBuffer(); + for (int i = 0; i < remaining; i++) { + output.put(i, fp16ToFloat(buf.get(i + pos))); + } + return output; + } + + /** + * Rounds a buffer of floats into a buffer containing bf16 values (stored as shorts in Java). + * + *

Respects the position and limit of the input buffer. + * + * @param buf The buffer of floats. + * @return A buffer of bf16 values stored as shorts. + */ + public static ShortBuffer convertFloatBufferToBf16Buffer(FloatBuffer buf) { + int pos = buf.position(); + int remaining = buf.remaining(); + ShortBuffer output = + ByteBuffer.allocateDirect(remaining * 2).order(ByteOrder.nativeOrder()).asShortBuffer(); + for (int i = 0; i < remaining; i++) { + output.put(i, floatToBf16(buf.get(i + pos))); + } + return output; + } + + /** + * Casts a buffer of bf16 values stored as shorts into a buffer of floats. + * + *

Respects the position and limit of the input buffer. + * + * @param buf The buffer of bf16 values stored as shorts. + * @return A buffer of float values. + */ + public static FloatBuffer convertBf16BufferToFloatBuffer(ShortBuffer buf) { + int pos = buf.position(); + int remaining = buf.remaining(); + FloatBuffer output = + ByteBuffer.allocateDirect(remaining * 4).order(ByteOrder.nativeOrder()).asFloatBuffer(); + for (int i = 0; i < remaining; i++) { + output.put(i, bf16ToFloat(buf.get(i + pos))); + } + return output; + } + + /** + * Converts a fp16 value stored in a short into a float value. + * + *

Note on Java 20 or newer this uses {@code Float.float16ToFloat} which may use CPU specific + * instructions for the conversion, otherwise it uses the conversion operation from ORT's native + * implementation. + * + * @param input The fp16 value. + * @return The float value. + */ + public static float fp16ToFloat(short input) { + try { + float ret = (float) fp16ToFp32.invokeExact(input); + return ret; + } catch (Throwable e) { + throw new AssertionError("Should not reach here", e); + } + } + + /** + * Converts a float value into a fp16 value stored in a short. + * + *

Note on Java 20 or newer this uses {@code Float.floatToFloat16} which may use CPU specific + * instructions for the conversion, otherwise it uses the conversion operation from ORT's native + * implementation. + * + * @param input The float value. + * @return The fp16 value. + */ + public static short floatToFp16(float input) { + try { + short ret = (short) fp32ToFp16.invokeExact(input); + return ret; + } catch (Throwable e) { + throw new AssertionError("Should not reach here", e); + } + } + + /** + * Upcasts a fp16 value to a float. Mirrors the conversion in MLAS. + * + * @param input A uint16_t representing an IEEE half precision float. + * @return A float. + */ + static float mlasFp16ToFloat(short input) { + // Port of MLAS_Half2Float from onnxruntime/core/mlas/inc/mlas_float16.h + final int MAGIC = 113 << 23; + // exponent mask after shift + final int SHIFTED_EXP = 0x7c00 << 13; + + // exponent/mantissa bits + int bits = (input & 0x7fff) << 13; + // just the exponent + final int exp = SHIFTED_EXP & bits; + // exponent adjust + bits += (127 - 15) << 23; + + // handle exponent special cases + if (exp == SHIFTED_EXP) { + // Inf/NaN? + // extra exp adjust + bits += (128 - 16) << 23; + } else if (exp == 0) { + // Zero/Denormal? + // extra exp adjust + bits += (1 << 23); + // renormalize + float tmp = Float.intBitsToFloat(bits) - Float.intBitsToFloat(MAGIC); + bits = Float.floatToIntBits(tmp); + } + + // sign bit + bits |= (input & 0x8000) << 16; + + return Float.intBitsToFloat(bits); + } + + /** + * Rounds a float value to fp16. Mirrors the conversion in MLAS. + * + * @param input A float value. + * @return The value rounded to an IEEE half precision value. + */ + static short mlasFloatToFp16(float input) { + // Port of MLAS_Float2Half from onnxruntime/core/mlas/inc/mlas_float16.h + int bits = Float.floatToIntBits(input); + final int F32_INFINITY = Float.floatToIntBits(Float.POSITIVE_INFINITY); + final int F16_MAX = (127 + 16) << 23; + final int DENORM_MAGIC = ((127 - 15) + (23 - 10) + 1) << 23; + final int SIGN_MASK = 0x80000000; + final int ROUNDING_CONST = ((15 - 127) << 23) + 0xfff; + + int sign = bits & SIGN_MASK; + // mask out sign bit + bits ^= sign; + + short output; + if (bits >= F16_MAX) { + // Inf or NaN (all exponent bits set) + output = (bits > F32_INFINITY) ? (short) 0x7e00 : (short) 0x7c00; + } else { + if (bits < (113 << 23)) { + // Subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + float tmp = Float.intBitsToFloat(bits) + Float.intBitsToFloat(DENORM_MAGIC); + + // and one integer subtract of the bias later, we have our final float! + output = (short) (Float.floatToIntBits(tmp) - DENORM_MAGIC); + } else { + int mant_odd = (bits >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + bits += ROUNDING_CONST; + // rounding bias part 2 + bits += mant_odd; + // take the bits! + output = (short) (bits >> 13); + } + } + + // Add the sign back in + output = (short) (output | ((short) (sign >> 16))); + + return output; + } + + /** + * Converts a bf16 value stored in a short into a float value. + * + * @param input A uint16_t representing a bfloat16 value. + * @return A float. + */ + public static float bf16ToFloat(short input) { + int bits = input << 16; + return Float.intBitsToFloat(bits); + } + + /** + * Converts a float into bf16. May not produce correct values for subnormal floats. + * + *

Rounds to nearest even. + * + * @param input The float input. + * @return A bfloat16 value which is closest to the float. + */ + public static short floatToBf16(float input) { + int bits = Float.floatToIntBits(input); + int lsb = (bits >> 16) & 1; + int roundingBias = 0x7fff + lsb; + bits += roundingBias; + return (short) (bits >> 16); + } + static final class BufferTuple { final Buffer data; final int pos; diff --git a/java/src/main/java/ai/onnxruntime/TensorInfo.java b/java/src/main/java/ai/onnxruntime/TensorInfo.java index 613fcd61ea476..aa0d8952bc61d 100644 --- a/java/src/main/java/ai/onnxruntime/TensorInfo.java +++ b/java/src/main/java/ai/onnxruntime/TensorInfo.java @@ -35,12 +35,20 @@ public enum OnnxTensorType { ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128( 15), // complex with float64 real and imaginary components ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16( - 16); // Non-IEEE floating-point format based on IEEE754 single-precision + 16), // Non-IEEE floating-point format based on IEEE754 single-precision + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN( + 17), // Non-IEEE floating-point format based on IEEE754 single-precision + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ( + 18), // Non-IEEE floating-point format based on IEEE754 single-precision + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2( + 19), // Non-IEEE floating-point format based on IEEE754 single-precision + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ( + 20); // Non-IEEE floating-point format based on IEEE754 single-precisio /** The int id on the native side. */ public final int value; - private static final OnnxTensorType[] values = new OnnxTensorType[17]; + private static final OnnxTensorType[] values = new OnnxTensorType[21]; static { for (OnnxTensorType ot : OnnxTensorType.values()) { @@ -92,6 +100,10 @@ public static OnnxTensorType mapFromJavaType(OnnxJavaType type) { return OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL; case STRING: return OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING; + case FLOAT16: + return OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; + case BFLOAT16: + return OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16; case UNKNOWN: default: return OnnxTensorType.ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; @@ -321,15 +333,20 @@ public static TensorInfo constructFromBuffer(Buffer buffer, long[] shape, OnnxJa long bufferRemaining = buffer.remaining(); + // Check if size matches if (elementCount != bufferRemaining) { - throw new OrtException( - "Shape " - + Arrays.toString(shape) - + ", requires " - + elementCount - + " elements but the buffer has " - + bufferRemaining - + " elements."); + // if not it could be a ByteBuffer passed in, so check how many bytes there are + long elemRemaining = bufferRemaining / type.size; + if (elementCount != elemRemaining) { + throw new OrtException( + "Shape " + + Arrays.toString(shape) + + ", requires " + + elementCount + + " elements but the buffer has " + + bufferRemaining + + " elements."); + } } return new TensorInfo( diff --git a/java/src/main/native/OrtJniUtil.c b/java/src/main/native/OrtJniUtil.c index de49edf07b882..879ba8a310618 100644 --- a/java/src/main/native/OrtJniUtil.c +++ b/java/src/main/native/OrtJniUtil.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2022 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2023 Oracle and/or its affiliates. All rights reserved. * Licensed under the MIT License. 
*/ #include @@ -146,6 +146,14 @@ jint convertFromONNXDataFormat(ONNXTensorElementDataType type) { return 15; case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: // Non-IEEE floating-point format based on IEEE754 single-precision return 16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN: + return 17; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ: + return 18; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2: + return 19; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: + return 20; default: return -1; } @@ -190,6 +198,14 @@ ONNXTensorElementDataType convertToONNXDataFormat(jint type) { return ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128; // complex with float64 real and imaginary components case 16: return ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16; // Non-IEEE floating-point format based on IEEE754 single-precision + case 17: + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN; + case 18: + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ; + case 19: + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2; + case 20: + return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ; default: return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; } @@ -200,10 +216,15 @@ size_t onnxTypeSize(ONNXTensorElementDataType type) { case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: // maps to c type uint8_t case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: // maps to c type int8_t case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FNUZ: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ: return 1; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: // maps to c type uint16_t case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: // maps to c type int16_t case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: // Non-IEEE floating-point format based on IEEE754 single-precision return 2; case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: // maps to c type uint32_t case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: // maps to c type int32_t @@ -215,7 +236,6 @@ size_t onnxTypeSize(ONNXTensorElementDataType type) { return 8; case ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING: // maps to c++ type std::string case ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED: - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: // Non-IEEE floating-point format based on IEEE754 single-precision case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64: // complex with float32 real and imaginary components case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128: // complex with float64 real and imaginary components default: @@ -259,6 +279,12 @@ jfloat convertHalfToFloat(const uint16_t half) { return output.floatVal; } +jfloat convertBF16ToFloat(const uint16_t bf16) { + FP32 output; + output.intVal = bf16 << 16; + return output.floatVal; +} + jobject convertToValueInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTypeInfo * info) { ONNXType type = ONNX_TYPE_UNKNOWN; OrtErrorCode code = checkOrtStatus(jniEnv, api, api->GetOnnxTypeFromTypeInfo(info, &type)); @@ -486,6 +512,7 @@ int64_t copyJavaToPrimitiveArray(JNIEnv* jniEnv, ONNXTensorElementDataType onnxT (*jniEnv)->GetLongArrayRegion(jniEnv, typedArr, 0, inputLength, (jlong * )outputTensor); return consumedSize; } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: // Non-IEEE floating-point format based on IEEE754 single-precision case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: { throwOrtException(jniEnv, convertErrorCode(ORT_NOT_IMPLEMENTED), "16-bit float not supported."); return -1; @@ -522,7 +549,6 @@ int64_t 
copyJavaToPrimitiveArray(JNIEnv* jniEnv, ONNXTensorElementDataType onnxT } case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64: // complex with float32 real and imaginary components case ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128: // complex with float64 real and imaginary components - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: // Non-IEEE floating-point format based on IEEE754 single-precision case ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED: default: { throwOrtException(jniEnv, convertErrorCode(ORT_INVALID_ARGUMENT), "Invalid outputTensor element type."); @@ -600,6 +626,21 @@ int64_t copyPrimitiveArrayToJava(JNIEnv *jniEnv, ONNXTensorElementDataType onnxT free(floatArr); return consumedSize; } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: { // stored as a uint16_t + jfloat *floatArr = malloc(sizeof(jfloat) * outputLength); + if (floatArr == NULL) { + throwOrtException(jniEnv, 1, "Not enough memory"); + return -1; + } + uint16_t *bf16Arr = (uint16_t *)inputTensor; + for (int32_t i = 0; i < outputLength; i++) { + floatArr[i] = convertBF16ToFloat(bf16Arr[i]); + } + jfloatArray typedArr = (jfloatArray)outputArray; + (*jniEnv)->SetFloatArrayRegion(jniEnv, typedArr, 0, outputLength, floatArr); + free(floatArr); + return consumedSize; + } case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: { // maps to c type float jfloatArray typedArr = (jfloatArray)outputArray; (*jniEnv)->SetFloatArrayRegion(jniEnv, typedArr, 0, outputLength, (jfloat * )inputTensor); @@ -630,11 +671,6 @@ int64_t copyPrimitiveArrayToJava(JNIEnv *jniEnv, ONNXTensorElementDataType onnxT throwOrtException(jniEnv, convertErrorCode(ORT_NOT_IMPLEMENTED), "Invalid inputTensor element type ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128."); return -1; } - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16: { - // Non-IEEE floating-point format based on IEEE754 single-precision - throwOrtException(jniEnv, convertErrorCode(ORT_NOT_IMPLEMENTED), "Invalid inputTensor element type ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16."); - return -1; - } case ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED: default: { throwOrtException(jniEnv, convertErrorCode(ORT_NOT_IMPLEMENTED), "Invalid inputTensor element type ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED."); diff --git a/java/src/main/native/OrtJniUtil.h b/java/src/main/native/OrtJniUtil.h index 9c3be7135906a..023bc0c739583 100644 --- a/java/src/main/native/OrtJniUtil.h +++ b/java/src/main/native/OrtJniUtil.h @@ -44,6 +44,8 @@ OrtErrorCode getTensorTypeShape(JNIEnv * jniEnv, JavaTensorTypeShape * output, c jfloat convertHalfToFloat(uint16_t half); +jfloat convertBF16ToFloat(uint16_t half); + jobject convertToValueInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTypeInfo * info); jobject convertToTensorInfo(JNIEnv *jniEnv, const OrtApi * api, const OrtTensorTypeAndShapeInfo * info); diff --git a/java/src/main/native/ai_onnxruntime_OnnxTensor.c b/java/src/main/native/ai_onnxruntime_OnnxTensor.c index 1656b4043cfe9..b694f57357bb5 100644 --- a/java/src/main/native/ai_onnxruntime_OnnxTensor.c +++ b/java/src/main/native/ai_onnxruntime_OnnxTensor.c @@ -246,14 +246,7 @@ JNIEXPORT jfloat JNICALL Java_ai_onnxruntime_OnnxTensor_getFloat (void) jobj; // Required JNI parameter not needed by functions which don't need to access their host object. 
const OrtApi* api = (const OrtApi*) apiHandle; ONNXTensorElementDataType onnxType = convertToONNXDataFormat(onnxTypeInt); - if (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) { - uint16_t* arr = NULL; - OrtErrorCode code = checkOrtStatus(jniEnv, api, api->GetTensorMutableData((OrtValue*)handle, (void**)&arr)); - if (code == ORT_OK) { - jfloat floatVal = convertHalfToFloat(*arr); - return floatVal; - } - } else if (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { + if (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT) { jfloat* arr = NULL; OrtErrorCode code = checkOrtStatus(jniEnv, api, api->GetTensorMutableData((OrtValue*)handle, (void**)&arr)); if (code == ORT_OK) { @@ -311,7 +304,10 @@ JNIEXPORT jshort JNICALL Java_ai_onnxruntime_OnnxTensor_getShort (void) jobj; // Required JNI parameter not needed by functions which don't need to access their host object. const OrtApi* api = (const OrtApi*) apiHandle; ONNXTensorElementDataType onnxType = convertToONNXDataFormat(onnxTypeInt); - if ((onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16) || (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16)) { + if ((onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16) || + (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16) || + (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16) || + (onnxType == ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16)) { uint16_t* arr = NULL; OrtErrorCode code = checkOrtStatus(jniEnv, api, api->GetTensorMutableData((OrtValue*)handle, (void**)&arr)); if (code == ORT_OK) { diff --git a/java/src/test/java/ai/onnxruntime/ModelGenerators.java b/java/src/test/java/ai/onnxruntime/ModelGenerators.java index 6dcc4ce7f86ba..90fda4c5cf610 100644 --- a/java/src/test/java/ai/onnxruntime/ModelGenerators.java +++ b/java/src/test/java/ai/onnxruntime/ModelGenerators.java @@ -181,4 +181,74 @@ public void generateMatMul() throws IOException { model.build().writeTo(os); } } + + private static void genCast( + String name, + OnnxMl.TensorProto.DataType inputDataType, + OnnxMl.TensorProto.DataType outputDataType) + throws IOException { + OnnxMl.GraphProto.Builder graph = OnnxMl.GraphProto.newBuilder(); + graph.setName("ort-test-" + name); + + // Add placeholders + OnnxMl.ValueInfoProto.Builder input = OnnxMl.ValueInfoProto.newBuilder(); + input.setName("input"); + OnnxMl.TypeProto inputType = + buildTensorTypeNode(new long[] {-1, 5}, new String[] {"batch_size", null}, inputDataType); + input.setType(inputType); + graph.addInput(input); + OnnxMl.ValueInfoProto.Builder output = OnnxMl.ValueInfoProto.newBuilder(); + output.setName("output"); + OnnxMl.TypeProto outputType = + buildTensorTypeNode(new long[] {-1, 5}, new String[] {"batch_size", null}, outputDataType); + output.setType(outputType); + graph.addOutput(output); + + // Add operations + OnnxMl.NodeProto.Builder cast = OnnxMl.NodeProto.newBuilder(); + cast.setName("cast-0"); + cast.setOpType("Cast"); + cast.addInput("input"); + cast.addOutput("output"); + cast.addAttribute( + OnnxMl.AttributeProto.newBuilder() + .setName("to") + .setType(OnnxMl.AttributeProto.AttributeType.INT) + .setI(outputDataType.getNumber()) + .build()); + graph.addNode(cast); + + // Build model + OnnxMl.ModelProto.Builder model = OnnxMl.ModelProto.newBuilder(); + model.setGraph(graph); + model.setDocString("ORT " + name + " test"); + model.setModelVersion(0); + model.setIrVersion(8); + model.setDomain("ai.onnxruntime.test"); + model.addOpsetImport(OnnxMl.OperatorSetIdProto.newBuilder().setVersion(18).build()); + try (OutputStream os = + Files.newOutputStream( + Paths.get( + "..", "..", 
"..", "java", "src", "test", "resources", "java-" + name + ".onnx"))) { + model.build().writeTo(os); + } + } + + public void generateFp16Fp32Cast() throws IOException { + genCast("fp16-to-fp32", OnnxMl.TensorProto.DataType.FLOAT16, OnnxMl.TensorProto.DataType.FLOAT); + } + + public void generateFp32Fp16Cast() throws IOException { + genCast("fp32-to-fp16", OnnxMl.TensorProto.DataType.FLOAT, OnnxMl.TensorProto.DataType.FLOAT16); + } + + public void generateBf16Fp32Cast() throws IOException { + genCast( + "bf16-to-fp32", OnnxMl.TensorProto.DataType.BFLOAT16, OnnxMl.TensorProto.DataType.FLOAT); + } + + public void generateFp32Bf16Cast() throws IOException { + genCast( + "fp32-to-bf16", OnnxMl.TensorProto.DataType.FLOAT, OnnxMl.TensorProto.DataType.BFLOAT16); + } } diff --git a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java new file mode 100644 index 0000000000000..28ac840c0f6c6 --- /dev/null +++ b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2021, 2023, Oracle and/or its affiliates. All rights reserved. + * Licensed under the MIT License. + */ +package ai.onnxruntime; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.FloatBuffer; +import java.nio.ShortBuffer; +import java.util.Collections; +import java.util.SplittableRandom; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class OnnxTensorTest { + + @Test + public void testScalarCreation() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + String[] stringValues = new String[] {"true", "false"}; + for (String s : stringValues) { + try (OnnxTensor t = OnnxTensor.createTensor(env, s)) { + Assertions.assertEquals(s, t.getValue()); + } + } + + boolean[] boolValues = new boolean[] {true, false}; + for (boolean b : boolValues) { + try (OnnxTensor t = OnnxTensor.createTensor(env, b)) { + Assertions.assertEquals(b, t.getValue()); + } + } + + int[] intValues = + new int[] {-1, 0, 1, 12345678, -12345678, Integer.MAX_VALUE, Integer.MIN_VALUE}; + for (int i : intValues) { + try (OnnxTensor t = OnnxTensor.createTensor(env, i)) { + Assertions.assertEquals(i, t.getValue()); + } + } + + long[] longValues = + new long[] {-1L, 0L, 1L, 12345678L, -12345678L, Long.MAX_VALUE, Long.MIN_VALUE}; + for (long l : longValues) { + try (OnnxTensor t = OnnxTensor.createTensor(env, l)) { + Assertions.assertEquals(l, t.getValue()); + } + } + + float[] floatValues = + new float[] { + -1.0f, + 0.0f, + -0.0f, + 1.0f, + 1234.5678f, + -1234.5678f, + (float) Math.PI, + (float) Math.E, + Float.MAX_VALUE, + Float.MIN_VALUE + }; + for (float f : floatValues) { + try (OnnxTensor t = OnnxTensor.createTensor(env, f)) { + Assertions.assertEquals(f, t.getValue()); + } + } + + double[] doubleValues = + new double[] { + -1.0, + 0.0, + -0.0, + 1.0, + 1234.5678, + -1234.5678, + Math.PI, + Math.E, + Double.MAX_VALUE, + Double.MIN_VALUE + }; + for (double d : doubleValues) { + try (OnnxTensor t = OnnxTensor.createTensor(env, d)) { + Assertions.assertEquals(d, t.getValue()); + } + } + } + + @Test + public void testStringCreation() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + String[] arrValues = new String[] {"this", "is", "a", "single", "dimensional", "string"}; + try (OnnxTensor t = OnnxTensor.createTensor(env, arrValues)) { + Assertions.assertArrayEquals(new long[] {6}, t.getInfo().shape); + String[] output = (String[]) t.getValue(); + 
Assertions.assertArrayEquals(arrValues, output); + } + + String[][] stringValues = + new String[][] {{"this", "is", "a"}, {"multi", "dimensional", "string"}}; + try (OnnxTensor t = OnnxTensor.createTensor(env, stringValues)) { + Assertions.assertArrayEquals(new long[] {2, 3}, t.getInfo().shape); + String[][] output = (String[][]) t.getValue(); + Assertions.assertArrayEquals(stringValues, output); + } + + String[][][] deepStringValues = + new String[][][] { + {{"this", "is", "a"}, {"multi", "dimensional", "string"}}, + {{"with", "lots", "more"}, {"dimensions", "than", "before"}} + }; + try (OnnxTensor t = OnnxTensor.createTensor(env, deepStringValues)) { + Assertions.assertArrayEquals(new long[] {2, 2, 3}, t.getInfo().shape); + String[][][] output = (String[][][]) t.getValue(); + Assertions.assertArrayEquals(deepStringValues, output); + } + } + + @Test + public void testUint8Creation() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + byte[] buf = new byte[] {0, 1}; + ByteBuffer data = ByteBuffer.wrap(buf); + long[] shape = new long[] {2}; + try (OnnxTensor t = OnnxTensor.createTensor(env, data, shape, OnnxJavaType.UINT8)) { + Assertions.assertArrayEquals(buf, (byte[]) t.getValue()); + } + } + + @Test + public void testEmptyTensor() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + FloatBuffer buf = FloatBuffer.allocate(0); + long[] shape = new long[] {4, 0}; + try (OnnxTensor t = OnnxTensor.createTensor(env, buf, shape)) { + Assertions.assertArrayEquals(shape, t.getInfo().getShape()); + float[][] output = (float[][]) t.getValue(); + Assertions.assertEquals(4, output.length); + Assertions.assertEquals(0, output[0].length); + FloatBuffer fb = t.getFloatBuffer(); + Assertions.assertEquals(0, fb.remaining()); + } + shape = new long[] {0, 4}; + try (OnnxTensor t = OnnxTensor.createTensor(env, buf, shape)) { + Assertions.assertArrayEquals(shape, t.getInfo().getShape()); + float[][] output = (float[][]) t.getValue(); + Assertions.assertEquals(0, output.length); + } + } + + @Test + public void testBf16ToFp32() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + String modelPath = TestHelpers.getResourcePath("/java-bf16-to-fp32.onnx").toString(); + SplittableRandom rng = new SplittableRandom(1); + + float[][] input = new float[10][5]; + ByteBuffer buf = ByteBuffer.allocateDirect(2 * 10 * 5).order(ByteOrder.nativeOrder()); + ShortBuffer shortBuf = buf.asShortBuffer(); + + // Generate data + for (int i = 0; i < input.length; i++) { + for (int j = 0; j < input[0].length; j++) { + short bits = (short) rng.nextInt(); + input[i][j] = OrtUtil.bf16ToFloat(bits); + shortBuf.put(bits); + } + } + shortBuf.rewind(); + + try (OrtSession.SessionOptions opts = new OrtSession.SessionOptions(); + OrtSession session = env.createSession(modelPath, opts); + OnnxTensor tensor = + OnnxTensor.createTensor(env, buf, new long[] {10, 5}, OnnxJavaType.BFLOAT16); + OrtSession.Result result = session.run(Collections.singletonMap("input", tensor))) { + OnnxTensor output = (OnnxTensor) result.get(0); + float[][] outputArr = (float[][]) output.getValue(); + for (int i = 0; i < input.length; i++) { + Assertions.assertArrayEquals(input[i], outputArr[i]); + } + } + } + + @Test + public void testFp16ToFp32() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + String modelPath = TestHelpers.getResourcePath("/java-fp16-to-fp32.onnx").toString(); + SplittableRandom rng = new SplittableRandom(1); + + float[][] input = new 
float[10][5]; + ByteBuffer buf = ByteBuffer.allocateDirect(2 * 10 * 5).order(ByteOrder.nativeOrder()); + ShortBuffer shortBuf = buf.asShortBuffer(); + + // Generate data + for (int i = 0; i < input.length; i++) { + for (int j = 0; j < input[0].length; j++) { + short bits = (short) rng.nextInt(); + input[i][j] = OrtUtil.fp16ToFloat(bits); + shortBuf.put(bits); + } + } + shortBuf.rewind(); + + try (OrtSession.SessionOptions opts = new OrtSession.SessionOptions(); + OrtSession session = env.createSession(modelPath, opts); + OnnxTensor tensor = + OnnxTensor.createTensor(env, buf, new long[] {10, 5}, OnnxJavaType.FLOAT16); + OrtSession.Result result = session.run(Collections.singletonMap("input", tensor))) { + OnnxTensor output = (OnnxTensor) result.get(0); + float[][] outputArr = (float[][]) output.getValue(); + for (int i = 0; i < input.length; i++) { + Assertions.assertArrayEquals(input[i], outputArr[i]); + } + } + } + + @Test + public void testFp32ToFp16() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + String modelPath = TestHelpers.getResourcePath("/java-fp32-to-fp16.onnx").toString(); + SplittableRandom rng = new SplittableRandom(1); + + float[][] input = new float[10][5]; + FloatBuffer floatBuf = + ByteBuffer.allocateDirect(4 * 10 * 5).order(ByteOrder.nativeOrder()).asFloatBuffer(); + ShortBuffer shortBuf = ShortBuffer.allocate(10 * 5); + + // Generate data + for (int i = 0; i < input.length; i++) { + for (int j = 0; j < input[0].length; j++) { + int bits = rng.nextInt(); + input[i][j] = Float.intBitsToFloat(bits); + floatBuf.put(input[i][j]); + shortBuf.put(OrtUtil.floatToFp16(input[i][j])); + } + } + floatBuf.rewind(); + shortBuf.rewind(); + + try (OrtSession.SessionOptions opts = new OrtSession.SessionOptions(); + OrtSession session = env.createSession(modelPath, opts); + OnnxTensor tensor = OnnxTensor.createTensor(env, floatBuf, new long[] {10, 5}); + OrtSession.Result result = session.run(Collections.singletonMap("input", tensor))) { + OnnxTensor output = (OnnxTensor) result.get(0); + + // Check outbound Java side cast to fp32 works + FloatBuffer castOutput = output.getFloatBuffer(); + float[] expectedFloatArr = new float[10 * 5]; + OrtUtil.convertFp16BufferToFloatBuffer(shortBuf).get(expectedFloatArr); + float[] actualFloatArr = new float[10 * 5]; + castOutput.get(actualFloatArr); + Assertions.assertArrayEquals(expectedFloatArr, actualFloatArr); + + // Check bits are correct + ShortBuffer outputBuf = output.getShortBuffer(); + short[] expectedShortArr = new short[10 * 5]; + shortBuf.get(expectedShortArr); + short[] actualShortArr = new short[10 * 5]; + outputBuf.get(actualShortArr); + Assertions.assertArrayEquals(expectedShortArr, actualShortArr); + } + } + + @Test + public void testFp32ToBf16() throws OrtException { + OrtEnvironment env = OrtEnvironment.getEnvironment(); + String modelPath = TestHelpers.getResourcePath("/java-fp32-to-bf16.onnx").toString(); + SplittableRandom rng = new SplittableRandom(1); + + float[][] input = new float[10][5]; + FloatBuffer floatBuf = + ByteBuffer.allocateDirect(4 * 10 * 5).order(ByteOrder.nativeOrder()).asFloatBuffer(); + ShortBuffer shortBuf = ShortBuffer.allocate(10 * 5); + + // Generate data + for (int i = 0; i < input.length; i++) { + for (int j = 0; j < input[0].length; j++) { + int bits = rng.nextInt(); + input[i][j] = Float.intBitsToFloat(bits); + floatBuf.put(input[i][j]); + shortBuf.put(OrtUtil.floatToBf16(input[i][j])); + } + } + floatBuf.rewind(); + shortBuf.rewind(); + + try 
(OrtSession.SessionOptions opts = new OrtSession.SessionOptions(); + OrtSession session = env.createSession(modelPath, opts); + OnnxTensor tensor = OnnxTensor.createTensor(env, floatBuf, new long[] {10, 5}); + OrtSession.Result result = session.run(Collections.singletonMap("input", tensor))) { + OnnxTensor output = (OnnxTensor) result.get(0); + + // Check outbound Java side cast to fp32 works + FloatBuffer castOutput = output.getFloatBuffer(); + float[] expectedFloatArr = new float[10 * 5]; + OrtUtil.convertBf16BufferToFloatBuffer(shortBuf).get(expectedFloatArr); + float[] actualFloatArr = new float[10 * 5]; + castOutput.get(actualFloatArr); + Assertions.assertArrayEquals(expectedFloatArr, actualFloatArr); + + // Check bits are correct + ShortBuffer outputBuf = output.getShortBuffer(); + short[] expectedShortArr = new short[10 * 5]; + shortBuf.get(expectedShortArr); + short[] actualShortArr = new short[10 * 5]; + outputBuf.get(actualShortArr); + Assertions.assertArrayEquals(expectedShortArr, actualShortArr); + } + } + + @Test + public void testFp16RoundTrip() { + for (int i = 0; i < 0xffff; i++) { + // Round trip every value + short curVal = (short) (0xffff & i); + float upcast = OrtUtil.mlasFp16ToFloat(curVal); + short output = OrtUtil.mlasFloatToFp16(upcast); + if (!Float.isNaN(upcast)) { + // We coerce NaNs to the same value. + Assertions.assertEquals( + curVal, + output, + "Expected " + curVal + " received " + output + ", intermediate float was " + upcast); + } + } + } + + @Test + public void testBf16RoundTrip() { + for (int i = 0; i < 0xffff; i++) { + // Round trip every value + short curVal = (short) (0xffff & i); + float upcast = OrtUtil.bf16ToFloat(curVal); + short output = OrtUtil.floatToBf16(upcast); + if (!Float.isNaN(upcast)) { + // We coerce NaNs to the same value. + Assertions.assertEquals( + curVal, + output, + "Expected " + curVal + " received " + output + ", intermediate float was " + upcast); + } + } + } +} diff --git a/java/src/test/java/ai/onnxruntime/TensorCreationTest.java b/java/src/test/java/ai/onnxruntime/TensorCreationTest.java deleted file mode 100644 index bd3209279f11a..0000000000000 --- a/java/src/test/java/ai/onnxruntime/TensorCreationTest.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2021, 2023, Oracle and/or its affiliates. All rights reserved. - * Licensed under the MIT License. 
- */ -package ai.onnxruntime; - -import java.nio.ByteBuffer; -import java.nio.FloatBuffer; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -public class TensorCreationTest { - - @Test - public void testScalarCreation() throws OrtException { - OrtEnvironment env = OrtEnvironment.getEnvironment(); - String[] stringValues = new String[] {"true", "false"}; - for (String s : stringValues) { - try (OnnxTensor t = OnnxTensor.createTensor(env, s)) { - Assertions.assertEquals(s, t.getValue()); - } - } - - boolean[] boolValues = new boolean[] {true, false}; - for (boolean b : boolValues) { - try (OnnxTensor t = OnnxTensor.createTensor(env, b)) { - Assertions.assertEquals(b, t.getValue()); - } - } - - int[] intValues = - new int[] {-1, 0, 1, 12345678, -12345678, Integer.MAX_VALUE, Integer.MIN_VALUE}; - for (int i : intValues) { - try (OnnxTensor t = OnnxTensor.createTensor(env, i)) { - Assertions.assertEquals(i, t.getValue()); - } - } - - long[] longValues = - new long[] {-1L, 0L, 1L, 12345678L, -12345678L, Long.MAX_VALUE, Long.MIN_VALUE}; - for (long l : longValues) { - try (OnnxTensor t = OnnxTensor.createTensor(env, l)) { - Assertions.assertEquals(l, t.getValue()); - } - } - - float[] floatValues = - new float[] { - -1.0f, - 0.0f, - -0.0f, - 1.0f, - 1234.5678f, - -1234.5678f, - (float) Math.PI, - (float) Math.E, - Float.MAX_VALUE, - Float.MIN_VALUE - }; - for (float f : floatValues) { - try (OnnxTensor t = OnnxTensor.createTensor(env, f)) { - Assertions.assertEquals(f, t.getValue()); - } - } - - double[] doubleValues = - new double[] { - -1.0, - 0.0, - -0.0, - 1.0, - 1234.5678, - -1234.5678, - Math.PI, - Math.E, - Double.MAX_VALUE, - Double.MIN_VALUE - }; - for (double d : doubleValues) { - try (OnnxTensor t = OnnxTensor.createTensor(env, d)) { - Assertions.assertEquals(d, t.getValue()); - } - } - } - - @Test - public void testStringCreation() throws OrtException { - OrtEnvironment env = OrtEnvironment.getEnvironment(); - String[] arrValues = new String[] {"this", "is", "a", "single", "dimensional", "string"}; - try (OnnxTensor t = OnnxTensor.createTensor(env, arrValues)) { - Assertions.assertArrayEquals(new long[] {6}, t.getInfo().shape); - String[] output = (String[]) t.getValue(); - Assertions.assertArrayEquals(arrValues, output); - } - - String[][] stringValues = - new String[][] {{"this", "is", "a"}, {"multi", "dimensional", "string"}}; - try (OnnxTensor t = OnnxTensor.createTensor(env, stringValues)) { - Assertions.assertArrayEquals(new long[] {2, 3}, t.getInfo().shape); - String[][] output = (String[][]) t.getValue(); - Assertions.assertArrayEquals(stringValues, output); - } - - String[][][] deepStringValues = - new String[][][] { - {{"this", "is", "a"}, {"multi", "dimensional", "string"}}, - {{"with", "lots", "more"}, {"dimensions", "than", "before"}} - }; - try (OnnxTensor t = OnnxTensor.createTensor(env, deepStringValues)) { - Assertions.assertArrayEquals(new long[] {2, 2, 3}, t.getInfo().shape); - String[][][] output = (String[][][]) t.getValue(); - Assertions.assertArrayEquals(deepStringValues, output); - } - } - - @Test - public void testUint8Creation() throws OrtException { - OrtEnvironment env = OrtEnvironment.getEnvironment(); - byte[] buf = new byte[] {0, 1}; - ByteBuffer data = ByteBuffer.wrap(buf); - long[] shape = new long[] {2}; - try (OnnxTensor t = OnnxTensor.createTensor(env, data, shape, OnnxJavaType.UINT8)) { - Assertions.assertArrayEquals(buf, (byte[]) t.getValue()); - } - } - - @Test - public void testEmptyTensor() throws OrtException { - 
OrtEnvironment env = OrtEnvironment.getEnvironment(); - FloatBuffer buf = FloatBuffer.allocate(0); - long[] shape = new long[] {4, 0}; - try (OnnxTensor t = OnnxTensor.createTensor(env, buf, shape)) { - Assertions.assertArrayEquals(shape, t.getInfo().getShape()); - float[][] output = (float[][]) t.getValue(); - Assertions.assertEquals(4, output.length); - Assertions.assertEquals(0, output[0].length); - FloatBuffer fb = t.getFloatBuffer(); - Assertions.assertEquals(0, fb.remaining()); - } - shape = new long[] {0, 4}; - try (OnnxTensor t = OnnxTensor.createTensor(env, buf, shape)) { - Assertions.assertArrayEquals(shape, t.getInfo().getShape()); - float[][] output = (float[][]) t.getValue(); - Assertions.assertEquals(0, output.length); - } - } -} diff --git a/java/src/test/resources/java-bf16-to-fp32.onnx b/java/src/test/resources/java-bf16-to-fp32.onnx new file mode 100644 index 0000000000000..e5c3181f530d1 --- /dev/null +++ b/java/src/test/resources/java-bf16-to-fp32.onnx @@ -0,0 +1,14 @@ +"ai.onnxruntime.test2ORT bf16-to-fp32 test:� +( +inputoutputcast-0"Cast* +to�ort-test-bf16-to-fp32Z! +input + +  +batch_size +b" +output + +  +batch_size +B \ No newline at end of file diff --git a/java/src/test/resources/java-fp16-to-fp32.onnx b/java/src/test/resources/java-fp16-to-fp32.onnx new file mode 100644 index 0000000000000..0579dd4e0fc48 --- /dev/null +++ b/java/src/test/resources/java-fp16-to-fp32.onnx @@ -0,0 +1,15 @@ +"ai.onnxruntime.test2ORT fp16-to-fp32 test:� +( +inputoutputcast-0"Cast* +to�ort-test-fp16-to-fp32Z! +input + + +  +batch_size +b" +output + +  +batch_size +B \ No newline at end of file diff --git a/java/src/test/resources/java-fp32-to-bf16.onnx b/java/src/test/resources/java-fp32-to-bf16.onnx new file mode 100644 index 0000000000000..a9d495459f88e --- /dev/null +++ b/java/src/test/resources/java-fp32-to-bf16.onnx @@ -0,0 +1,14 @@ +"ai.onnxruntime.test2ORT fp32-to-bf16 test:� +( +inputoutputcast-0"Cast* +to�ort-test-fp32-to-bf16Z! +input + +  +batch_size +b" +output + +  +batch_size +B \ No newline at end of file diff --git a/java/src/test/resources/java-fp32-to-fp16.onnx b/java/src/test/resources/java-fp32-to-fp16.onnx new file mode 100644 index 0000000000000..8294e53a404d0 --- /dev/null +++ b/java/src/test/resources/java-fp32-to-fp16.onnx @@ -0,0 +1,16 @@ +"ai.onnxruntime.test2ORT fp32-to-fp16 test:� +( +inputoutputcast-0"Cast* +to +�ort-test-fp32-to-fp16Z! +input + +  +batch_size +b" +output + + +  +batch_size +B \ No newline at end of file From d3295f4329d744fe1f8419e1220e123807282b99 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Fri, 21 Jul 2023 09:17:34 -0700 Subject: [PATCH 16/34] [Better Engineering] Fix N802 lint errors in tests (#16788) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * #16789 * __->__ #16788 This change fixes the N802 lint errors by renaming the test case to use snake case. 
--- onnxruntime/python/tools/onnxruntime_test.py | 7 +- .../python/onnxruntime_test_ort_trainer.py | 36 +-- ...e_test_ort_trainer_with_mixed_precision.py | 14 +- .../test/python/onnxruntime_test_python.py | 211 +++++++++--------- .../python/onnxruntime_test_python_azure.py | 8 +- .../python/onnxruntime_test_python_backend.py | 4 +- .../onnxruntime_test_python_backend_mlops.py | 6 +- .../onnxruntime_test_python_cudagraph.py | 2 +- .../python/onnxruntime_test_python_keras.py | 2 +- .../python/onnxruntime_test_python_mlops.py | 8 +- .../onnxruntime_test_python_sparse_matmul.py | 6 +- .../onnxruntime_test_training_unit_tests.py | 17 +- .../test/python/quantization/op_test_utils.py | 2 +- .../python/quantization/test_op_concat.py | 4 +- .../test/python/quantization/test_op_split.py | 4 +- .../test/python/quantization/test_op_where.py | 2 +- .../quantization/test_quantize_static.py | 6 +- .../transformers/test_gemmfastgelu_fusion.py | 8 +- .../python/transformers/test_optimizer.py | 2 +- .../test_parity_decoder_attention.py | 6 +- 20 files changed, 174 insertions(+), 181 deletions(-) diff --git a/onnxruntime/python/tools/onnxruntime_test.py b/onnxruntime/python/tools/onnxruntime_test.py index b4c600cf30c84..c20e055d72720 100644 --- a/onnxruntime/python/tools/onnxruntime_test.py +++ b/onnxruntime/python/tools/onnxruntime_test.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +from __future__ import annotations import argparse import os @@ -29,8 +30,9 @@ } -def generate_feeds(sess, symbolic_dims={}): # noqa: B006 +def generate_feeds(sess, symbolic_dims: dict | None = None): feeds = {} + symbolic_dims = symbolic_dims or {} for input_meta in sess.get_inputs(): # replace any symbolic dimensions shape = [] @@ -67,10 +69,11 @@ def run_model( num_iters=1, debug=None, profile=None, - symbolic_dims={}, # noqa: B006 + symbolic_dims=None, feeds=None, override_initializers=True, ): + symbolic_dims = symbolic_dims or {} if debug: print(f"Pausing execution ready for debugger to attach to pid: {os.getpid()}") print("Press key to continue.") diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py index a3972151dd169..4cf2e5d7f7588 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py +++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py @@ -138,7 +138,7 @@ def create_ort_trainer( return model, model_desc, device -def runBertTrainingTest( # noqa: N802 +def run_bert_training_test( gradient_accumulation_steps, use_mixed_precision, allreduce_post_accumulation, @@ -516,10 +516,10 @@ def run_mnist_training_and_testing(onnx_opset_ver): # noqa: N805 err_msg="test accuracy mismatch", ) - def testMNISTTrainingAndTestingOpset12(self): # noqa: N802 + def test_mnist_training_and_testing_opset12(self): TestOrtTrainer.run_mnist_training_and_testing(onnx_opset_ver=12) - def testMNISTResumeTrainingAndTesting(self): # noqa: N802 + def test_mnist_resume_training_and_testing(self): torch.manual_seed(1) device = torch.device("cuda") @@ -588,7 +588,7 @@ def testMNISTResumeTrainingAndTesting(self): # noqa: N802 err_msg="test accuracy mismatch", ) - def testMNISTStateDict(self): # noqa: N802 + def test_mnist_state_dict(self): torch.manual_seed(1) device = torch.device("cuda") @@ -617,7 +617,7 @@ def testMNISTStateDict(self): # noqa: N802 "bias_buffer", } - def testMNISTSaveAsONNX(self): # noqa: N802 + def 
test_mnist_save_as_onnx(self): torch.manual_seed(1) device = torch.device("cuda") onnx_file_name = "mnist.onnx" @@ -643,7 +643,7 @@ def testMNISTSaveAsONNX(self): # noqa: N802 trainer.save_as_onnx(onnx_file_name) assert os.path.exists(onnx_file_name) - def testMNISTDevice(self): # noqa: N802 + def test_mnist_device(self): torch.manual_seed(1) device = torch.device("cuda") @@ -662,7 +662,7 @@ def testMNISTDevice(self): # noqa: N802 loss, _ = trainer.train_step(data, target, torch.tensor([learningRate])) - def testMNISTInitializerNames(self): # noqa: N802 + def test_mnist_initializer_names(self): torch.manual_seed(1) device = torch.device("cuda") @@ -683,7 +683,7 @@ def testMNISTInitializerNames(self): # noqa: N802 n for n, t in model.named_parameters() } - def testMNISTInitializerNamesWithInternalLoss(self): # noqa: N802 + def test_mnist_initializer_names_with_internal_loss(self): torch.manual_seed(1) device = torch.device("cuda") @@ -711,7 +711,7 @@ def get_lr_this_step(global_step): assert {n.name for n in trainer.onnx_model_.graph.initializer} == {n for n, t in model.named_parameters()} - def testMNISTFrozenWeight(self): # noqa: N802 + def test_mnist_frozen_weight(self): torch.manual_seed(1) device = torch.device("cuda") @@ -738,7 +738,7 @@ def testMNISTFrozenWeight(self): # noqa: N802 fc2_trainstep_2 = trainer.state_dict()["fc2.weight"] assert np.array_equal(fc1_trainstep_1, fc1_trainstep_2) and not np.array_equal(fc2_trainstep_1, fc2_trainstep_2) - def testMNISTTorchBuffer(self): # noqa: N802 + def test_mnist_torch_buffer(self): torch.manual_seed(1) device = torch.device("cuda") @@ -767,7 +767,7 @@ def testMNISTTorchBuffer(self): # noqa: N802 bias_buffer_trainstep_1, bias_buffer_trainstep_2 ) - def testMNISTFrozenWeightCheckpoint(self): # noqa: N802 + def test_mnist_frozen_weight_checkpoint(self): torch.manual_seed(1) device = torch.device("cuda") @@ -806,7 +806,7 @@ def testMNISTFrozenWeightCheckpoint(self): # noqa: N802 loaded_state_dict = trainer.state_dict() assert state_dict.keys() == loaded_state_dict.keys() - def testMNISTTrainingCheckpoint(self): # noqa: N802 + def test_mnist_training_checkpoint(self): torch.manual_seed(1) device = torch.device("cuda") @@ -860,7 +860,7 @@ def testMNISTTrainingCheckpoint(self): # noqa: N802 for key in state_dict: assert np.array_equal(state_dict[key], loaded_state_dict[key]) - def testBertTrainingBasic(self): # noqa: N802 + def test_bert_training_basic(self): expected_losses = [ 11.027887, 11.108191, @@ -872,7 +872,7 @@ def testBertTrainingBasic(self): # noqa: N802 10.920979, ] expected_eval_loss = [10.958977] - actual_losses, actual_eval_loss = runBertTrainingTest( + actual_losses, actual_eval_loss = run_bert_training_test( gradient_accumulation_steps=1, use_mixed_precision=False, allreduce_post_accumulation=False, @@ -894,7 +894,7 @@ def testBertTrainingBasic(self): # noqa: N802 err_msg="evaluation loss mismatch", ) - def testBertTrainingGradientAccumulation(self): # noqa: N802 + def test_bert_training_gradient_accumulation(self): expected_losses = [ 11.027887, 11.108191, @@ -907,7 +907,7 @@ def testBertTrainingGradientAccumulation(self): # noqa: N802 ] expected_eval_loss = [10.958998] - actual_losses, actual_eval_loss = runBertTrainingTest( + actual_losses, actual_eval_loss = run_bert_training_test( gradient_accumulation_steps=4, use_mixed_precision=False, allreduce_post_accumulation=False, @@ -929,7 +929,7 @@ def testBertTrainingGradientAccumulation(self): # noqa: N802 err_msg="evaluation loss mismatch", ) - def 
testBertCheckpointingBasic(self): # noqa: N802 + def test_bert_checkpointing_basic(self): model, _, _ = create_ort_trainer( gradient_accumulation_steps=1, use_mixed_precision=False, @@ -963,7 +963,7 @@ def testBertCheckpointingBasic(self): # noqa: N802 for k, v in loaded_sd.items(): assert torch.all(torch.eq(v, sd[k])) - def testWrapModelLossFnStateDict(self): # noqa: N802 + def test_wrap_model_loss_fn_state_dict(self): torch.manual_seed(1) device = torch.device("cuda") diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py index 53ace2a642652..3b994e6f26710 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py +++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py @@ -4,11 +4,11 @@ import unittest from numpy.testing import assert_allclose, assert_array_equal -from onnxruntime_test_ort_trainer import runBertTrainingTest +from onnxruntime_test_ort_trainer import run_bert_training_test class TestOrtTrainer(unittest.TestCase): - def testBertTrainingMixedPrecision(self): # noqa: N802 + def test_bert_training_mixed_precision(self): expected_losses = [ 11.034248352050781, 11.125300407409668, @@ -21,7 +21,7 @@ def testBertTrainingMixedPrecision(self): # noqa: N802 ] expected_all_finites = [True, True, True, True, True, True, True, True] expected_eval_loss = [10.959012985229492] - actual_losses, actual_all_finites, actual_eval_loss = runBertTrainingTest( + actual_losses, actual_all_finites, actual_eval_loss = run_bert_training_test( gradient_accumulation_steps=1, use_mixed_precision=True, allreduce_post_accumulation=False, @@ -38,7 +38,7 @@ def testBertTrainingMixedPrecision(self): # noqa: N802 err_msg="evaluation loss mismatch", ) - def testBertTrainingMixedPrecisionInternalLossScale(self): # noqa: N802 + def test_bert_training_mixed_precision_internal_loss_scale(self): expected_losses = [ 11.034248352050781, 11.125300407409668, @@ -50,7 +50,7 @@ def testBertTrainingMixedPrecisionInternalLossScale(self): # noqa: N802 10.971782684326172, ] expected_eval_loss = [10.959012985229492] - actual_losses, actual_eval_loss = runBertTrainingTest( + actual_losses, actual_eval_loss = run_bert_training_test( gradient_accumulation_steps=1, use_mixed_precision=True, allreduce_post_accumulation=False, @@ -67,7 +67,7 @@ def testBertTrainingMixedPrecisionInternalLossScale(self): # noqa: N802 err_msg="evaluation loss mismatch", ) - def testBertTrainingGradientAccumulationMixedPrecision(self): # noqa: N802 + def test_bert_training_gradient_accumulation_mixed_precision(self): expected_losses = [ 11.034248352050781, 11.125300407409668, @@ -80,7 +80,7 @@ def testBertTrainingGradientAccumulationMixedPrecision(self): # noqa: N802 ] expected_all_finites = [True, True] expected_eval_loss = [10.95903205871582] - actual_losses, actual_all_finites, actual_eval_loss = runBertTrainingTest( + actual_losses, actual_all_finites, actual_eval_loss = run_bert_training_test( gradient_accumulation_steps=4, use_mixed_precision=True, allreduce_post_accumulation=False, diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index e2e2aa8d850f8..988b5dc07cfa0 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -1,8 +1,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-# pylint: disable=C0116,W0212,R1720,C0114 +from __future__ import annotations import copy +import ctypes import gc import os import pathlib @@ -59,21 +60,21 @@ def run_model_with_input(self, session_object, input_name, input_value, iter_num predict = session_object.run(None, {input_name: input_value})[0] queue.put(max(predict.flatten().tolist())) - def testTvmImported(self): # noqa: N802 + def test_tvm_imported(self): if "TvmExecutionProvider" not in onnxrt.get_available_providers(): return import tvm self.assertTrue(tvm is not None) - def testGetVersionString(self): # noqa: N802 + def test_get_version_string(self): self.assertIsNot(onnxrt.get_version_string(), None) - def testGetBuildInfo(self): # noqa: N802 + def test_get_build_info(self): self.assertIsNot(onnxrt.get_build_info(), None) self.assertIn("Build Info", onnxrt.get_build_info()) - def testModelSerialization(self): # noqa: N802 + def test_model_serialization(self): try: so = onnxrt.SessionOptions() so.log_severity_level = 1 @@ -94,7 +95,7 @@ def testModelSerialization(self): # noqa: N802 else: raise onnxruntime_error - def testModelSerializationWithExternalInitializers(self): # noqa: N802 + def test_model_serialization_with_external_initializers(self): try: so = onnxrt.SessionOptions() so.log_severity_level = 1 @@ -121,7 +122,7 @@ def testModelSerializationWithExternalInitializers(self): # noqa: N802 else: raise onnxruntime_error - def testModelSerializationWithExternalInitializersToDirectory(self): # noqa: N802 + def test_model_serialization_with_external_initializers_to_directory(self): try: so = onnxrt.SessionOptions() so.log_severity_level = 1 @@ -145,7 +146,7 @@ def testModelSerializationWithExternalInitializersToDirectory(self): # noqa: N8 else: raise onnxruntime_error - def testGetProviders(self): # noqa: N802 + def test_get_providers(self): self.assertTrue("CPUExecutionProvider" in onnxrt.get_available_providers()) # get_all_providers() returns the default EP order from highest to lowest. # CPUExecutionProvider should always be last. @@ -153,18 +154,18 @@ def testGetProviders(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) self.assertTrue("CPUExecutionProvider" in sess.get_providers()) - def testEnablingAndDisablingTelemetry(self): # noqa: N802 + def test_enabling_and_disabling_telemetry(self): onnxrt.disable_telemetry_events() # no-op on non-Windows builds # may be no-op on certain Windows builds based on build configuration onnxrt.enable_telemetry_events() - def testDeserializationFromPathObject(self): # noqa: N802 + def test_deserialization_from_path_object(self): # path object is allowed onnxrt.InferenceSession(pathlib.Path(get_name("mul_1.onnx")), providers=available_providers) - def testSetProviders(self): # noqa: N802 + def test_set_providers(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CUDAExecutionProvider"]) # confirm that CUDA Provider is in list of registered providers. @@ -174,7 +175,7 @@ def testSetProviders(self): # noqa: N802 # confirm only CPU Provider is registered now. 
self.assertEqual(["CPUExecutionProvider"], sess.get_providers()) - def testSetProvidersWithOptions(self): # noqa: N802 + def test_set_providers_with_options(self): if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["TensorrtExecutionProvider"]) self.assertIn("TensorrtExecutionProvider", sess.get_providers()) @@ -238,12 +239,9 @@ def testSetProvidersWithOptions(self): # noqa: N802 """ if "CUDAExecutionProvider" in onnxrt.get_available_providers(): - import ctypes - import sys # noqa: F401 + cuda_success = 0 - CUDA_SUCCESS = 0 # noqa: N806 - - def runBaseTest1(): # noqa: N802 + def run_base_test1(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CUDAExecutionProvider"]) self.assertTrue("CUDAExecutionProvider" in sess.get_providers()) @@ -262,7 +260,7 @@ def runBaseTest1(): # noqa: N802 sess.get_providers(), ) - def runBaseTest2(): # noqa: N802 + def run_base_test2(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CUDAExecutionProvider"]) self.assertIn("CUDAExecutionProvider", sess.get_providers()) @@ -343,27 +341,21 @@ def test_get_and_set_option_with_values(option_name, option_values): with self.assertRaises(RuntimeError): sess.set_providers(["CUDAExecutionProvider"], [option]) - def getCudaDeviceCount(): # noqa: N802 - import ctypes - + def get_cuda_device_count(): num_device = ctypes.c_int() result = ctypes.c_int() error_str = ctypes.c_char_p() result = cuda.cuInit(0) result = cuda.cuDeviceGetCount(ctypes.byref(num_device)) - if result != CUDA_SUCCESS: + if result != cuda_success: cuda.cuGetErrorString(result, ctypes.byref(error_str)) print("cuDeviceGetCount failed with error code %d: %s" % (result, error_str.value.decode())) return -1 return num_device.value - def setDeviceIdTest(i): # noqa: N802 - import ctypes - - import onnxruntime as onnxrt - + def set_device_id_test(i): device = ctypes.c_int() result = ctypes.c_int() error_str = ctypes.c_char_p() @@ -376,21 +368,21 @@ def setDeviceIdTest(i): # noqa: N802 sess.get_providers(), ) result = cuda.cuCtxGetDevice(ctypes.byref(device)) - if result != CUDA_SUCCESS: + if result != cuda_success: cuda.cuGetErrorString(result, ctypes.byref(error_str)) - print("cuCtxGetDevice failed with error code %d: %s" % (result, error_str.value.decode())) + print(f"cuCtxGetDevice failed with error code {result}: {error_str.value.decode()}") - self.assertEqual(result, CUDA_SUCCESS) + self.assertEqual(result, cuda_success) self.assertEqual(i, device.value) - def runAdvancedTest(): # noqa: N802 - num_device = getCudaDeviceCount() + def run_advanced_test(): + num_device = get_cuda_device_count() if num_device < 0: return # Configure session to be ready to run on all available cuda devices for i in range(num_device): - setDeviceIdTest(i) + set_device_id_test(i) sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CPUExecutionProvider"]) @@ -410,22 +402,21 @@ def runAdvancedTest(): # noqa: N802 for libname in libnames: try: cuda = ctypes.CDLL(libname) - runBaseTest1() - runBaseTest2() - runAdvancedTest() + run_base_test1() + run_base_test2() + run_advanced_test() except OSError: continue else: break else: - runBaseTest1() - runBaseTest2() - # raise OSError("could not load any of: " + ' '.join(libnames)) + run_base_test1() + run_base_test2() if "ROCMExecutionProvider" in onnxrt.get_available_providers(): - def runRocmOptionsTest(): # noqa: N802 + def run_rocm_options_test(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), 
providers=["ROCMExecutionProvider"]) self.assertIn("ROCMExecutionProvider", sess.get_providers()) options = sess.get_provider_options() @@ -450,22 +441,22 @@ def test_get_and_set_option_with_values(option_name, option_values): test_get_and_set_option_with_values("tunable_op_max_tuning_duration_ms", ["-1", "1"]) - runRocmOptionsTest() + run_rocm_options_test() - def testInvalidSetProviders(self): # noqa: N802 + def test_invalid_set_providers(self): with self.assertRaises(RuntimeError) as context: sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CPUExecutionProvider"]) sess.set_providers(["InvalidProvider"]) self.assertTrue("Unknown Provider Type: InvalidProvider" in str(context.exception)) - def testSessionProviders(self): # noqa: N802 + def test_session_providers(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): # create session from scratch, but constrain it to only use the CPU. sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CPUExecutionProvider"]) self.assertEqual(["CPUExecutionProvider"], sess.get_providers()) - def testGetAndSetTuningResults(self): # noqa: N802 - def getTuningResultsForEp(sess, ep): # without the outer list # noqa: N802 + def test_get_and_set_tuning_results(self): + def get_tuning_results_for_ep(sess, ep): # without the outer list tuning_results = sess.get_tuning_results() self.assertGreaterEqual(len(tuning_results), 1) tuning_results_for_this_ep = [t for t in tuning_results if t.get("ep") == ep] @@ -476,23 +467,23 @@ def getTuningResultsForEp(sess, ep): # without the outer list # noqa: N802 probe_params_sig = "probe_but_not_an_params_signature" probe_value = 10000000 - def copyTuningResultsWithProbe(tr): # noqa: N802 + def copy_tuning_results_with_probe(tr): tr = copy.deepcopy(tr) tr["results"][probe_op_sig] = {probe_params_sig: probe_value} return tr - def assertTuningResultsLoaded(sess, ep): # noqa: N802 - tr = getTuningResultsForEp(sess, ep) + def assert_tuning_results_loaded(sess, ep): + tr = get_tuning_results_for_ep(sess, ep) self.assertIn(probe_op_sig, tr["results"]) self.assertEqual(tr["results"][probe_op_sig], {probe_params_sig: probe_value}) - def assertTuningResultsNotLoaded(sess, ep): # noqa: N802 - tr = getTuningResultsForEp(sess, ep) + def assert_tuning_results_not_loaded(sess, ep): + tr = get_tuning_results_for_ep(sess, ep) self.assertNotIn(probe_op_sig, tr["results"]) - def doTestGetAndSetTuningResults(ep): # noqa: N802 + def do_test_get_and_set_tuning_results(ep): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=[ep]) - tuning_results = getTuningResultsForEp(sess, ep) + tuning_results = get_tuning_results_for_ep(sess, ep) self.assertIn("ep", tuning_results) self.assertIn("results", tuning_results) @@ -501,53 +492,53 @@ def doTestGetAndSetTuningResults(ep): # noqa: N802 self.assertNotIn("NOT_A_VALIDATOR_KEY", tuning_results["validators"]) # invalid EP will be rejected - invalid_unknown_ep = copyTuningResultsWithProbe(tuning_results) + invalid_unknown_ep = copy_tuning_results_with_probe(tuning_results) invalid_unknown_ep["ep"] = "UnknownEP" sess.set_tuning_results([invalid_unknown_ep]) with self.assertRaises(RuntimeError) as context: sess.set_tuning_results([invalid_unknown_ep], error_on_invalid=True) self.assertIn("Cannot find execution provider UnknownEP", str(context.exception)) - assertTuningResultsNotLoaded(sess, ep) + assert_tuning_results_not_loaded(sess, ep) # missing validator key will be rejected - mismatched_validator_key_missing = 
copyTuningResultsWithProbe(tuning_results) + mismatched_validator_key_missing = copy_tuning_results_with_probe(tuning_results) mismatched_validator_key_missing["validators"].pop("ORT_VERSION") sess.set_tuning_results([mismatched_validator_key_missing]) with self.assertRaises(RuntimeError) as context: sess.set_tuning_results([mismatched_validator_key_missing], error_on_invalid=True) self.assertIn("ORT_VERSION", str(context.exception)) self.assertIn("is not provided for validation", str(context.exception)) - assertTuningResultsNotLoaded(sess, ep) + assert_tuning_results_not_loaded(sess, ep) - mismatched_validator_key_extra = copyTuningResultsWithProbe(tuning_results) + mismatched_validator_key_extra = copy_tuning_results_with_probe(tuning_results) mismatched_validator_key_extra["validators"]["NOT_A_VALIDATOR_KEY"] = "NOT_USED" sess.set_tuning_results([mismatched_validator_key_extra]) with self.assertRaises(RuntimeError) as context: sess.set_tuning_results([mismatched_validator_key_extra], error_on_invalid=True) self.assertIn("NOT_A_VALIDATOR_KEY", str(context.exception)) self.assertIn("is unable to consume it", str(context.exception)) - assertTuningResultsNotLoaded(sess, ep) + assert_tuning_results_not_loaded(sess, ep) - validation_failure = copyTuningResultsWithProbe(tuning_results) + validation_failure = copy_tuning_results_with_probe(tuning_results) validation_failure["validators"]["ORT_VERSION"] = "This is not a proper ORT_VERSION value!" sess.set_tuning_results([validation_failure]) with self.assertRaises(RuntimeError) as context: sess.set_tuning_results([validation_failure], error_on_invalid=True) self.assertIn("Failed to load TuningResults", str(context.exception)) self.assertIn("version mismatch", str(context.exception)) - assertTuningResultsNotLoaded(sess, ep) + assert_tuning_results_not_loaded(sess, ep) - loadable = copyTuningResultsWithProbe(tuning_results) + loadable = copy_tuning_results_with_probe(tuning_results) sess.set_tuning_results([loadable], error_on_invalid=True) - assertTuningResultsLoaded(sess, ep) + assert_tuning_results_loaded(sess, ep) if "CUDAExecutionProvider" in onnxrt.get_available_providers(): - doTestGetAndSetTuningResults("CUDAExecutionProvider") + do_test_get_and_set_tuning_results("CUDAExecutionProvider") if "ROCMExecutionProvider" in onnxrt.get_available_providers(): - doTestGetAndSetTuningResults("ROCMExecutionProvider") + do_test_get_and_set_tuning_results("ROCMExecutionProvider") - def testRunModel(self): # noqa: N802 + def test_run_model(self): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=available_providers) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -562,7 +553,7 @@ def testRunModel(self): # noqa: N802 output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModelFromBytes(self): # noqa: N802 + def test_run_model_from_bytes(self): with open(get_name("mul_1.onnx"), "rb") as f: content = f.read() sess = onnxrt.InferenceSession(content, providers=onnxrt.get_available_providers()) @@ -579,7 +570,7 @@ def testRunModelFromBytes(self): # noqa: N802 output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModel2(self): # noqa: N802 + def test_run_model2(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), 
providers=onnxrt.get_available_providers()) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -594,7 +585,7 @@ def testRunModel2(self): # noqa: N802 output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModel2Contiguous(self): # noqa: N802 + def test_run_model2_contiguous(self): sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) x = np.array([[2.0, 1.0], [4.0, 3.0], [6.0, 5.0]], dtype=np.float32)[:, [1, 0]] input_name = sess.get_inputs()[0].name @@ -612,7 +603,7 @@ def testRunModel2Contiguous(self): # noqa: N802 rescontiguous = sess.run([output_name], {input_name: xcontiguous}) np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) - def testRunModelMultipleThreads(self): # noqa: N802 + def test_run_model_multiple_threads(self): # Skip this test for a "pure" DML onnxruntime python wheel. # We keep this test enabled for instances where both DML and CUDA EPs are available # (Windows GPU CI pipeline has this config) - this test will pass because CUDA has higher precedence @@ -669,7 +660,7 @@ def testRunModelMultipleThreads(self): # noqa: N802 while q.qsize() > 0: self.assertEqual(result, q.get()) - def testListAsInput(self): # noqa: N802 + def test_list_as_input(self): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -677,18 +668,18 @@ def testListAsInput(self): # noqa: N802 output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testStringListAsInput(self): # noqa: N802 + def test_string_list_as_input(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) np.testing.assert_equal(x, res[0]) - def testRunDevice(self): # noqa: N802 + def test_run_device(self): device = onnxrt.get_device() self.assertTrue("CPU" in device or "GPU" in device) - def testRunModelSymbolicInput(self): # noqa: N802 + def test_run_model_symbolic_input(self): sess = onnxrt.InferenceSession(get_name("matmul_2.onnx"), providers=available_providers_without_tvm) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -705,7 +696,7 @@ def testRunModelSymbolicInput(self): # noqa: N802 output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testBooleanInputs(self): # noqa: N802 + def test_boolean_inputs(self): sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) a = np.array([[True, True], [False, False]], dtype=bool) b = np.array([[True, False], [True, False]], dtype=bool) @@ -737,7 +728,7 @@ def testBooleanInputs(self): # noqa: N802 res = sess.run([output_name], {a_name: a, b_name: b}) np.testing.assert_equal(output_expected, res[0]) - def testStringInput1(self): # noqa: N802 + def test_string_input1(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = 
np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) @@ -758,7 +749,7 @@ def testStringInput1(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - def testStringInput2(self): # noqa: N802 + def test_string_input2(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["Olá", "你好", "여보세요", "hello"], dtype=str).reshape((2, 2)) @@ -779,7 +770,7 @@ def testStringInput2(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - def testInputBytes(self): # noqa: N802 + def test_input_bytes(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array([b"this", b"is", b"identity", b"test"]).reshape((2, 2)) @@ -800,7 +791,7 @@ def testInputBytes(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0].astype("|S8")) - def testInputObject(self): # noqa: N802 + def test_input_object(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], object).reshape((2, 2)) @@ -821,7 +812,7 @@ def testInputObject(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - def testInputVoid(self): # noqa: N802 + def test_input_void(self): sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) # numpy 1.20+ doesn't automatically pad the bytes based entries in the array when dtype is np.void, # so we use inputs where that is the case @@ -846,7 +837,7 @@ def testInputVoid(self): # noqa: N802 expr = np.array([["must", "have"], ["same", "size"]], dtype=object) np.testing.assert_equal(expr, res[0]) - def testRaiseWrongNumInputs(self): # noqa: N802 + def test_raise_wrong_num_inputs(self): with self.assertRaises(ValueError) as context: sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=onnxrt.get_available_providers()) a = np.array([[True, True], [False, False]], dtype=bool) @@ -855,7 +846,7 @@ def testRaiseWrongNumInputs(self): # noqa: N802 "Required inputs (['input1:0']) are missing from input feed (['input:0'])", str(context.exception) ) - def testModelMeta(self): # noqa: N802 + def test_model_meta(self): model_path = "../models/opset8/test_squeezenet/model.onnx" if not os.path.exists(model_path): return @@ -867,7 +858,7 @@ def testModelMeta(self): # noqa: N802 self.assertEqual("", modelmeta.description) self.assertEqual("", modelmeta.graph_description) - def testProfilerWithSessionOptions(self): # noqa: N802 + def test_profiler_with_session_options(self): so = onnxrt.SessionOptions() so.enable_profiling = True sess = onnxrt.InferenceSession( @@ -888,8 +879,8 @@ def testProfilerWithSessionOptions(self): # noqa: N802 self.assertTrue(tag in lines[i]) self.assertTrue("]" in lines[-1]) - def testProfilerGetStartTimeNs(self): # noqa: N802 - def getSingleSessionProfilingStartTime(): # noqa: N802 + def test_profiler_get_start_time_ns(self): + def get_single_session_profiling_start_time(): so = onnxrt.SessionOptions() so.enable_profiling = True sess = onnxrt.InferenceSession( @@ -900,16 +891,16 @@ def getSingleSessionProfilingStartTime(): # noqa: N802 return sess.get_profiling_start_time_ns() # Get 1st profiling's start time - start_time_1 = getSingleSessionProfilingStartTime() + start_time_1 = 
get_single_session_profiling_start_time() # Get 2nd profiling's start time - start_time_2 = getSingleSessionProfilingStartTime() + start_time_2 = get_single_session_profiling_start_time() # Get 3rd profiling's start time - start_time_3 = getSingleSessionProfilingStartTime() + start_time_3 = get_single_session_profiling_start_time() # Chronological profiling's start time self.assertTrue(start_time_1 <= start_time_2 <= start_time_3) - def testGraphOptimizationLevel(self): # noqa: N802 + def test_graph_optimization_level(self): opt = onnxrt.SessionOptions() # default should be all optimizations optimization self.assertEqual(opt.graph_optimization_level, onnxrt.GraphOptimizationLevel.ORT_ENABLE_ALL) @@ -924,7 +915,7 @@ def testGraphOptimizationLevel(self): # noqa: N802 sess.run([], {"input1:0": a, "input:0": b}) - def testSequenceLength(self): # noqa: N802 + def test_sequence_length(self): sess = onnxrt.InferenceSession(get_name("sequence_length.onnx"), providers=available_providers_without_tvm) x = [ np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], dtype=np.float32).reshape((2, 3)), @@ -945,7 +936,7 @@ def testSequenceLength(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) self.assertEqual(output_expected, res[0]) - def testSequenceConstruct(self): # noqa: N802 + def test_sequence_construct(self): sess = onnxrt.InferenceSession( get_name("sequence_construct.onnx"), providers=available_providers_without_tvm, @@ -977,7 +968,7 @@ def testSequenceConstruct(self): # noqa: N802 np.testing.assert_array_equal(output_expected, res[0]) - def testSequenceInsert(self): # noqa: N802 + def test_sequence_insert(self): opt = onnxrt.SessionOptions() opt.execution_mode = onnxrt.ExecutionMode.ORT_SEQUENTIAL sess = onnxrt.InferenceSession( @@ -1007,13 +998,13 @@ def testSequenceInsert(self): # noqa: N802 ) np.testing.assert_array_equal(output_expected, res[0]) - def testOrtExecutionMode(self): # noqa: N802 + def test_ort_execution_mode(self): opt = onnxrt.SessionOptions() self.assertEqual(opt.execution_mode, onnxrt.ExecutionMode.ORT_SEQUENTIAL) opt.execution_mode = onnxrt.ExecutionMode.ORT_PARALLEL self.assertEqual(opt.execution_mode, onnxrt.ExecutionMode.ORT_PARALLEL) - def testLoadingSessionOptionsFromModel(self): # noqa: N802 + def test_loading_session_options_from_model(self): try: os.environ["ORT_LOAD_CONFIG_FROM_MODEL"] = str(1) sess = onnxrt.InferenceSession( @@ -1044,7 +1035,7 @@ def testLoadingSessionOptionsFromModel(self): # noqa: N802 # Make sure the usage of the feature is disabled after this test os.environ["ORT_LOAD_CONFIG_FROM_MODEL"] = str(0) - def testSessionOptionsAddFreeDimensionOverrideByDenotation(self): # noqa: N802 + def test_session_options_add_free_dimension_override_by_denotation(self): so = onnxrt.SessionOptions() so.add_free_dimension_override_by_denotation("DATA_BATCH", 3) so.add_free_dimension_override_by_denotation("DATA_CHANNEL", 5) @@ -1059,7 +1050,7 @@ def testSessionOptionsAddFreeDimensionOverrideByDenotation(self): # noqa: N802 # Free dims with denotations - "DATA_BATCH" and "DATA_CHANNEL" have values assigned to them. 
self.assertEqual(input_shape, [3, 5, 5]) - def testSessionOptionsAddFreeDimensionOverrideByName(self): # noqa: N802 + def test_session_options_add_free_dimension_override_by_name(self): so = onnxrt.SessionOptions() so.add_free_dimension_override_by_name("Dim1", 4) so.add_free_dimension_override_by_name("Dim2", 6) @@ -1074,14 +1065,14 @@ def testSessionOptionsAddFreeDimensionOverrideByName(self): # noqa: N802 # "Dim1" and "Dim2" have values assigned to them. self.assertEqual(input_shape, [4, 6, 5]) - def testSessionOptionsAddConfigEntry(self): # noqa: N802 + def test_session_options_add_config_entry(self): so = onnxrt.SessionOptions() key = "CONFIG_KEY" val = "CONFIG_VAL" so.add_session_config_entry(key, val) self.assertEqual(so.get_session_config_entry(key), val) - def testInvalidSessionOptionsConfigEntry(self): # noqa: N802 + def test_invalid_session_options_config_entry(self): so = onnxrt.SessionOptions() invalide_key = "INVALID_KEY" with self.assertRaises(RuntimeError) as context: @@ -1090,7 +1081,7 @@ def testInvalidSessionOptionsConfigEntry(self): # noqa: N802 "SessionOptions does not have configuration with key: " + invalide_key in str(context.exception) ) - def testSessionOptionsAddInitializer(self): # noqa: N802 + def test_session_options_add_initializer(self): # Create an initializer and add it to a SessionOptions instance so = onnxrt.SessionOptions() # This initializer is different from the actual initializer in the model for "W" @@ -1116,7 +1107,7 @@ def testSessionOptionsAddInitializer(self): # noqa: N802 ) ) - def testSessionOptionsAddExternalInitializers(self): # noqa: N802 + def test_session_options_add_external_initializers(self): # Create an external initializer data in OrtValue # This initializer will replace the initializer with external data reference in the graph ortvalue_initializer = onnxrt.OrtValue.ortvalue_from_numpy(np.array([0, 0, 1, 1]).astype(np.int64)) @@ -1129,7 +1120,7 @@ def testSessionOptionsAddExternalInitializers(self): # noqa: N802 providers=["CPUExecutionProvider"], ) - def testRegisterCustomOpsLibrary(self): # noqa: N802 + def test_register_custom_ops_library(self): if sys.platform.startswith("win"): shared_library = "custom_op_library.dll" if not os.path.exists(shared_library): @@ -1183,7 +1174,7 @@ def testRegisterCustomOpsLibrary(self): # noqa: N802 custom_op_model, sess_options=so3, providers=available_providers_without_tvm_and_tensorrt ) - def testOrtValue(self): # noqa: N802 + def test_ort_value(self): numpy_arr_input = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) numpy_arr_output = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) @@ -1221,7 +1212,7 @@ def test_session_with_ortvalue_input(ortvalue): # The constructed OrtValue should still be valid after being used in a session self.assertTrue(np.array_equal(ortvalue2.numpy(), numpy_arr_input)) - def testOrtValue_ghIssue9799(self): # noqa: N802 + def test_ort_value_gh_issue9799(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): session = onnxrt.InferenceSession( get_name("identity_9799.onnx"), @@ -1235,7 +1226,7 @@ def testOrtValue_ghIssue9799(self): # noqa: N802 outs = session.run(output_names=["output"], input_feed=upstreams_onnxrt)[0] self.assertTrue(np.allclose(inps, outs)) - def testSparseTensorCooFormat(self): # noqa: N802 + def test_sparse_tensor_coo_format(self): cpu_device = onnxrt.OrtDevice.make("cpu", 0) shape = [9, 9] values = np.array([1.0, 2.0, 3.0], dtype=np.float32) @@ -1302,7 +1293,7 @@ def 
testSparseTensorCooFormat(self): # noqa: N802 with self.assertRaises(RuntimeError): sparse_tensor.to_cuda(cuda_device) - def testSparseTensorCsrFormat(self): # noqa: N802 + def test_sparse_tensor_csr_format(self): cpu_device = onnxrt.OrtDevice.make("cpu", 0) shape = [9, 9] values = np.array([1.0, 2.0, 3.0], dtype=np.float32) @@ -1343,7 +1334,7 @@ def testSparseTensorCsrFormat(self): # noqa: N802 self.assertEqual(cuda_sparse_tensor.dense_shape(), shape) self.assertEqual(cuda_sparse_tensor.data_type(), "sparse_tensor(float)") - def testRunModelWithCudaCopyStream(self): # noqa: N802 + def test_run_model_with_cuda_copy_stream(self): available_providers = onnxrt.get_available_providers() if "CUDAExecutionProvider" not in available_providers: @@ -1365,7 +1356,7 @@ def testRunModelWithCudaCopyStream(self): # noqa: N802 for _iteration in range(100000): session.run(output_names=["output"], input_feed={"shape": shape}) - def testSharedAllocatorUsingCreateAndRegisterAllocator(self): # noqa: N802 + def test_shared_allocator_using_create_and_register_allocator(self): # Create and register an arena based allocator # To create an OrtArenaCfg using non-default parameters, use one of below templates: @@ -1417,7 +1408,7 @@ def testSharedAllocatorUsingCreateAndRegisterAllocator(self): # noqa: N802 providers=onnxrt.get_available_providers(), ) - def testMemoryArenaShrinkage(self): # noqa: N802 + def test_memory_arena_shrinkage(self): if platform.architecture()[0] == "32bit" or "ppc" in platform.machine() or "powerpc" in platform.machine(): # on x86 or ppc builds, the CPU allocator does not use an arena print("Skipping testMemoryArenaShrinkage in 32bit or powerpc platform.") @@ -1450,7 +1441,7 @@ def testMemoryArenaShrinkage(self): # noqa: N802 ) sess2.run([], {input_name: x}, ro2) - def testCheckAndNormalizeProviderArgs(self): # noqa: N802 + def test_check_and_normalize_provider_args(self): from onnxruntime.capi.onnxruntime_inference_collection import check_and_normalize_provider_args valid_providers = ["a", "b", "c"] @@ -1502,7 +1493,7 @@ def check_failure(providers, provider_options): # provider options unsupported mixed specification check_failure([("a", {1: 2})], [{3: 4}]) - def testRegisterCustomEPsLibrary(self): # noqa: N802 + def test_register_custom_e_ps_library(self): from onnxruntime.capi import _pybind_state as C available_eps = C.get_available_providers() @@ -1542,7 +1533,7 @@ def testRegisterCustomEPsLibrary(self): # noqa: N802 ) print("Create session with customize execution provider successfully!") - def testCreateAllocator(self): # noqa: N802 + def test_create_allocator(self): def verify_allocator(allocator, expected_config): for key, val in expected_config.items(): if key == "max_mem": diff --git a/onnxruntime/test/python/onnxruntime_test_python_azure.py b/onnxruntime/test/python/onnxruntime_test_python_azure.py index 24bf928a29cbf..717c263f3987a 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_azure.py +++ b/onnxruntime/test/python/onnxruntime_test_python_azure.py @@ -8,7 +8,7 @@ class TestAmlEndpoint(unittest.TestCase): # test an endpoint of adding floats - def testAddf(self): # noqa: N802 + def test_addf(self): sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-2930.westus2.inference.ml.azure.com") @@ -32,7 +32,7 @@ def testAddf(self): # noqa: N802 np.testing.assert_allclose(z, expected_z, rtol=1e-05, atol=1e-08) # test an endpoint of adding doubles - def 
testAddf8(self): # noqa: N802 + def test_addf8(self): sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-1364.westus2.inference.ml.azure.com") @@ -56,7 +56,7 @@ def testAddf8(self): # noqa: N802 np.testing.assert_allclose(z, expected_z, rtol=1e-05, atol=1e-08) # test an endpoint of adding int - def testAddi4(self): # noqa: N802 + def test_addi4(self): sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-9879.westus2.inference.ml.azure.com") @@ -80,7 +80,7 @@ def testAddi4(self): # noqa: N802 np.testing.assert_allclose(z, expected_z, rtol=1e-05, atol=1e-08) # test an endpoint of "And" - def testAnd(self): # noqa: N802 + def test_and(self): sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-6811.westus2.inference.ml.azure.com") diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index b7fb95f834455..1f6cd78f28334 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -13,7 +13,7 @@ class TestBackend(unittest.TestCase): - def testRunModel(self): # noqa: N802 + def test_run_model(self): name = get_name("mul_1.onnx") rep = backend.prepare(name) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) @@ -21,7 +21,7 @@ def testRunModel(self): # noqa: N802 output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testAllocationPlanWorksWithOnlyExecutePathToFetchesOption(self): # noqa: N802 + def test_allocation_plan_works_with_only_execute_path_to_fetches_option(self): """ (inp0) (inp1) | \\/ | diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index 42103fbbe3bc7..b5400b487cfc2 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -32,7 +32,7 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): class TestBackend(unittest.TestCase): - def testRunModelNonTensor(self): # noqa: N802 + def test_run_model_non_tensor(self): name = get_name("pipeline_vectorize.onnx") rep = backend.prepare(name) x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} @@ -40,7 +40,7 @@ def testRunModelNonTensor(self): # noqa: N802 output_expected = np.array([[49.752754]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModelProto(self): # noqa: N802 + def test_run_model_proto(self): name = datasets.get_example("logreg_iris.onnx") model = load(name) @@ -65,7 +65,7 @@ def testRunModelProto(self): # noqa: N802 check_list_of_map_to_float(self, output_expected, res[1]) - def testRunModelProtoApi(self): # noqa: N802 + def test_run_model_proto_api(self): name = datasets.get_example("logreg_iris.onnx") model = load(name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index a322ebe93ad44..c4e13e773535d 100644 --- 
a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -124,7 +124,7 @@ def run_model_with_cuda_graph(self, providers): atol=1e-05, ) - def testArenaWithCudaGraph(self): # noqa: N802 + def test_arena_with_cuda_graph(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): # To test cuda graph catpure, we set Arena extend strategy to be SameAsRequested so as to detect any # potential memory allocation after the first run. diff --git a/onnxruntime/test/python/onnxruntime_test_python_keras.py b/onnxruntime/test/python/onnxruntime_test_python_keras.py index c24cb6954df98..99d964f03169c 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_keras.py +++ b/onnxruntime/test/python/onnxruntime_test_python_keras.py @@ -43,7 +43,7 @@ def custom_activation(scope, operator, container): class TestInferenceSessionKeras(unittest.TestCase): - def testRunModelConv(self): # noqa: N802 + def test_run_model_conv(self): # keras model N, C, H, W = 2, 3, 5, 5 # noqa: N806 x = np.random.rand(N, H, W, C).astype(np.float32, copy=False) diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py index c9cb9bfbf58aa..6cdf820c8a0e9 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py @@ -13,7 +13,7 @@ class TestInferenceSession(unittest.TestCase): - def testZipMapStringFloat(self): # noqa: N802 + def test_zip_map_string_float(self): sess = onnxrt.InferenceSession( get_name("zipmap_stringfloat.onnx"), providers=onnxrt.get_available_providers(), @@ -37,7 +37,7 @@ def testZipMapStringFloat(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) self.assertEqual(output_expected, res[0]) - def testZipMapInt64Float(self): # noqa: N802 + def test_zip_map_int64_float(self): sess = onnxrt.InferenceSession( get_name("zipmap_int64float.onnx"), providers=onnxrt.get_available_providers(), @@ -58,7 +58,7 @@ def testZipMapInt64Float(self): # noqa: N802 res = sess.run([output_name], {x_name: x}) self.assertEqual(output_expected, res[0]) - def testDictVectorizer(self): # noqa: N802 + def test_dict_vectorizer(self): sess = onnxrt.InferenceSession( get_name("pipeline_vectorize.onnx"), providers=onnxrt.get_available_providers(), @@ -108,7 +108,7 @@ def testDictVectorizer(self): # noqa: N802 output_expected = np.array([[49.752754]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testLabelEncoder(self): # noqa: N802 + def test_label_encoder(self): sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers()) input_name = sess.get_inputs()[0].name self.assertEqual(input_name, "input") diff --git a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py index 4abe799ac89d7..22a09ef565d59 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py +++ b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py @@ -14,7 +14,7 @@ class TestSparseToDenseMatmul(unittest.TestCase): - def testRunSparseOutputOrtValueVector(self): # noqa: N802 + def test_run_sparse_output_ort_value_vector(self): """ Try running models using the new run_with_ort_values sparse_initializer_as_output.onnx - requires no inputs, but only one output @@ -28,7 +28,7 @@ def testRunSparseOutputOrtValueVector(self): # noqa: N802 res = 
sess._sess.run_with_ort_values({}, ["values"], RunOptions()) self.assertIsInstance(res, OrtValueVector) - def testRunSparseOutputOnly(self): # noqa: N802 + def test_run_sparse_output_only(self): """ Try running models using the new run_with_ort_values sparse_initializer_as_output.onnx - requires no inputs, but only one output @@ -52,7 +52,7 @@ def testRunSparseOutputOnly(self): # noqa: N802 self.assertTrue(np.array_equal(values, sparse_output.values())) self.assertTrue(np.array_equal(indices, sparse_output.as_coo_view().indices())) - def testRunContribSparseMatMul(self): # noqa: N802 + def test_run_contrib_sparse_mat_mul(self): """ Mutliple sparse COO tensor to dense """ diff --git a/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py b/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py index 4f322478673ae..08cbc4a5d392d 100644 --- a/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @@ -12,18 +12,17 @@ import onnxruntime from onnxruntime.capi.ort_trainer import IODescription, ModelDescription, ORTTrainer -torch.manual_seed(1) -onnxruntime.set_seed(1) - class TestTrainingDropout(unittest.TestCase): - def testTrainingAndEvalDropout(self): # noqa: N802 - # Temporarily disable this test. - # The graph below will trigger ORT - # to sort backward graph before forward graph which gives incorrect result. - # TODO Re-enable when that is fixed. - return + def setUp(self): + torch.manual_seed(1) + onnxruntime.set_seed(1) + @unittest.skip( + "Temporarily disable this test. The graph below will trigger ORT to " + "sort backward graph before forward graph which gives incorrect result." + ) + def test_training_and_eval_dropout(self): class TwoDropoutNet(nn.Module): def __init__(self, drop_prb_1, drop_prb_2, dim_size): super().__init__() diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py index 81506caab1b19..6d9dba8ddafbd 100644 --- a/onnxruntime/test/python/quantization/op_test_utils.py +++ b/onnxruntime/test/python/quantization/op_test_utils.py @@ -23,7 +23,7 @@ def rewind(self): self.iter_next = iter(self.data_feeds) -def InputFeedsNegOneZeroOne(n, name2shape): # noqa: N802 +def input_feeds_neg_one_zero_one(n, name2shape): """ randomize n feed according to shape, its values are from -1, 0, and 1 """ diff --git a/onnxruntime/test/python/quantization/test_op_concat.py b/onnxruntime/test/python/quantization/test_op_concat.py index 1c2a1fa44defc..5107ae07923b1 100644 --- a/onnxruntime/test/python/quantization/test_op_concat.py +++ b/onnxruntime/test/python/quantization/test_op_concat.py @@ -9,10 +9,10 @@ import numpy as np from onnx import TensorProto, helper, numpy_helper, save from op_test_utils import ( - InputFeedsNegOneZeroOne, check_model_correctness, check_op_type_count, check_qtype_by_node_type, + input_feeds_neg_one_zero_one, ) from onnxruntime.quantization import QuantFormat, QuantType, quantize_static @@ -91,7 +91,7 @@ def quantize_concat_test(self, activation_type, weight_type, extra_options={}): np.random.seed(1) model_fp32_path = "concat_fp32.onnx" self.construct_model(model_fp32_path) - data_reader = InputFeedsNegOneZeroOne(1, {"input": [1, 3, 15, 15]}) + data_reader = input_feeds_neg_one_zero_one(1, {"input": [1, 3, 15, 15]}) activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" diff 
--git a/onnxruntime/test/python/quantization/test_op_split.py b/onnxruntime/test/python/quantization/test_op_split.py index 4a81f134f235d..13803b4d3e662 100644 --- a/onnxruntime/test/python/quantization/test_op_split.py +++ b/onnxruntime/test/python/quantization/test_op_split.py @@ -10,10 +10,10 @@ import onnx from onnx import TensorProto, helper, save from op_test_utils import ( - InputFeedsNegOneZeroOne, check_model_correctness, check_op_type_count, check_qtype_by_node_type, + input_feeds_neg_one_zero_one, ) from onnxruntime.quantization import QuantFormat, QuantType, quantize_static @@ -78,7 +78,7 @@ def quantize_split_test(self, activation_type, weight_type, extra_options={}): np.random.seed(1) model_fp32_path = "split_fp32.onnx" self.construct_model(model_fp32_path) - data_reader = InputFeedsNegOneZeroOne(1, {"input": [6, 3]}) + data_reader = input_feeds_neg_one_zero_one(1, {"input": [6, 3]}) activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" diff --git a/onnxruntime/test/python/quantization/test_op_where.py b/onnxruntime/test/python/quantization/test_op_where.py index 46915d83d9748..4f96283c7d03b 100644 --- a/onnxruntime/test/python/quantization/test_op_where.py +++ b/onnxruntime/test/python/quantization/test_op_where.py @@ -147,7 +147,7 @@ def test_quantize_where_u8u8(self): self.quantize_where_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={"ForceQuantizeNoInputCheck": True}) print(__name__) - def test_quantize_where_u8u8_no_ForceQuantizeNoInputCheck(self): # noqa: N802 + def test_quantize_where_u8u8_no_force_quantize_no_input_check(self): self.quantize_where_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={"ForceQuantizeNoInputCheck": False}) print(__name__) diff --git a/onnxruntime/test/python/quantization/test_quantize_static.py b/onnxruntime/test/python/quantization/test_quantize_static.py index 1fb7ad2e9efa4..4ae2cbe06d46c 100644 --- a/onnxruntime/test/python/quantization/test_quantize_static.py +++ b/onnxruntime/test/python/quantization/test_quantize_static.py @@ -12,7 +12,7 @@ import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import InputFeedsNegOneZeroOne, check_model_correctness, generate_random_initializer +from op_test_utils import check_model_correctness, generate_random_initializer, input_feeds_neg_one_zero_one from onnxruntime.quantization import QuantType, StaticQuantConfig, quantize, quantize_static @@ -72,7 +72,7 @@ def tearDownClass(cls): cls._tmp_model_dir.cleanup() def test_save_as_external(self): - data_reader = InputFeedsNegOneZeroOne(10, {"input": [1, self._channel_size, 1, 3]}) + data_reader = input_feeds_neg_one_zero_one(10, {"input": [1, self._channel_size, 1, 3]}) for use_external_data_format in [True, False]: quant_model_path = str(Path(self._tmp_model_dir.name) / f"quant.{use_external_data_format}.onnx") quantize_static( @@ -89,7 +89,7 @@ def test_save_as_external(self): data_reader.rewind() def test_static_quant_config(self): - data_reader = InputFeedsNegOneZeroOne(10, {"input": [1, self._channel_size, 1, 3]}) + data_reader = input_feeds_neg_one_zero_one(10, {"input": [1, self._channel_size, 1, 3]}) quant_config = StaticQuantConfig(data_reader) quant_model_path = str(Path(self._tmp_model_dir.name) / "quant.config.onnx") quantize(self._model_fp32_path, quant_model_path, quant_config) diff --git a/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py 
b/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py index 3a948705770ee..431ae21cd5eaf 100644 --- a/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py +++ b/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py @@ -43,7 +43,7 @@ def float_tensor(name: str, shape: List[int], random=False): return helper.make_tensor(name, TensorProto.FLOAT, shape, weights) -def create_MatMul_FastGelu_withoutBias(batch_size, m, n, k): # noqa: N802 +def create_mat_mul_fast_gelu_without_bias(batch_size, m, n, k): # MatMul + FastGelu nodes = [ helper.make_node("MatMul", ["input", "matmul_weight"], ["fastgelu_input"], "matmul"), @@ -77,7 +77,7 @@ def create_MatMul_FastGelu_withoutBias(batch_size, m, n, k): # noqa: N802 return helper.make_model(graph) -def create_MatMul_FastGelu_withBias(batch_size, m, n, k): # noqa: N802 +def create_mat_mul_fast_gelu_with_bias(batch_size, m, n, k): # MatMul + FastGelu nodes = [ helper.make_node("MatMul", ["input", "matmul_weight"], ["fastgelu_input"], "matmul"), @@ -122,7 +122,7 @@ def verify_fusion(self, optimized_model, expected_model_filename): self.assertEqual(str(optimized_model.model.graph), str(expected_model.model.graph)) def test_gemmfastgelu_fusion_withoutbias(self): - model = create_MatMul_FastGelu_withoutBias(32, 128, 64, 1024) + model = create_mat_mul_fast_gelu_without_bias(32, 128, 64, 1024) dir = "." model_path = os.path.join(dir, "gemmfastgelu_nobias.onnx") onnx.save(model, model_path) @@ -135,7 +135,7 @@ def test_gemmfastgelu_fusion_withoutbias(self): self.verify_fusion(optimized_model, "gemmfastgelu_nobias_opt.onnx") def test_gemmfastgelu_fusion_withbias(self): - model = create_MatMul_FastGelu_withBias(32, 128, 64, 1024) + model = create_mat_mul_fast_gelu_with_bias(32, 128, 64, 1024) dir = "." 
model_path = os.path.join(dir, "gemmfastgelu_withbias.onnx") onnx.save(model, model_path) diff --git a/onnxruntime/test/python/transformers/test_optimizer.py b/onnxruntime/test/python/transformers/test_optimizer.py index d1fb88c0d0322..eedadfd8d4448 100644 --- a/onnxruntime/test/python/transformers/test_optimizer.py +++ b/onnxruntime/test/python/transformers/test_optimizer.py @@ -311,7 +311,7 @@ def test_huggingface_vit_fusion(self): @unittest.skipUnless(is_tf_available(), "skip TestBertOptimizationTF since tensorflow is not available") class TestTensorflowModelOptimization(unittest.TestCase): - def Setup(self): # noqa: N802 + def setUp(self): try: import tf2onnx # noqa: F401 except ImportError: diff --git a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py index 992a805eb2eb3..15ac5a8b7dd4d 100644 --- a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py @@ -239,7 +239,7 @@ def forward( return attn_output, new_key_cache, new_value_cache - def ORT_forward( # noqa: N802 + def ort_forward( self, query, key: Tensor, @@ -458,7 +458,7 @@ def parity_check( use_past, has_key_padding_mask, ) - attn_output_ort, new_key_cache_ort, new_value_cache_ort = attn.ORT_forward( + attn_output_ort, new_key_cache_ort, new_value_cache_ort = attn.ort_forward( query, key, key_padding_mask, @@ -468,7 +468,7 @@ def parity_check( use_past, has_key_padding_mask, ) - attn_output_ort_1, _, _ = attn.ORT_forward( + attn_output_ort_1, _, _ = attn.ort_forward( query, key, key_padding_mask, From d79515041c21c42470c470c2c0617f495e626f02 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Fri, 21 Jul 2023 12:53:41 -0700 Subject: [PATCH 17/34] [Better Engineering] Bump ruff to 0.0.278 and fix new lint errors (#16789) Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom): * __->__ #16789 Bump ruff to 0.0.278 and fix new lint errors. I added noqa to all existing RUF012 errors which requires mutable class variables to be annotated with `ClassVar`, as well as all PERF issues. 
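For reference, RUF012 flags mutable class-level attributes and asks for a `typing.ClassVar` annotation, while the PERF rules touched here (PERF203, PERF401, PERF402) flag `try`/`except` blocks inside loops and `append()` loops that could be comprehensions or copies. A minimal sketch of what the non-suppressed fixes would look like, using hypothetical names rather than code from this patch:

    from typing import ClassVar, Dict

    class KernelCache:
        # RUF012: annotate the mutable class attribute with ClassVar instead of
        # suppressing the rule with "# noqa: RUF012".
        cache: ClassVar[Dict[str, str]] = {}

    def fp32_param_names(param_names):
        # PERF401: build the list with a comprehension instead of calling
        # append() inside a for loop.
        return [name for name in param_names if not name.endswith("_fp16")]

This change adds `# noqa` suppressions rather than rewriting the flagged code; the sketch above only illustrates what the rules would otherwise ask for.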
Signed-off-by: Justin Chu --- docs/python/_common/onnx_sphinx.py | 2 +- docs/python/examples/plot_common_errors.py | 6 ++-- onnxruntime/python/backend/backend.py | 2 +- .../onnxruntime_inference_collection.py | 7 ++-- .../create_custom_op_wrapper.py | 2 +- .../kernels/batched_gemm_test.py | 2 +- .../kernels/gemm_softmax_gemm_permute_test.py | 2 +- .../kernel_explorer/kernels/gemm_test.py | 2 +- .../kernels/strided_batched_gemm_test.py | 2 +- .../tools/pytorch_export_contrib_ops.py | 2 +- .../python/tools/qnn/add_trans_cast.py | 4 +-- .../python/tools/quantization/calibrate.py | 2 +- .../python/tools/quantization/onnx_model.py | 8 ++--- .../tools/quantization/qdq_loss_debug.py | 2 +- .../tools/quantization/shape_inference.py | 3 +- .../python/tools/symbolic_shape_infer.py | 8 ++--- .../python/tools/tensorrt/perf/benchmark.py | 12 +++---- .../python/tools/tensorrt/perf/post.py | 4 +-- .../perf/setup_scripts/setup_onnx_zoo.py | 2 +- .../python/tools/transformers/benchmark.py | 4 +-- .../tools/transformers/benchmark_helper.py | 4 +-- .../convert_tf_models_to_pytorch.py | 2 +- .../transformers/convert_to_packing_mode.py | 2 +- .../python/tools/transformers/float16.py | 4 +-- .../tools/transformers/fusion_embedlayer.py | 4 +-- .../tools/transformers/fusion_group_norm.py | 2 +- .../transformers/models/bert/eval_squad.py | 2 +- .../models/gpt2/benchmark_gpt2.py | 4 +-- .../transformers/models/gpt2/gpt2_helper.py | 2 +- .../transformers/models/gpt2/gpt2_parity.py | 4 +-- .../models/longformer/benchmark_longformer.py | 4 +-- .../onnxruntime_cuda_txt2img.py | 2 +- .../onnxruntime_tensorrt_txt2img.py | 2 +- .../transformers/models/t5/t5_decoder.py | 8 ++--- .../models/whisper/whisper_decoder.py | 8 ++--- .../python/tools/transformers/onnx_model.py | 35 ++++++++----------- .../tools/transformers/onnx_model_unet.py | 2 +- .../tools/transformers/shape_optimizer.py | 6 ++-- .../test/python/onnxruntime_test_float8.py | 2 +- .../test/python/onnxruntime_test_python.py | 2 +- ...untime_test_python_symbolic_shape_infer.py | 2 +- .../onnxruntime_test_training_unit_tests.py | 3 +- .../python/quantization/test_calibration.py | 2 +- .../quantization/test_qdq_loss_debug.py | 4 +-- .../generate_tiny_keras2onnx_bert_models.py | 2 +- .../generate_tiny_gpt2_model.py | 2 +- .../python/transformers/test_parity_t5_mha.py | 4 +-- .../orttraining/python/checkpointing_utils.py | 4 +-- orttraining/orttraining/python/ort_trainer.py | 4 +-- .../orttraining/python/training/_utils.py | 2 +- .../python/training/amp/loss_scaler.py | 4 +-- .../orttraining/python/training/checkpoint.py | 12 +++---- .../python/training/onnxblock/optim/optim.py | 2 +- .../training/optim/_megatron_modifier.py | 6 ++-- .../python/training/optim/config.py | 2 +- .../python/training/ort_triton/_cache.py | 4 +-- .../python/training/ort_triton/_codegen.py | 2 +- .../python/training/ort_triton/_common.py | 2 +- .../python/training/ort_triton/_lowering.py | 8 +++-- .../training/ort_triton/_sorted_graph.py | 8 ++--- .../python/training/ort_triton/_utils.py | 2 +- .../training/ort_triton/kernel/_slice_scel.py | 4 +-- .../ortmodule/_custom_gradient_registry.py | 4 +-- .../ortmodule/_custom_op_symbolic_registry.py | 2 +- .../python/training/ortmodule/_io.py | 4 +-- .../training/ortmodule/_runtime_inspector.py | 4 +-- .../_hierarchical_ortmodule.py | 14 ++++---- .../json_config/_load_config_from_json.py | 6 ++-- .../ortmodule/graph_transformer_registry.py | 2 +- .../ortmodule/torch_cpp_extensions/install.py | 2 +- .../orttraining/python/training/orttrainer.py | 2 
+- .../python/training/orttrainer_options.py | 2 +- .../python/training/postprocess.py | 10 +++--- .../training/torchdynamo/ort_backend.py | 2 +- .../python/onnxruntime_test_postprocess.py | 2 +- .../orttraining_test_checkpoint_storage.py | 4 +-- .../python/orttraining_test_data_loader.py | 4 +-- ...orttraining_test_hierarchical_ortmodule.py | 4 +-- .../orttraining_test_layer_norm_transform.py | 4 +-- .../orttraining_test_model_transform.py | 6 ++-- .../orttraining_test_onnx_ops_ortmodule.py | 2 +- .../python/orttraining_test_ortmodule_api.py | 12 +++---- .../orttraining_test_ortmodule_triton.py | 2 +- ...ttraining_test_orttrainer_bert_toy_onnx.py | 6 ++-- .../orttraining_test_orttrainer_frontend.py | 8 ++--- .../mnist_training.py | 2 -- orttraining/tools/amdgpu/script/rocprof.py | 2 +- orttraining/tools/ci_test/compare_results.py | 2 +- .../tools/scripts/gpt2_model_transform.py | 10 +++--- .../tools/scripts/layer_norm_transform.py | 4 +-- orttraining/tools/scripts/model_transform.py | 12 +++---- .../tools/scripts/opset12_model_transform.py | 2 +- .../scripts/performance_investigation.py | 4 +-- .../tools/scripts/pipeline_model_split.py | 18 +++++----- pyproject.toml | 18 +++++----- requirements-lintrunner.txt | 4 +-- .../build_custom_android_package.py | 8 +++-- tools/ci_build/compile_triton.py | 2 +- tools/ci_build/get_docker_image.py | 2 +- .../github/js/validate-npm-packages.py | 6 ++-- .../windows/post_binary_sizes_to_dashboard.py | 2 +- tools/ci_build/patch_manylinux.py | 2 +- tools/doc/rename_folders.py | 2 +- .../nuget/generate_nuspec_for_native_nuget.py | 10 +++--- tools/python/dump_ort_model.py | 2 +- tools/python/gen_contrib_doc.py | 2 +- tools/python/gen_opkernel_doc.py | 2 +- tools/python/onnx2tfevents.py | 8 ++--- tools/python/ort_test_dir_utils.py | 2 +- .../python/util/convert_onnx_models_to_ort.py | 2 +- .../util/mobile_helpers/usability_checker.py | 13 +++---- tools/python/util/onnx_model_utils.py | 6 +++- .../operator_type_usage_processors.py | 6 ++-- tools/python/util/ort_format_model/types.py | 2 +- 114 files changed, 269 insertions(+), 260 deletions(-) diff --git a/docs/python/_common/onnx_sphinx.py b/docs/python/_common/onnx_sphinx.py index 7562d23289d90..dcebf2ced0b11 100644 --- a/docs/python/_common/onnx_sphinx.py +++ b/docs/python/_common/onnx_sphinx.py @@ -683,7 +683,7 @@ def get_onnx_example(op_name): try: mod = importlib.import_module(m) module = m - except ImportError: + except ImportError: # noqa: PERF203 continue if module is None: # Unable to find an example for 'op_name'. 
diff --git a/docs/python/examples/plot_common_errors.py b/docs/python/examples/plot_common_errors.py index dc7078831a257..a121f8ba6cf2d 100644 --- a/docs/python/examples/plot_common_errors.py +++ b/docs/python/examples/plot_common_errors.py @@ -86,7 +86,7 @@ try: r = sess.run([output_name], {input_name: x}) print(f"Shape={x.shape} and predicted labels={r}") - except (RuntimeError, InvalidArgument) as e: + except (RuntimeError, InvalidArgument) as e: # noqa: PERF203 print(f"ERROR with Shape={x.shape} - {e}") for x in [ @@ -99,7 +99,7 @@ try: r = sess.run(None, {input_name: x}) print(f"Shape={x.shape} and predicted probabilities={r[1]}") - except (RuntimeError, InvalidArgument) as e: + except (RuntimeError, InvalidArgument) as e: # noqa: PERF203 print(f"ERROR with Shape={x.shape} - {e}") ######################### @@ -114,5 +114,5 @@ try: r = sess.run([output_name], {input_name: x}) print(f"Shape={x.shape} and predicted labels={r}") - except (RuntimeError, InvalidArgument) as e: + except (RuntimeError, InvalidArgument) as e: # noqa: PERF203 print(f"ERROR with Shape={x.shape} - {e}") diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 1edae383e93e6..9d16e9cb0917b 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -66,7 +66,7 @@ def is_opset_supported(cls, model): " Got Domain '{}' version '{}'.".format(domain, opset.version) ) return False, error_message - except AttributeError: + except AttributeError: # noqa: PERF203 # for some CI pipelines accessing helper.OP_SET_ID_VERSION_MAP # is generating attribute error. TODO investigate the pipelines to # fix this error. Falling back to a simple version check when this error is encountered diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index d6d8ba7b9ebbd..ce408a2ce3e09 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -188,11 +188,10 @@ def enable_fallback(self): self._enable_fallback = True def _validate_input(self, feed_input_names): - # import pdb; pdb.set_trace() missing_input_names = [] for input in self._inputs_meta: if input.name not in feed_input_names and not input.type.startswith("optional"): - missing_input_names.append(input.name) + missing_input_names.append(input.name) # noqa: PERF401 if missing_input_names: raise ValueError( f"Required inputs ({missing_input_names}) are missing from input feed ({feed_input_names})." @@ -219,7 +218,7 @@ def run(self, output_names, input_feed, run_options=None): return self._sess.run(output_names, input_feed, run_options) except C.EPFail as err: if self._enable_fallback: - print(f"EP Error: {str(err)} using {self._providers}") + print(f"EP Error: {err!s} using {self._providers}") print(f"Falling back to {self._fallback_providers} and retrying.") self.set_providers(self._fallback_providers) # Fallback only once. @@ -260,7 +259,7 @@ def invoke(sess, output_names, input_dict_ort_values, run_options): return invoke(self._sess, output_names, input_dict_ort_values, run_options) except C.EPFail as err: if self._enable_fallback: - print(f"EP Error: {str(err)} using {self._providers}") + print(f"EP Error: {err!s} using {self._providers}") print(f"Falling back to {self._fallback_providers} and retrying.") self.set_providers(self._fallback_providers) # Fallback only once. 
diff --git a/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py b/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py index e0967ef5545db..b7e398e7f75f3 100644 --- a/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py +++ b/onnxruntime/python/tools/custom_op_wrapper/create_custom_op_wrapper.py @@ -96,7 +96,7 @@ def __call__(self, parser, namespace, io_strs, opt_str): try: comp_strs = io_str.split(";") - except ValueError: + except ValueError: # noqa: PERF203 parser.error(f"{opt_str}: {io_meta_name} info must be separated by ';'") if len(comp_strs) != 3: diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py index 73323d767aad9..71971a0c86b26 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py @@ -78,7 +78,7 @@ def _test_batched_gemm( for i in range(batch): try: np.testing.assert_allclose(my_cs[i], ref_cs[i], rtol=bounds[i]) - except Exception as err: + except Exception as err: # noqa: PERF203 header = "*" * 30 + impl + "*" * 30 print(header, bounds[i]) print(err) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py index 64c7c76a1ad33..0ff9f775c2cd8 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py @@ -182,7 +182,7 @@ def _test_gemm_softmax_gemm_permute( is_zero_tol, atol, rtol = 1e-3, 2e-2, 1e-2 not_close_to_zeros = np.abs(ref) > is_zero_tol np.testing.assert_allclose(out[not_close_to_zeros], ref[not_close_to_zeros], atol=atol, rtol=rtol) - except Exception as err: + except Exception as err: # noqa: PERF203 header = "*" * 30 + impl + "*" * 30 print(header) print(err) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py index 6cb984935c812..0dfaa059768fa 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py @@ -58,7 +58,7 @@ def _test_gemm(func, dtype: str, transa: bool, transb: bool, m: int, n: int, k: try: np.testing.assert_allclose(my_c, ref_c, rtol=bound) - except Exception as err: + except Exception as err: # noqa: PERF203 header = "*" * 30 + impl + "*" * 30 print(header) print(err) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py index 9b2b0b0871574..1021b3695990a 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py @@ -82,7 +82,7 @@ def _test_strided_batched_gemm( for i in range(batch): try: np.testing.assert_allclose(my_c[i], ref_c[i], rtol=bounds[i]) - except Exception as err: + except Exception as err: # noqa: PERF203 header = "*" * 30 + impl + "*" * 30 print(header, bounds[i]) print(err) diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index aeb78f03dd721..ee86adb76602e 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ 
b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -96,7 +96,7 @@ def unregister(): for name in _registered_ops: try: torch.onnx.unregister_custom_op_symbolic(name, _OPSET_VERSION) - except AttributeError: + except AttributeError: # noqa: PERF203 # The symbolic_registry module was removed in PyTorch 1.13. # We are importing it here for backwards compatibility # because unregister_custom_op_symbolic is not available before PyTorch 1.12 diff --git a/onnxruntime/python/tools/qnn/add_trans_cast.py b/onnxruntime/python/tools/qnn/add_trans_cast.py index b4fb74ca665e3..bd6b8701f8fb8 100644 --- a/onnxruntime/python/tools/qnn/add_trans_cast.py +++ b/onnxruntime/python/tools/qnn/add_trans_cast.py @@ -142,7 +142,7 @@ def gen_to_channel_first_perm(rank): perm.append(0) perm.append(rank - 1) for i in range(1, rank - 1): - perm.append(i) + perm.append(i) # noqa: PERF402 return perm @@ -152,7 +152,7 @@ def gen_to_channel_last_perm(rank): perm = [] perm.append(0) for i in range(2, rank): - perm.append(i) + perm.append(i) # noqa: PERF402 perm.append(1) return perm diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 6949d2dac3858..064be434c81d8 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -370,7 +370,7 @@ def augment_graph(self): self.tensors_to_calibrate, value_infos = self.select_tensors_to_calibrate(self.model) for tensor in self.tensors_to_calibrate: if tensor not in self.model_original_outputs: - self.model.graph.output.append(value_infos[tensor]) + self.model.graph.output.append(value_infos[tensor]) # noqa: PERF401 onnx.save( self.model, diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index cb7836ab283e8..d2e16f50e6e33 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -176,7 +176,7 @@ def get_children(self, node, input_name_to_nodes=None): for output in node.output: if output in input_name_to_nodes: for node in input_name_to_nodes[output]: - children.append(node) + children.append(node) # noqa: PERF402 return children def get_parents(self, node, output_name_to_node=None): @@ -186,7 +186,7 @@ def get_parents(self, node, output_name_to_node=None): parents = [] for input in node.input: if input in output_name_to_node: - parents.append(output_name_to_node[input]) + parents.append(output_name_to_node[input]) # noqa: PERF401 return parents def get_parent(self, node, idx, output_name_to_node=None): @@ -222,7 +222,7 @@ def find_nodes_by_initializer(self, graph, initializer): for node in graph.node: for node_input in node.input: if node_input == initializer.name: - nodes.append(node) + nodes.append(node) # noqa: PERF401 return nodes @staticmethod @@ -379,7 +379,7 @@ def remove_unused_constant(self): and not self.is_graph_output(node.output[0]) and node.output[0] not in input_name_to_nodes ): - unused_nodes.append(node) + unused_nodes.append(node) # noqa: PERF401 self.remove_nodes(unused_nodes) diff --git a/onnxruntime/python/tools/quantization/qdq_loss_debug.py b/onnxruntime/python/tools/quantization/qdq_loss_debug.py index 67938de54a10b..dd4e831b4b28b 100644 --- a/onnxruntime/python/tools/quantization/qdq_loss_debug.py +++ b/onnxruntime/python/tools/quantization/qdq_loss_debug.py @@ -145,7 +145,7 @@ def collect_activations( intermediate_outputs = [] for input_d in input_reader: - 
intermediate_outputs.append(inference_session.run(None, input_d)) + intermediate_outputs.append(inference_session.run(None, input_d)) # noqa: PERF401 if not intermediate_outputs: raise RuntimeError("No data is collected while running augmented model!") diff --git a/onnxruntime/python/tools/quantization/shape_inference.py b/onnxruntime/python/tools/quantization/shape_inference.py index 344009300ef61..4b5cb9b24263e 100644 --- a/onnxruntime/python/tools/quantization/shape_inference.py +++ b/onnxruntime/python/tools/quantization/shape_inference.py @@ -9,6 +9,7 @@ import tempfile import traceback from pathlib import Path +from typing import Optional import onnx @@ -32,7 +33,7 @@ def quant_pre_process( verbose: int = 0, save_as_external_data: bool = False, all_tensors_to_one_file: bool = False, - external_data_location: str = None, + external_data_location: Optional[str] = None, external_data_size_threshold: int = 1024, ) -> None: """Shape inference and model optimization, in preparation for quantization. diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index b9c0d4ab1f017..1e818ef7f7f56 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -747,7 +747,7 @@ def _compute_matmul_shape(self, node, output_dtype=None): else: lhs_reduce_dim = -1 rhs_reduce_dim = -2 - new_shape = [*self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]), lhs_shape[-2]] + [rhs_shape[-1]] + new_shape = [*self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]), lhs_shape[-2], rhs_shape[-1]] # merge reduce dim self._check_merged_dims( [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]], @@ -1008,13 +1008,13 @@ def _infer_Einsum(self, node): # noqa: N802 right_ellipsis_index = right_equation.find(b"...") if right_ellipsis_index != -1: for i in range(num_ellipsis_indices): - new_sympy_shape.append(shape[i]) + new_sympy_shape.append(shape[i]) # noqa: PERF401 for c in right_equation: if c != 46: # c != b'.' 
- new_sympy_shape.append(letter_to_dim[c]) + new_sympy_shape.append(letter_to_dim[c]) # noqa: PERF401 else: for i in range(num_ellipsis_indices): - new_sympy_shape.append(shape[i]) + new_sympy_shape.append(shape[i]) # noqa: PERF401 for c in left_equation: if c != 44 and c != 46: # c != b',' and c != b'.': if c in num_letter_occurrences: diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index cf2c1fcb2d14c..7cf0839024477 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -138,7 +138,7 @@ def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_input logger.info(loaded_input) shape = [] for j in all_inputs_shape[i]: - shape.append(str(j)) + shape.append(str(j)) # noqa: PERF401 shape = "x".join(shape) shape = name + ":" + shape input_shape.append(shape) @@ -266,7 +266,7 @@ def get_ort_session_inputs_and_outputs(name, session, ort_input): for i in range(len(session.get_inputs())): sess_inputs[session.get_inputs()[i].name] = ort_input[i] for i in range(len(session.get_outputs())): - sess_outputs.append(session.get_outputs()[i].name) + sess_outputs.append(session.get_outputs()[i].name) # noqa: PERF401 return (sess_inputs, sess_outputs) @@ -406,7 +406,7 @@ def inference_ort( runtime = runtime[1:] # remove warmup runtimes += runtime - except Exception as e: + except Exception as e: # noqa: PERF203 logger.error(e) if track_memory: end_memory_tracking(p, success) @@ -605,7 +605,7 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): # abs(desired-actual) < rtol * abs(desired) + atol try: np.testing.assert_allclose(ref_o, o, rtol, atol) - except Exception as e: + except Exception as e: # noqa: PERF203 if percentage_in_allowed_threshold(e, percent_mismatch): continue logger.error(e) @@ -1194,7 +1194,7 @@ def read_success_from_file(success_file): with open(success_file) as success: csv_reader = csv.DictReader(success) for row in csv_reader: - success_results.append(row) + success_results.append(row) # noqa: PERF402 success_json = json.loads(json.dumps(success_results, indent=4)) return success_json @@ -2051,7 +2051,7 @@ def __call__(self, parser, namespace, values, option_string): for kv in values.split(","): try: k, v = kv.split("=") - except ValueError: + except ValueError: # noqa: PERF203 parser.error(f"argument {option_string}: Expected '=' between key and value") if k in dict_arg: diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index 350e8b3914ab7..dffe270b18d44 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -146,7 +146,7 @@ def get_memory(memory, model_group): memory_columns = [model_title] for provider in provider_list: if cpu not in provider: - memory_columns.append(provider + memory_ending) + memory_columns.append(provider + memory_ending) # noqa: PERF401 memory_db_columns = [ model_title, cuda, @@ -273,7 +273,7 @@ def get_latency(latency, model_group): latency_columns = [model_title] for provider in provider_list: - latency_columns.append(provider + avg_ending) + latency_columns.append(provider + avg_ending) # noqa: PERF401 latency_db_columns = table_headers latency = adjust_columns(latency, latency_columns, latency_db_columns, model_group) return latency diff --git a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py 
b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py index 4f763ad84426d..b36cd678788eb 100644 --- a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py +++ b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py @@ -75,7 +75,7 @@ def main(): model_list = [] for link in links: - model_list.append(get_model_info(link)) + model_list.append(get_model_info(link)) # noqa: PERF401 write_json(model_list) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index bd9a649ae74fd..023f4a74142dd 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -417,7 +417,7 @@ def run_pytorch( result.update(get_latency_result(runtimes, batch_size)) logger.info(result) results.append(result) - except RuntimeError as e: + except RuntimeError as e: # noqa: PERF203 logger.exception(e) torch.cuda.empty_cache() @@ -572,7 +572,7 @@ def lxmert_forward(): result.update(get_latency_result(runtimes, batch_size)) logger.info(result) results.append(result) - except RuntimeError as e: + except RuntimeError as e: # noqa: PERF203 logger.exception(e) from numba import cuda diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index 5fa64d1bc08b8..639b2f34623cd 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -249,7 +249,7 @@ def output_summary(results, csv_filename, args): data_names.append(f"b{batch_size}") else: for sequence_length in args.sequence_lengths: - data_names.append(f"b{batch_size}_s{sequence_length}") + data_names.append(f"b{batch_size}_s{sequence_length}") # noqa: PERF401 csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names) csv_writer.writeheader() @@ -386,7 +386,7 @@ def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device): # n # for each test run. 
for i in output_buffer_max_sizes: - output_buffers.append(torch.empty(i, dtype=torch.float32, device=device)) + output_buffers.append(torch.empty(i, dtype=torch.float32, device=device)) # noqa: PERF401 def set_random_seed(seed=123): diff --git a/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py b/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py index 1027cd7213df5..89ed140e815d8 100644 --- a/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py +++ b/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py @@ -197,7 +197,7 @@ def tf2pt_pipeline_test(): input = torch.randint(low=0, high=config.vocab_size - 1, size=(4, 128), dtype=torch.long) try: model(input) - except RuntimeError as e: + except RuntimeError as e: # noqa: PERF203 logger.exception(e) diff --git a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py index f5ec5b884f5aa..5c49d80d64e7c 100644 --- a/onnxruntime/python/tools/transformers/convert_to_packing_mode.py +++ b/onnxruntime/python/tools/transformers/convert_to_packing_mode.py @@ -124,7 +124,7 @@ def _replace_attention_with_packing_attention(self, token_offset: str, cumulativ attributes = [] for attr in attention.attribute: if attr.name in ["num_heads", "qkv_hidden_sizes", "scale"]: - attributes.append(attr) + attributes.append(attr) # noqa: PERF401 packed_attention.attribute.extend(attributes) packed_attention.domain = "com.microsoft" diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index c2a7a055e957d..f1f19f3eaaf5b 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -342,7 +342,7 @@ def convert_float_to_float16( # For Resize/GroupNorm, attribute data type cannot be changed if n.op_type not in ["Resize", "GroupNorm"]: for attr in n.attribute: - next_level.append(attr) + next_level.append(attr) # noqa: PERF402 else: mixed_float_type_node_list.append(n) @@ -351,7 +351,7 @@ def convert_float_to_float16( if isinstance(q, onnx_proto.AttributeProto): next_level.append(q.g) for n in q.graphs: - next_level.append(n) + next_level.append(n) # noqa: PERF402 q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val)) for n in q.tensors: n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val) # noqa: PLW2901 diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index 5e8819765c6b4..a20febb9f0a9a 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- from logging import getLogger -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from fusion_base import Fusion from fusion_utils import FusionUtils @@ -428,7 +428,7 @@ def create_fused_node( word_embedding_gather: NodeProto, position_embedding_gather: NodeProto, segment_embedding_gather: Union[None, NodeProto], - position_ids: str = None, + position_ids: Optional[str] = None, embedding_sum_output=False, ): """Create an EmbedLayerNormalization node. Note that segment embedding is optional. 
diff --git a/onnxruntime/python/tools/transformers/fusion_group_norm.py b/onnxruntime/python/tools/transformers/fusion_group_norm.py index c04c806eaaa86..f866bfce86d60 100644 --- a/onnxruntime/python/tools/transformers/fusion_group_norm.py +++ b/onnxruntime/python/tools/transformers/fusion_group_norm.py @@ -103,7 +103,7 @@ def fuse(self, add_node, input_name_to_nodes: Dict, output_name_to_node: Dict): group_norm_name = self.model.create_node_name("GroupNorm", name_prefix="GroupNorm") - if weight_elements not in [320, 640, 960, 1280, 1920, 2560] + [128, 256, 512]: + if weight_elements not in [320, 640, 960, 1280, 1920, 2560, 128, 256, 512]: logger.info("GroupNorm channels=%d", weight_elements) gamma = helper.make_tensor( diff --git a/onnxruntime/python/tools/transformers/models/bert/eval_squad.py b/onnxruntime/python/tools/transformers/models/bert/eval_squad.py index 66265d7b1ea71..c7194b377a526 100644 --- a/onnxruntime/python/tools/transformers/models/bert/eval_squad.py +++ b/onnxruntime/python/tools/transformers/models/bert/eval_squad.py @@ -150,7 +150,7 @@ def output_summary(results: List[Dict[str, Any]], csv_filename: str, metric_name key_names = [] for sequence_length in sequence_lengths: for batch_size in batch_sizes: - key_names.append(f"b{batch_size}_s{sequence_length}") + key_names.append(f"b{batch_size}_s{sequence_length}") # noqa: PERF401 csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + key_names) csv_writer.writeheader() diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py index e8553e2cae0f7..20160d0406f7a 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py @@ -364,7 +364,7 @@ def main(args): # Results of IO binding might be in GPU. Copy outputs to CPU for comparison. copy_outputs = [] for output in ort_outputs: - copy_outputs.append(output.cpu().numpy()) + copy_outputs.append(output.cpu().numpy()) # noqa: PERF401 if gpt2helper.compare_outputs( outputs, @@ -404,7 +404,7 @@ def main(args): "onnxruntime_latency": f"{ort_latency:.2f}", } csv_writer.writerow(row) - except Exception: + except Exception: # noqa: PERF203 logger.error("Exception", exc_info=True) return None diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index b10f5ba763678..50cdc92e61b6e 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -75,7 +75,7 @@ def post_process(result, num_layer): for i in range(num_layer): # Since transformers v4.*, past key and values are separated outputs. # Here we concate them into one tensor to be compatible with Attention operator. 
- present.append( + present.append( # noqa: PERF401 torch.cat( (result[1][i][0].unsqueeze(0), result[1][i][1].unsqueeze(0)), dim=0, diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py index 3bcb80478e730..f70b6520e9f80 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py @@ -134,7 +134,7 @@ def load_results_from_csv(csv_path): with open(csv_path, newline="") as csvfile: reader = csv.DictReader(csvfile) for row in reader: - rows.append(row) + rows.append(row) # noqa: PERF402 return rows @@ -256,7 +256,7 @@ def run_significance_test(rows, output_csv_path): utest_statistic, utest_pvalue = scipy.stats.mannwhitneyu( a, b, use_continuity=True, alternative="two-sided" ) # TODO: shall we use one-sided: less or greater according to "top1_match_rate" - except ValueError: # ValueError: All numbers are identical in mannwhitneyu + except ValueError: # ValueError: All numbers are identical in mannwhitneyu # noqa: PERF203 utest_statistic = None utest_pvalue = None ttest_statistic, ttest_pvalue = scipy.stats.ttest_ind(a, b, axis=None, equal_var=True) diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index bf6c1e60308be..5f67498065ae2 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -645,7 +645,7 @@ def run_tests( args = parse_arguments(f"{arguments} -t {test_times}".split(" ")) latency_results = launch_test(args) - except KeyboardInterrupt as exc: + except KeyboardInterrupt as exc: # noqa: PERF203 raise RuntimeError("Keyboard Interrupted") from exc except Exception: traceback.print_exc() @@ -687,7 +687,7 @@ def output_summary(results, csv_filename, data_field="average_latency_ms"): data_names = [] for sequence_length in sequence_lengths: for batch_size in batch_sizes: - data_names.append(f"b{batch_size}_s{sequence_length}") + data_names.append(f"b{batch_size}_s{sequence_length}") # noqa: PERF401 csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names) csv_writer.writeheader() diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py index bd29e3e42c7e6..37c74217c6b4e 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py @@ -645,7 +645,7 @@ def __allocate_buffers(self, image_height, image_width, batch_size): @torch.no_grad() def __call__( self, - prompt: Union[str, List[str]] = None, + prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py index 80f257db2981b..d8abd56d0e659 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py +++ 
b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py @@ -798,7 +798,7 @@ def __allocate_buffers(self, image_height, image_width, batch_size): @torch.no_grad() def __call__( self, - prompt: Union[str, List[str]] = None, + prompt: Optional[Union[str, List[str]]] = None, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py index 027fc32d5a497..ceeb96e877753 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py @@ -9,7 +9,7 @@ import sys import tempfile from pathlib import Path -from typing import List, Union +from typing import List, Optional, Union import numpy import onnx @@ -38,7 +38,7 @@ def __init__( decoder: torch.nn.Module, lm_head: torch.nn.Module, config: Union[T5Config, MT5Config], - decoder_start_token_id: int = None, + decoder_start_token_id: Optional[int] = None, ): super().__init__() self.decoder = decoder @@ -204,10 +204,10 @@ def create_dummy( past = [] for _ in range(2 * num_layers): - past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device)) + past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401 for _ in range(2 * num_layers): - past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device)) + past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401 else: past = None diff --git a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py index 7d6d038ffa642..e85757ded8d06 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py +++ b/onnxruntime/python/tools/transformers/models/whisper/whisper_decoder.py @@ -9,7 +9,7 @@ import sys import tempfile from pathlib import Path -from typing import List, Union +from typing import List, Optional, Union import numpy import onnx @@ -36,7 +36,7 @@ def __init__( self, decoder: torch.nn.Module, config: WhisperConfig, - decoder_start_token_id: int = None, + decoder_start_token_id: Optional[int] = None, ): super().__init__() self.decoder = decoder @@ -167,10 +167,10 @@ def create_dummy( past = [] for _ in range(2 * num_layers): - past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device)) + past.append(torch.rand(self_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401 for _ in range(2 * num_layers): - past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device)) + past.append(torch.rand(cross_attention_past_shape, dtype=float_type, device=device)) # noqa: PERF401 else: past = None diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index ab2ae5ceac946..fe6b877e562e7 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -79,7 +79,7 @@ def nodes(self): all_nodes = [] for graph in self.graphs(): for node in graph.node: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF402 return all_nodes def graph(self): @@ -108,14 +108,14 @@ def get_graphs_input_names(self): input_names = [] for graph in self.graphs(): for input in graph.input: - 
input_names.append(input.name) + input_names.append(input.name) # noqa: PERF401 return input_names def get_graphs_output_names(self): output_names = [] for graph in self.graphs(): for output in graph.output: - output_names.append(output.name) + output_names.append(output.name) # noqa: PERF401 return output_names def get_graph_by_node(self, node): @@ -217,7 +217,7 @@ def get_nodes_by_op_type(self, op_type): nodes = [] for node in self.nodes(): if node.op_type == op_type: - nodes.append(node) + nodes.append(node) # noqa: PERF401 return nodes def get_children(self, node, input_name_to_nodes=None): @@ -228,7 +228,7 @@ def get_children(self, node, input_name_to_nodes=None): for output in node.output: if output in input_name_to_nodes: for node in input_name_to_nodes[output]: - children.append(node) + children.append(node) # noqa: PERF402 return children def get_parents(self, node, output_name_to_node=None): @@ -238,7 +238,7 @@ def get_parents(self, node, output_name_to_node=None): parents = [] for input in node.input: if input in output_name_to_node: - parents.append(output_name_to_node[input]) + parents.append(output_name_to_node[input]) # noqa: PERF401 return parents def get_parent(self, node, i, output_name_to_node=None): @@ -659,8 +659,8 @@ def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs): for vi in model.graph.value_info: if vi.name in name_vi: del name_vi[vi.name] - for _, vi in name_vi.items(): - model.graph.value_info.append(vi) + for vi in name_vi.values(): + model.graph.value_info.append(vi) # noqa: PERF402 except Exception: logger.warning( "Failed to run symbolic shape inference. Please file an issue in https://github.com/microsoft/onnxruntime." @@ -792,7 +792,7 @@ def remove_unused_constant(self): nodes = self.nodes() for node in nodes: if node.op_type == "Constant" and node.output[0] not in input_name_to_nodes: - unused_nodes.append(node) + unused_nodes.append(node) # noqa: PERF401 self.remove_nodes(unused_nodes) @@ -829,10 +829,7 @@ def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): all_nodes.append(last_node) all_nodes.extend(nodes) - nodes_to_remove = [] - for node in self.model.graph.node: - if node not in all_nodes: - nodes_to_remove.append(node) + nodes_to_remove = [node for node in self.model.graph.node if node not in all_nodes] self.remove_nodes(nodes_to_remove) @@ -840,7 +837,7 @@ def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): output_to_remove = [] for output in self.model.graph.output: if output.name not in outputs: - output_to_remove.append(output) + output_to_remove.append(output) # noqa: PERF401 for output in output_to_remove: self.model.graph.output.remove(output) @@ -848,9 +845,7 @@ def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): input_to_remove = [] if allow_remove_graph_inputs: input_name_to_nodes = self.input_name_to_nodes() - for input in self.model.graph.input: - if input.name not in input_name_to_nodes: - input_to_remove.append(input) + input_to_remove = [input for input in self.model.graph.input if input.name not in input_name_to_nodes] for input in input_to_remove: self.model.graph.input.remove(input) @@ -887,7 +882,7 @@ def update_graph(self, verbose=False, allow_remove_graph_inputs=False): if allow_remove_graph_inputs: for input in graph.input: if input.name not in remaining_input_names: - inputs_to_remove.append(input) + inputs_to_remove.append(input) # noqa: PERF401 for input in inputs_to_remove: graph.input.remove(input) @@ -1063,7 +1058,7 @@ def 
get_graph_inputs_excluding_initializers(self): graph_inputs = [] for input in self.model.graph.input: if self.get_initializer(input.name) is None: - graph_inputs.append(input) + graph_inputs.append(input) # noqa: PERF401 return graph_inputs def get_opset_version(self): @@ -1217,7 +1212,7 @@ def use_float16(self): sub_graphs.append(attr.g) for g in attr.graphs: - sub_graphs.append(g) + sub_graphs.append(g) # noqa: PERF402 if isinstance(attr.t, TensorProto) and attr.t.data_type == TensorProto.FLOAT16: return True diff --git a/onnxruntime/python/tools/transformers/onnx_model_unet.py b/onnxruntime/python/tools/transformers/onnx_model_unet.py index 00fc0763d820c..09a6ecea9f94a 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_unet.py +++ b/onnxruntime/python/tools/transformers/onnx_model_unet.py @@ -47,7 +47,7 @@ def remove_useless_div(self): nodes_to_remove = [] for div in div_nodes: if self.find_constant_input(div, 1.0) == 1: - nodes_to_remove.append(div) + nodes_to_remove.append(div) # noqa: PERF401 for node in nodes_to_remove: self.replace_input_of_all_nodes(node.output[0], node.input[0]) diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index bf507a0d8a0a3..c3fa0435dc9b4 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -16,7 +16,7 @@ from collections import deque # noqa: F401 from datetime import datetime from pathlib import Path # noqa: F401 -from typing import List +from typing import List, Optional import numpy as np import onnx @@ -78,7 +78,7 @@ def get_reshape_shape_inputs(self): shape_inputs = [] for node in self.model.graph.node: if node.op_type == "Reshape": - shape_inputs.append(node.input[1]) + shape_inputs.append(node.input[1]) # noqa: PERF401 return shape_inputs @@ -287,7 +287,7 @@ def optimize( input_mask: str, enable_shape_opt: bool, enable_reshape_opt: bool, - output_names: List[str] = None, + output_names: Optional[List[str]] = None, batch_size=1, sequence_length=128, verbose=False, diff --git a/onnxruntime/test/python/onnxruntime_test_float8.py b/onnxruntime/test/python/onnxruntime_test_float8.py index 7fc93ce684976..ed1cf922ddae9 100644 --- a/onnxruntime/test/python/onnxruntime_test_float8.py +++ b/onnxruntime/test/python/onnxruntime_test_float8.py @@ -37,7 +37,7 @@ class TestInferenceSession(unittest.TestCase): `_. 
""" - dtypes = {"FLOAT": np.float32, "FLOAT16": np.float16} + dtypes = frozenset({"FLOAT": np.float32, "FLOAT16": np.float16}) x = np.array( [0.4068359375, 352, 416, 336, 304, 272, -248, -100, 1e-4, 1e-2, 416, 432, 1e5, np.inf, -np.inf, np.nan], dtype=np.float32, diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 988b5dc07cfa0..096f6c80043d4 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -406,7 +406,7 @@ def run_advanced_test(): run_base_test2() run_advanced_test() - except OSError: + except OSError: # noqa: PERF203 continue else: break diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index d0ea59b994cb7..67db411ddc246 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -49,7 +49,7 @@ def test_symbolic_shape_infer(self): # https://github.com/onnx/models/issues/562 if any(model_name in str(filename) for model_name in skipped_models): - print(f"Skip symbolic shape inference on : {str(filename)}") + print(f"Skip symbolic shape inference on : {filename!s}") continue print("Running symbolic shape inference on : " + str(filename)) diff --git a/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py b/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py index 08cbc4a5d392d..540f39b797bdb 100644 --- a/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @@ -20,7 +20,8 @@ def setUp(self): @unittest.skip( "Temporarily disable this test. The graph below will trigger ORT to " - "sort backward graph before forward graph which gives incorrect result." + "sort backward graph before forward graph which gives incorrect result. 
" + "https://github.com/microsoft/onnxruntime/issues/16801" ) def test_training_and_eval_dropout(self): class TwoDropoutNet(nn.Module): diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py index 14be6fa45c8c1..93c684cbc21ca 100644 --- a/onnxruntime/test/python/quantization/test_calibration.py +++ b/onnxruntime/test/python/quantization/test_calibration.py @@ -35,7 +35,7 @@ def __init__(self): self.count = 4 self.input_data_list = [] for _ in range(self.count): - self.input_data_list.append(np.random.normal(0, 0.33, [1, 3, 1, 3]).astype(np.float32)) + self.input_data_list.append(np.random.normal(0, 0.33, [1, 3, 1, 3]).astype(np.float32)) # noqa: PERF401 def get_next(self): if self.preprocess_flag: diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index e9108f157f953..3087cb963333d 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -93,7 +93,7 @@ def __init__(self, input_shape=[1, 3, 1, 3]): # noqa: B006 self.count = 2 self.input_data_list = [] for _ in range(self.count): - self.input_data_list.append(np.random.normal(0, 0.33, input_shape).astype(np.float32)) + self.input_data_list.append(np.random.normal(0, 0.33, input_shape).astype(np.float32)) # noqa: PERF401 def get_next(self): if self.preprocess_flag: @@ -144,7 +144,7 @@ def test_saved_tensors_match_internal_tensors(self): data_reader.rewind() oracle_outputs = [] for input_d in data_reader: - oracle_outputs.append(infer_session.run(None, input_d)) + oracle_outputs.append(infer_session.run(None, input_d)) # noqa: PERF401 output_dict = {} output_info = infer_session.get_outputs() diff --git a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py index c42c42c3ca170..9986e81cd7c4b 100644 --- a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py +++ b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py @@ -342,7 +342,7 @@ def generate_test_data( path = os.path.join(output_path, "test_data_set_" + str(test_case)) try: os.mkdir(path) - except OSError: + except OSError: # noqa: PERF203 print("Creation of the directory %s failed" % path) else: print("Successfully created the directory %s " % path) diff --git a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py index 065783d5812a8..3e088f3870212 100644 --- a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py +++ b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py @@ -451,7 +451,7 @@ def generate_test_data( path = os.path.join(output_path, "test_data_set_" + str(test_case)) try: os.mkdir(path) - except OSError: + except OSError: # noqa: PERF203 print("Creation of the directory %s failed" % path) else: print("Successfully created the directory %s " % path) diff --git a/onnxruntime/test/python/transformers/test_parity_t5_mha.py 
b/onnxruntime/test/python/transformers/test_parity_t5_mha.py index 23218e4943042..409f8de5e1a80 100644 --- a/onnxruntime/test/python/transformers/test_parity_t5_mha.py +++ b/onnxruntime/test/python/transformers/test_parity_t5_mha.py @@ -491,7 +491,7 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): # attn_output = self.o(attn_output) # ORT places this matmul outside of MHA op present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + outputs = (attn_output, present_key_value_state) return outputs @@ -628,7 +628,7 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): if past_key_value is not None and self.is_static_kv: output = torch.tensor(ort_output) else: - output = (torch.tensor(ort_output[0]),) + ((torch.tensor(ort_output[1]), torch.tensor(ort_output[2])),) + output = (torch.tensor(ort_output[0]), (torch.tensor(ort_output[1]), torch.tensor(ort_output[2]))) return output diff --git a/orttraining/orttraining/python/checkpointing_utils.py b/orttraining/orttraining/python/checkpointing_utils.py index ffb066b2043c4..460b9982297d1 100644 --- a/orttraining/orttraining/python/checkpointing_utils.py +++ b/orttraining/orttraining/python/checkpointing_utils.py @@ -53,7 +53,7 @@ def __init__(self, checkpoint_files, clean_state_dict=None): self.weight_shape_map = dict() self.sharded_params = set() - def _split_name(self, name): + def _split_name(self, name: str): name_split = name.split("_view_") view_num = None if len(name_split) > 1: @@ -69,7 +69,7 @@ def _split_name(self, name): elif name_split[0].endswith("_fp16"): mp_suffix = "_fp16" param_name = name_split[0] - if optimizer_key != "": # noqa: PLC1901 + if optimizer_key: param_name = param_name.split(optimizer_key)[1] param_name = param_name.split("_fp16")[0] return param_name, optimizer_key, view_num, mp_suffix diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 8b06816e16ff4..7c90054a85dc5 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -1207,10 +1207,10 @@ def __init__( self, loss_scale_input_name, is_dynamic_scale, - loss_scale=float(1 << 16), # noqa: B008 + loss_scale=float(1 << 16), up_scale_window=2000, min_loss_scale=1.0, - max_loss_scale=float(1 << 24), # noqa: B008 + max_loss_scale=float(1 << 24), ): super().__init__() self.loss_scale_input_name_ = loss_scale_input_name diff --git a/orttraining/orttraining/python/training/_utils.py b/orttraining/orttraining/python/training/_utils.py index 475a956cc7416..4eb79443c8f1a 100644 --- a/orttraining/orttraining/python/training/_utils.py +++ b/orttraining/orttraining/python/training/_utils.py @@ -107,7 +107,7 @@ def dtype_torch_to_numpy(torch_dtype): elif torch_dtype == torch.bool: return np.bool_ else: - raise ValueError(f"torch_dtype ({str(torch_dtype)}) type is not supported by Numpy") + raise ValueError(f"torch_dtype ({torch_dtype!s}) type is not supported by Numpy") def dtype_onnx_to_torch(onnx_type): diff --git a/orttraining/orttraining/python/training/amp/loss_scaler.py b/orttraining/orttraining/python/training/amp/loss_scaler.py index b842ec9346f9f..440372d9305ea 100644 --- a/orttraining/orttraining/python/training/amp/loss_scaler.py +++ b/orttraining/orttraining/python/training/amp/loss_scaler.py @@ -88,10 +88,10 @@ class DynamicLossScaler(LossScaler): def __init__( self, automatic_update=True, - loss_scale=float(1 
<< 16), # noqa: B008 + loss_scale=float(1 << 16), up_scale_window=2000, min_loss_scale=1.0, - max_loss_scale=float(1 << 24), # noqa: B008 + max_loss_scale=float(1 << 24), ): super().__init__(loss_scale) self.automatic_update = automatic_update diff --git a/orttraining/orttraining/python/training/checkpoint.py b/orttraining/orttraining/python/training/checkpoint.py index 3f2eeb53e0161..079c827ea70d9 100644 --- a/orttraining/orttraining/python/training/checkpoint.py +++ b/orttraining/orttraining/python/training/checkpoint.py @@ -145,7 +145,7 @@ def _order_paths(paths, D_groups, H_groups): world_rank = _utils.state_dict_trainer_options_world_rank_key() for path in paths: - trainer_options_path_tuples.append( + trainer_options_path_tuples.append( # noqa: PERF401 (_checkpoint_storage.load(path, key=_utils.state_dict_trainer_options_key()), path) ) @@ -365,7 +365,7 @@ def _get_parallellism_groups(data_parallel_size, horizontal_parallel_size, world for data_group_id in range(num_data_groups): data_group_ranks = [] for r in range(data_parallel_size): - data_group_ranks.append(data_group_id + horizontal_parallel_size * r) + data_group_ranks.append(data_group_id + horizontal_parallel_size * r) # noqa: PERF401 data_groups.append(data_group_ranks) num_horizontal_groups = world_size // horizontal_parallel_size @@ -373,7 +373,7 @@ def _get_parallellism_groups(data_parallel_size, horizontal_parallel_size, world for hori_group_id in range(num_horizontal_groups): hori_group_ranks = [] for r in range(horizontal_parallel_size): - hori_group_ranks.append(hori_group_id * horizontal_parallel_size + r) + hori_group_ranks.append(hori_group_id * horizontal_parallel_size + r) # noqa: PERF401 horizontal_groups.append(hori_group_ranks) return data_groups, horizontal_groups @@ -665,10 +665,10 @@ def __init__(self, checkpoint_files, clean_state_dict=None): self.clean_state_dict = clean_state_dict self.world_size = int(self.checkpoint_files[0].split("ZeRO")[1].split(".")[2]) + 1 assert len(self.checkpoint_files) == self.world_size, f"Could not find {self.world_size} files" - self.weight_shape_map = dict() + self.weight_shape_map = {} self.sharded_params = set() - def _split_name(self, name): + def _split_name(self, name: str): name_split = name.split("_view_") view_num = None if len(name_split) > 1: @@ -684,7 +684,7 @@ def _split_name(self, name): elif name_split[0].endswith("_fp16"): mp_suffix = "_fp16" param_name = name_split[0] - if optimizer_key != "": # noqa: PLC1901 + if optimizer_key: param_name = param_name.split(optimizer_key)[1] param_name = param_name.split("_fp16")[0] return param_name, optimizer_key, view_num, mp_suffix diff --git a/orttraining/orttraining/python/training/onnxblock/optim/optim.py b/orttraining/orttraining/python/training/onnxblock/optim/optim.py index 94d4c2791d779..8a5e387342ab2 100644 --- a/orttraining/orttraining/python/training/onnxblock/optim/optim.py +++ b/orttraining/orttraining/python/training/onnxblock/optim/optim.py @@ -187,7 +187,7 @@ def build(self, parameters): # Prepare the tensor sequence inputs for params and moments for input_name in [params_name, gradients_name, first_order_moments_name, second_order_moments_name]: - onnx_model.graph.input.append( + onnx_model.graph.input.append( # noqa: PERF401 onnx.helper.make_tensor_sequence_value_info(input_name, trainable_parameters[0].data_type, None) ) diff --git a/orttraining/orttraining/python/training/optim/_megatron_modifier.py b/orttraining/orttraining/python/training/optim/_megatron_modifier.py index 
b6c5823110382..707727120c5cd 100644 --- a/orttraining/orttraining/python/training/optim/_megatron_modifier.py +++ b/orttraining/orttraining/python/training/optim/_megatron_modifier.py @@ -48,7 +48,7 @@ def clip_master_grads(target, max_norm, norm_type=2): fp32_params = [] for param_group in target.optimizer.param_groups: for param in param_group["params"]: - fp32_params.append(param) + fp32_params.append(param) # noqa: PERF402 #### THIS IS THE ORIGINAL IMPLEMENTATION #### # return self.clip_grad_norm(fp32_params, max_norm, norm_type) #### END OF THE ORIGINAL IMPLEMENTATION #### @@ -69,10 +69,10 @@ def _check_overflow(target): params = [] for group in target.fp16_groups: for param in group: - params.append(param) + params.append(param) # noqa: PERF402 for group in target.fp32_from_fp32_groups: for param in group: - params.append(param) + params.append(param) # noqa: PERF402 #### THIS IS THE ORIGINAL IMPLEMENTATION #### # self.overflow = self.loss_scaler.has_overflow(params) #### END OF THE ORIGINAL IMPLEMENTATION #### diff --git a/orttraining/orttraining/python/training/optim/config.py b/orttraining/orttraining/python/training/optim/config.py index d0b6d5fc33993..d63c7ab40a787 100644 --- a/orttraining/orttraining/python/training/optim/config.py +++ b/orttraining/orttraining/python/training/optim/config.py @@ -55,7 +55,7 @@ def __init__(self, name, params, defaults): "Each dict inside 'params' must contain a {'params' : [model parameter names]} entry" " and additional entries for custom hyper parameter values" ) - for k, _ in group.items(): + for k in group: if k != "params": assert ( k in defaults or k.replace("_coef", "") in defaults diff --git a/orttraining/orttraining/python/training/ort_triton/_cache.py b/orttraining/orttraining/python/training/ort_triton/_cache.py index 1a9f94d1060a8..ede9cd86a9da5 100644 --- a/orttraining/orttraining/python/training/ort_triton/_cache.py +++ b/orttraining/orttraining/python/training/ort_triton/_cache.py @@ -48,7 +48,7 @@ def _write(source_code, ext, extra=""): class PyCodeCache: - cache = dict() + cache = dict() # noqa: RUF012 clear = staticmethod(cache.clear) @classmethod @@ -67,7 +67,7 @@ def load(cls, source_code) -> ModuleType: class ModuleCache: - cache = dict() + cache = dict() # noqa: RUF012 clear = staticmethod(cache.clear) @classmethod diff --git a/orttraining/orttraining/python/training/ort_triton/_codegen.py b/orttraining/orttraining/python/training/ort_triton/_codegen.py index 7d7c482dac67c..c071f01f87ea5 100644 --- a/orttraining/orttraining/python/training/ort_triton/_codegen.py +++ b/orttraining/orttraining/python/training/ort_triton/_codegen.py @@ -250,7 +250,7 @@ def ReduceKernelNode( # noqa: N802 elif isinstance(ir_node, ReduceForLoopEnd): indent -= 4 - _COMPUTE_CODE_TEMPLATES = { + _COMPUTE_CODE_TEMPLATES = { # noqa: RUF012 "Add": "{indent}{o0} = {i0} + {i1}\n", "Sub": "{indent}{o0} = {i0} - {i1}\n", "Mul": "{indent}{o0} = {i0} * {i1}\n", diff --git a/orttraining/orttraining/python/training/ort_triton/_common.py b/orttraining/orttraining/python/training/ort_triton/_common.py index da31c74730f42..65540202420b5 100644 --- a/orttraining/orttraining/python/training/ort_triton/_common.py +++ b/orttraining/orttraining/python/training/ort_triton/_common.py @@ -113,7 +113,7 @@ def _infer_dropout(node: NodeProto, input_infos: List[TensorInfo], graph: GraphP class TypeAndShapeInfer: - _INFER_FUNC_MAP = { + _INFER_FUNC_MAP = { # noqa: RUF012 "Add": _infer_elementwise, "Sub": _infer_elementwise, "Mul": _infer_elementwise, diff --git 
a/orttraining/orttraining/python/training/ort_triton/_lowering.py b/orttraining/orttraining/python/training/ort_triton/_lowering.py index 345e9fe445b4d..dacd5c2cac099 100644 --- a/orttraining/orttraining/python/training/ort_triton/_lowering.py +++ b/orttraining/orttraining/python/training/ort_triton/_lowering.py @@ -294,8 +294,8 @@ def _group_nodes(self): producers[output] = node for input in node.input: if input in producers: - precessors[node.name].append(producers[input]) - for _, value in precessors.items(): + precessors[node.name].append(producers[input]) # noqa: PERF401 + for value in precessors.values(): value.sort(key=sorted_nodes.index, reverse=True) for idx in range(len(sorted_nodes) - 1, -1, -1): node = sorted_nodes[idx] @@ -441,7 +441,9 @@ def _insert_load_and_store(self, kernel_node: KernelNode): assert isinstance(sub_nodes[nxt], ReduceForLoopEnd) for reduce_node in sub_nodes[nxt].reduce_nodes: if reduce_node.outputs[0].name in output_name_map: - reduce_store_nodes.append(IONode(reduce_node.outputs[0], kernel_node.offset_calc, False)) + reduce_store_nodes.append( # noqa: PERF401 + IONode(reduce_node.outputs[0], kernel_node.offset_calc, False) + ) new_sub_nodes.append(sub_nodes[nxt]) nxt += 1 cur = nxt diff --git a/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py b/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py index 13d8ca9641bc0..3c34e65f8c3a2 100644 --- a/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py +++ b/orttraining/orttraining/python/training/ort_triton/_sorted_graph.py @@ -86,7 +86,7 @@ def __str__(self): name_map = {} for idx, input in enumerate(self._graph.input): shape_str = str(self._input_shapes[idx]).replace(" ", "") - graph_inputs.append(f"({str(input.type.tensor_type.elem_type)},{shape_str})") + graph_inputs.append(f"({input.type.tensor_type.elem_type!s},{shape_str})") name_map[input.name] = f"i{idx}" graph_inputs_str = ",".join(graph_inputs) @@ -110,7 +110,7 @@ def __str__(self): for node_idx, node in enumerate(self._sorted_nodes): inputs = [] for input in node.input: - inputs.append(name_map.get(input, input)) + inputs.append(name_map.get(input, input)) # noqa: PERF401 inputs_str = ",".join(inputs) outputs = [] for idx, output in enumerate(node.output): @@ -127,7 +127,7 @@ def __str__(self): attributes_str = ",".join(attributes) nodes.append(f"{node.op_type}[{attributes_str}]({inputs_str})->({outputs_str})") nodes_str = ",".join(nodes) - return f"{graph_inputs_str}|{str(len(self._graph.output))}|{constants_str}|{nodes_str}" + return f"{graph_inputs_str}|{len(self._graph.output)!s}|{constants_str}|{nodes_str}" def __hash__(self): return hash(str(self)) @@ -180,7 +180,7 @@ def _decompose(self): else: input_infos = [] for input in node.input: - input_infos.append(self._node_arg_infos[input]) + input_infos.append(self._node_arg_infos[input]) # noqa: PERF401 output_infos = TypeAndShapeInfer.infer(node, input_infos, self._graph) for idx, output in enumerate(node.output): self._node_arg_infos[output] = output_infos[idx] diff --git a/orttraining/orttraining/python/training/ort_triton/_utils.py b/orttraining/orttraining/python/training/ort_triton/_utils.py index c80e28f6f73df..35dba06c76aa7 100644 --- a/orttraining/orttraining/python/training/ort_triton/_utils.py +++ b/orttraining/orttraining/python/training/ort_triton/_utils.py @@ -52,7 +52,7 @@ def topological_sort(inputs: List[str], nodes: List[NodeProto]) -> List[NodeProt continue for consumer in non_const_nodes: if output in consumer.input: - 
output_consumers[node.name].append(consumer) + output_consumers[node.name].append(consumer) # noqa: PERF401 # Topological sort. visited = set() diff --git a/orttraining/orttraining/python/training/ort_triton/kernel/_slice_scel.py b/orttraining/orttraining/python/training/ort_triton/kernel/_slice_scel.py index 53df2b833bf8d..446344a7cde28 100644 --- a/orttraining/orttraining/python/training/ort_triton/kernel/_slice_scel.py +++ b/orttraining/orttraining/python/training/ort_triton/kernel/_slice_scel.py @@ -357,10 +357,10 @@ def transform_slice_scel(graph): all_nodes = [] for node in graph.node: if node not in remove_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF401 for node in triton_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF402 graph.ClearField("node") graph.node.extend(all_nodes) diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index 661629b3bb5c6..156c3e001d88f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -56,8 +56,8 @@ def _to_gradient_definition(gradient): class CustomGradientRegistry: - _GRADIENTS = {} - _STOP_GRADIENT_EDGES = {} + _GRADIENTS = {} # noqa: RUF012 + _STOP_GRADIENT_EDGES = {} # noqa: RUF012 @classmethod def register(cls, domain, name, attributes, fn): diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index b4bee8a17f6f8..ac87dc6abfbd0 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -88,7 +88,7 @@ def _export_with_no_ctx(graph, *args, **kwargs): class CustomOpSymbolicRegistry: - _SYMBOLICS = {} + _SYMBOLICS = {} # noqa: RUF012 @classmethod def register(cls, name, domain, fn): diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py index 5bbaae1903f8b..f5260d70d1aee 100644 --- a/orttraining/orttraining/python/training/ortmodule/_io.py +++ b/orttraining/orttraining/python/training/ortmodule/_io.py @@ -138,7 +138,7 @@ def symbolic(g, self): class _PrimitiveType: - _primitive_types = {int, bool, float} + _primitive_types = {int, bool, float} # noqa: RUF012 @staticmethod def is_primitive_type(value): @@ -153,7 +153,7 @@ def get_primitive_dtype(value): # If `value` is a boolean, save the value of the boolean in dtype. # This way, if the value changes from one forward call to the next, the schema will mismatch, # and the model will be re-exported. 
- return f"{str(type(value))}_{value}" if isinstance(value, bool) else str(type(value)) + return f"{type(value)!s}_{value}" if isinstance(value, bool) else str(type(value)) def flatten_kwargs(kwargs, device): diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index a756527f09e72..a871ed08b9316 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -146,7 +146,7 @@ def initialize(self, model: ModelProto, user_input_names: List[str]) -> None: self._tensor_to_node_map.clear() for node in model.graph.node: for output_name in node.output: - if output_name != "": # noqa: PLC1901 + if output_name != "": self._tensor_to_node_map[output_name] = node self._initialize_embedding_padding_inspector(model, user_input_names) @@ -440,7 +440,7 @@ def _print_embed_label_stats(self): self._stats.clear() def _try_get_node_from_its_output(self, name): - if name == "" or name not in self._tensor_to_node_map: # noqa: PLC1901 + if name == "" or name not in self._tensor_to_node_map: return None return self._tensor_to_node_map[name] diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py b/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py index 993ba915edbba..1061135388bf8 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py @@ -36,7 +36,7 @@ def __init__(self, module, count, log_level, save_onnx, onnx_prefix): self._it = count - 1 self._ortmodules = [] for idx in range(count): - self._ortmodules.append( + self._ortmodules.append( # noqa: PERF401 ORTModule( module, debug_options=DebugOptions( @@ -113,9 +113,9 @@ def recursive_hook(module): # We cannot skip module in allowlist because it's possible that a module is called multiple times # so that we still need to know the number of different input sets and use _IteratedORTModule to handle it. handle_pool.append(module.register_forward_pre_hook(record_args)) - for _, sub_module in module._modules.items(): + for sub_module in module._modules.values(): if isinstance(sub_module, torch.nn.ModuleList): - for _, sub_module_item in sub_module._modules.items(): + for sub_module_item in sub_module._modules.values(): recursive_hook(sub_module_item) else: recursive_hook(sub_module) @@ -142,7 +142,7 @@ def try_export(module, args): except Exception as e: if self._log_level <= LogLevel.WARNING: warnings.warn( - f"Failed to export module with type {type(module).__name__}. Error message: {str(e)}", + f"Failed to export module with type {type(module).__name__}. 
Error message: {e!s}", UserWarning, ) return False @@ -176,9 +176,9 @@ def try_export(module, args): # No sub-module exists, so this module is a leaf return - for _, sub_module in sub_module_dict.items(): + for sub_module in sub_module_dict.values(): if isinstance(sub_module, torch.nn.ModuleList): - for _, sub_module_item in sub_module._modules.items(): + for sub_module_item in sub_module._modules.values(): check_exportable(sub_module_item) else: check_exportable(sub_module) @@ -268,7 +268,7 @@ def recursive_wrap(module, save_onnx=False, onnx_prefix=""): recursive_wrap(self._original_module, self._save_onnx, self._name_prefix) if self._log_level <= LogLevel.WARNING: warnings.warn( - f"Wrapped module: {str(self._original_module)}.", + f"Wrapped module: {self._original_module!s}.", UserWarning, ) self._initialized = True diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py index 6ddb159d18f0b..76c8ce3bf3220 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py @@ -49,7 +49,7 @@ def _update_allow(): key_to_function_mapping = {"Strategy": _update_strategy, "Level": _update_level, "Allow": _update_allow} - for key, _ in data.PropagateCastOps.__dict__.items(): + for key in data.PropagateCastOps.__dict__: key_to_function_mapping[key]() @@ -162,7 +162,7 @@ def _update_onnx_path(): "SaveONNXPath": _update_onnx_path, } - for key, _ in data.DebugOptions.__dict__.items(): + for key in data.DebugOptions.__dict__: key_to_function_mapping[key]() debug_options = DebugOptions(log_level=log_level, save_onnx=save_onnx, onnx_prefix=onnx_prefix) @@ -301,5 +301,5 @@ def load_from_json(ortmodule, path=None): # update the debug config for both train and eval modes ortmodule_config_accessor = ortmodule._torch_module._execution_manager(training_mode) # iterate over the json data instead of checking for keys in json to catch key errors - for key, _ in data.__dict__.items(): + for key in data.__dict__: load_functions[key](ortmodule_config_accessor, data) diff --git a/orttraining/orttraining/python/training/ortmodule/graph_transformer_registry.py b/orttraining/orttraining/python/training/ortmodule/graph_transformer_registry.py index ed0827481072e..70056179c140e 100644 --- a/orttraining/orttraining/python/training/ortmodule/graph_transformer_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/graph_transformer_registry.py @@ -9,7 +9,7 @@ class GraphTransformerRegistry: - _TRANSFORMER_FUNCS = {} + _TRANSFORMER_FUNCS = {} # noqa: RUF012 @classmethod def register(cls, target_modules: str, devices: str, priority: int, fn: Callable[[GraphProto], None]): diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py index bb0952dea56b7..225a01c39fcce 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/install.py @@ -20,7 +20,7 @@ def _list_extensions(path): for root, _, files in os.walk(path): for name in files: if name.lower() == "setup.py": - extensions.append(os.path.join(root, name)) + extensions.append(os.path.join(root, name)) # noqa: PERF401 
return extensions diff --git a/orttraining/orttraining/python/training/orttrainer.py b/orttraining/orttraining/python/training/orttrainer.py index a6c6c8af2723b..3a9dbc08466b6 100644 --- a/orttraining/orttraining/python/training/orttrainer.py +++ b/orttraining/orttraining/python/training/orttrainer.py @@ -933,7 +933,7 @@ def _training_session_run_helper(self, is_train, inputs, inputs_desc, outputs_de # so output will be on the same device as input. try: torch.device(target_device) - except Exception: + except Exception: # noqa: PERF203 # in this case, input/output must on CPU assert input.device.type == "cpu" target_device = "cpu" diff --git a/orttraining/orttraining/python/training/orttrainer_options.py b/orttraining/orttraining/python/training/orttrainer_options.py index fc8322855ddb4..c63ac6f82c87f 100644 --- a/orttraining/orttraining/python/training/orttrainer_options.py +++ b/orttraining/orttraining/python/training/orttrainer_options.py @@ -482,7 +482,7 @@ def __init__(self, options={}): # noqa: B006 def __repr__(self): return "{%s}" % str( ", ".join( - f"'{k}': {repr(v)}" + f"'{k}': {v!r}" for (k, v) in self.__dict__.items() if k not in ["_original_opts", "_validated_opts", "_main_class_name"] ) diff --git a/orttraining/orttraining/python/training/postprocess.py b/orttraining/orttraining/python/training/postprocess.py index b2da6186b62cf..aafc6afce222c 100644 --- a/orttraining/orttraining/python/training/postprocess.py +++ b/orttraining/orttraining/python/training/postprocess.py @@ -26,7 +26,7 @@ def find_input_node(model, arg): for node in model.graph.node: for output in node.output: if output == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else None @@ -35,7 +35,7 @@ def find_output_node(model, arg): for node in model.graph.node: for input in node.input: if input == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else result @@ -189,7 +189,7 @@ def find_nodes(graph, op_type): nodes = [] for node in graph.node: if node.op_type == op_type: - nodes.append(node) + nodes.append(node) # noqa: PERF401 return nodes @@ -382,10 +382,10 @@ def layer_norm_transform(model): all_nodes = [] for node in graph.node: if node not in remove_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF401 for node in layer_norm_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF402 graph.ClearField("node") graph.node.extend(all_nodes) diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py index 2454079dc9c9d..701a4d6ebfc67 100644 --- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py @@ -210,7 +210,7 @@ def _infer_ep_from_graph_module(graph_module: torch.fx.GraphModule) -> Tuple[str if hasattr(output_arg, "meta") and "val" in output_arg.meta: # Select outputs with "val" information. Without "val", # it's not possible access output_arg.meta["val"].device. 
- output_args.append(output_arg.meta["val"]) + output_args.append(output_arg.meta["val"]) # noqa: PERF401 return _infer_ep_from_device(*output_args) graph_module_str = graph_module.print_readable(print_output=False) raise ValueError(f"No output node is found in graph_module: {graph_module_str}") diff --git a/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py b/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py index d5298cf8e860e..380db8cdab4cc 100644 --- a/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py +++ b/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py @@ -64,7 +64,7 @@ def find_nodes(self, model, node_type): nodes = [] for node in model.graph.node: if node.op_type == node_type: - nodes.append(node) + nodes.append(node) # noqa: PERF401 return nodes def get_name(self, name): diff --git a/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py b/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py index 77ba7c41c1268..71d13fdcfd290 100644 --- a/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py +++ b/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py @@ -159,7 +159,7 @@ def test_checkpoint_storage_saved_dict_matches_loaded(checkpoint_storage_test_pa ) def test_checkpoint_storage_saving_non_supported_types_fails(checkpoint_storage_test_parameterized_setup): to_save = checkpoint_storage_test_parameterized_setup - with pytest.raises(Exception): + with pytest.raises(Exception): # noqa: B017 _checkpoint_storage.save(to_save, pytest.checkpoint_path) @@ -233,7 +233,7 @@ def test_checkpoint_storage_saving_and_loading_empty_dictionaries_succeeds(check def test_checkpoint_storage_load_file_that_does_not_exist_fails(checkpoint_storage_test_setup): - with pytest.raises(Exception): + with pytest.raises(Exception): # noqa: B017 _checkpoint_storage.load(pytest.checkpoint_path) diff --git a/orttraining/orttraining/test/python/orttraining_test_data_loader.py b/orttraining/orttraining/test/python/orttraining_test_data_loader.py index aa15b44ae0d66..d55ace62f2673 100644 --- a/orttraining/orttraining/test/python/orttraining_test_data_loader.py +++ b/orttraining/orttraining/test/python/orttraining_test_data_loader.py @@ -20,7 +20,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None): values = [] for _ in range(total_dims): - values.append(rng.randint(0, vocab_size - 1)) + values.append(rng.randint(0, vocab_size - 1)) # noqa: PERF401 return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous() @@ -36,7 +36,7 @@ def floats_tensor(shape, scale=1.0, rng=None, name=None): values = [] for _ in range(total_dims): - values.append(rng.random() * scale) + values.append(rng.random() * scale) # noqa: PERF401 return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous() diff --git a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py index 8afbafccb8241..9f41927c0e4fb 100644 --- a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py @@ -213,7 +213,7 @@ def trial(module_to_wrap, args, expected_num_ortmodule, expected_num_iterated_or call_backward(y_ref) g_ref = [] for param in m.parameters(): - g_ref.append(param.grad.detach()) + g_ref.append(param.grad.detach()) # noqa: PERF401 m.zero_grad() @@ -224,7 +224,7 
@@ def trial(module_to_wrap, args, expected_num_ortmodule, expected_num_iterated_or call_backward(y) g = [] for param in m.parameters(): - g.append(param.grad.detach()) + g.append(param.grad.detach()) # noqa: PERF401 # Some sub-modules become ORTModule. assert expected_num_ortmodule == count_ortmodule(m) diff --git a/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py b/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py index 35d59c1750de4..de8806095f295 100644 --- a/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py @@ -173,10 +173,10 @@ def layer_norm_transform(model_proto): all_nodes = [] for node in graph_proto.node: if node not in removed_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF401 for node in layer_norm_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF402 graph_proto.ClearField("node") graph_proto.node.extend(all_nodes) diff --git a/orttraining/orttraining/test/python/orttraining_test_model_transform.py b/orttraining/orttraining/test/python/orttraining_test_model_transform.py index 3b07aa1f4daf0..70694d03f1b28 100644 --- a/orttraining/orttraining/test/python/orttraining_test_model_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_model_transform.py @@ -13,7 +13,7 @@ def find_single_output_node(model, arg): for node in model.graph.node: for input in node.input: if input == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else None @@ -63,7 +63,7 @@ def fix_transpose(model): for n in model.graph.node: for input in n.input: if input == weight.name: - result.append(n) + result.append(n) # noqa: PERF401 if len(result) > 1: continue perm = node.attribute[0] @@ -93,7 +93,7 @@ def fix_transpose(model): old_ws = [] for t in transpose: if find_single_output_node(model, t[1].name) is None: - old_ws.append(find_weight_index(model, t[1].name)) + old_ws.append(find_weight_index(model, t[1].name)) # noqa: PERF401 old_ws.sort(reverse=True) for w_i in old_ws: del model.graph.initializer[w_i] diff --git a/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py index 27629133dfb3d..4f0925c5c855b 100644 --- a/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py @@ -78,7 +78,7 @@ def run_step(model, x): self.assertIn('op_type: "%s"' % name, str(onnx_graph_inf)) for onnx_model in [onnx_graph_inf, onnx_graph_train]: for oimp in onnx_model.opset_import: - if oimp.domain == "": # noqa: PLC1901 + if oimp.domain == "": self.assertEqual(oimp.version, 15) if op_grad_type is not None: if isinstance(op_grad_type, tuple): diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 625c1ce0d45e9..e67edeeccee4e 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -1497,10 +1497,10 @@ def forward(self, left, right): rhs_op = equation[pos1 + 1 : pos2] lhs_shape = [] for c in lhs_op: - lhs_shape.append(SIZE_MAP[c.upper()]) + lhs_shape.append(SIZE_MAP[c.upper()]) # noqa: PERF401 rhs_shape = [] for c in rhs_op: - 
rhs_shape.append(SIZE_MAP[c.upper()]) + rhs_shape.append(SIZE_MAP[c.upper()]) # noqa: PERF401 pt_model = NeuralNetEinsum(lhs_shape[-1]).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1577,7 +1577,7 @@ def to_string(perm): random.shuffle(output_candidates) output_candidates = output_candidates[:8] for output_candidate in [list(candidate) for candidate in output_candidates]: - all_cases.append((lhs_candidate, rhs_candidate, output_candidate)) + all_cases.append((lhs_candidate, rhs_candidate, output_candidate)) # noqa: PERF401 for case in all_cases: equation = to_string(case[0]) + "," + to_string(case[1]) + "->" + to_string(case[2]) @@ -1587,10 +1587,10 @@ def to_string(perm): rhs_op = equation[pos1 + 1 : pos2] lhs_shape = [] for c in lhs_op: - lhs_shape.append(SIZE_MAP[c.upper()]) + lhs_shape.append(SIZE_MAP[c.upper()]) # noqa: PERF401 rhs_shape = [] for c in rhs_op: - rhs_shape.append(SIZE_MAP[c.upper()]) + rhs_shape.append(SIZE_MAP[c.upper()]) # noqa: PERF401 pt_model = NeuralNetEinsum(lhs_shape[-1]).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -5895,7 +5895,7 @@ def find_input_node_type(model, arg): result = [] for node in model.graph.node: if arg in node.output: - result.append(node) + result.append(node) # noqa: PERF401 return result[0].op_type if len(result) == 1 else None gathergrad_input_optypes = [find_input_node_type(training_model, arg) for arg in gathergrad_node.input] diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py index bee38129f8a87..318de843efb8f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_triton.py @@ -136,7 +136,7 @@ def _torch_layer_norm(input, weight, bias, **kwargs): class TorchFuncExecutor: - _INFER_FUNC_MAP = { + _INFER_FUNC_MAP = { # noqa: RUF012 "Add": _torch_add, "Sub": _torch_sub, "Mul": _torch_mul, diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py index aae92b3245685..c5515f477d1fe 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py @@ -112,7 +112,7 @@ def optimizer_parameters(model): no_decay_param_group = [] for initializer in model.graph.initializer: if any(key in initializer.name for key in no_decay_keys): - no_decay_param_group.append(initializer.name) + no_decay_param_group.append(initializer.name) # noqa: PERF401 params = [ { "params": no_decay_param_group, @@ -134,7 +134,7 @@ def load_bert_onnx_model(): class CustomLossScaler(amp.LossScaler): - def __init__(self, loss_scale=float(1 << 16)): # noqa: B008 + def __init__(self, loss_scale=float(1 << 16)): super().__init__(loss_scale) self._initial_loss_scale = loss_scale self.loss_scale = loss_scale @@ -151,7 +151,7 @@ def update(self, train_step_info): class LegacyCustomLossScaler: - def __init__(self, loss_scale=float(1 << 16)): # noqa: B008 + def __init__(self, loss_scale=float(1 << 16)): self._initial_loss_scale = loss_scale self.loss_scale_ = loss_scale diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py index 0382cec990195..fa13625f0ddac 100644 --- 
a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py @@ -28,7 +28,7 @@ def get_model_opset(model_onnx): for op in model_onnx.opset_import: - if op.domain == "": # noqa: PLC1901 + if op.domain == "": return op.version return None @@ -390,10 +390,10 @@ def testOptimizerConfig(optim_name, lr, alpha, default_alpha): # 1:1 mapping between defaults and params's hyper parameters for param in params: - for k, _ in param.items(): + for k in param: if k != "params": assert k in cfg.defaults, "hyper parameter {k} not present in one of the parameter params" - for k, _ in cfg.defaults.items(): + for k in cfg.defaults: for param in cfg.params: assert k in param, "hyper parameter {k} not present in one of the parameter params" @@ -1039,7 +1039,7 @@ def testORTTrainerInternalUseContribOps(enable_onnx_contrib_ops): # Training loop data, targets = batcher_fn(train_data, 0) if not enable_onnx_contrib_ops and not pytorch_110: - with pytest.raises(Exception): + with pytest.raises(Exception): # noqa: B017 _, _ = trainer.train_step(data, targets) else: _, _ = trainer.train_step(data, targets) diff --git a/orttraining/pytorch_frontend_examples/mnist_training.py b/orttraining/pytorch_frontend_examples/mnist_training.py index 62de9c0f9c8c8..dc9b3f654400c 100644 --- a/orttraining/pytorch_frontend_examples/mnist_training.py +++ b/orttraining/pytorch_frontend_examples/mnist_training.py @@ -193,8 +193,6 @@ def main(): for epoch in range(1, args.epochs + 1): train_with_trainer(args, trainer, device, train_loader, epoch) - import pdb # noqa: F401 - test_with_trainer(args, trainer, device, test_loader) diff --git a/orttraining/tools/amdgpu/script/rocprof.py b/orttraining/tools/amdgpu/script/rocprof.py index e5b107ba285bf..a027ce4787943 100644 --- a/orttraining/tools/amdgpu/script/rocprof.py +++ b/orttraining/tools/amdgpu/script/rocprof.py @@ -15,7 +15,7 @@ def get_gpu_lines(path): reader = csv.reader(f, delimiter=",") for row in reader: if row[2].find("TotalDurationNs") < 0: - lines.append(row) + lines.append(row) # noqa: PERF401 return lines diff --git a/orttraining/tools/ci_test/compare_results.py b/orttraining/tools/ci_test/compare_results.py index 24854d6cf9c82..0ab0a1246a421 100644 --- a/orttraining/tools/ci_test/compare_results.py +++ b/orttraining/tools/ci_test/compare_results.py @@ -19,7 +19,7 @@ def eq(): def float_le(tolerance=None): actual_tolerance = 0.0 if tolerance is None else tolerance return Comparison( - name="less than or equal to" + (f" (tolerance: {str(actual_tolerance)})" if tolerance is not None else ""), + name="less than or equal to" + (f" (tolerance: {actual_tolerance!s})" if tolerance is not None else ""), fn=(lambda actual, expected: float(actual) <= float(expected) + actual_tolerance), ) diff --git a/orttraining/tools/scripts/gpt2_model_transform.py b/orttraining/tools/scripts/gpt2_model_transform.py index 06f03e06632b4..d7079591382a3 100644 --- a/orttraining/tools/scripts/gpt2_model_transform.py +++ b/orttraining/tools/scripts/gpt2_model_transform.py @@ -28,7 +28,7 @@ def find_input_node(model, arg): for node in model.graph.node: for output in node.output: if output == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else None @@ -37,7 +37,7 @@ def find_output_node(model, arg): for node in model.graph.node: for input in node.input: if input == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if 
len(result) == 1 else None @@ -136,7 +136,7 @@ def process_concat(model): assert reshape_node.op_type == "Reshape" new_nodes[get_node_index(model, reshape_node)] = shape for n in fuse_nodes: - delete_nodes.append(get_node_index(model, n)) + delete_nodes.append(get_node_index(model, n)) # noqa: PERF401 # insert new shape to reshape index = 0 @@ -189,7 +189,7 @@ def fix_transpose(model): for n in model.graph.node: for input in n.input: if input == weight.name: - result.append(n) + result.append(n) # noqa: PERF401 if len(result) > 1: continue perm = node.attribute[0] @@ -280,7 +280,7 @@ def remove_input_ids_check_subgraph(model): remove_node_index = [] for n in removed_nodes: - remove_node_index.append(get_node_index(model, n)) + remove_node_index.append(get_node_index(model, n)) # noqa: PERF401 remove_node_index = list(set(remove_node_index)) remove_node_index.sort(reverse=True) diff --git a/orttraining/tools/scripts/layer_norm_transform.py b/orttraining/tools/scripts/layer_norm_transform.py index 2ccc947a58832..c3948b6378098 100644 --- a/orttraining/tools/scripts/layer_norm_transform.py +++ b/orttraining/tools/scripts/layer_norm_transform.py @@ -141,10 +141,10 @@ def main(): all_nodes = [] for node in graph_proto.node: if node not in removed_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF401 for node in layer_norm_nodes: - all_nodes.append(node) + all_nodes.append(node) # noqa: PERF402 graph_proto.ClearField("node") graph_proto.node.extend(all_nodes) diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index 81e9f7b16be14..8ea2d5ab45315 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -26,7 +26,7 @@ def find_input_node(model, arg): for node in model.graph.node: for output in node.output: if output == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else None @@ -35,7 +35,7 @@ def find_output_node(model, arg): for node in model.graph.node: for input in node.input: if input == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else None @@ -94,7 +94,7 @@ def process_concat(model): if node.op_type == "Concat": input_nodes = [] for input in node.input: - input_nodes.append(find_input_node(model, input)) + input_nodes.append(find_input_node(model, input)) # noqa: PERF401 # figure out target shape shape = [] for input_node in input_nodes: @@ -116,7 +116,7 @@ def process_concat(model): assert reshape_node.op_type == "Reshape" new_nodes[get_node_index(model, reshape_node)] = shape for n in fuse_nodes: - delete_nodes.append(get_node_index(model, n)) + delete_nodes.append(get_node_index(model, n)) # noqa: PERF401 # insert new shape to reshape index = 0 for reshape_node_index in new_nodes: @@ -218,7 +218,7 @@ def fix_transpose(model): for n in model.graph.node: for input in n.input: if input == weight.name: - result.append(n) + result.append(n) # noqa: PERF401 if len(result) > 1: continue perm = node.attribute[0] @@ -242,7 +242,7 @@ def fix_transpose(model): old_ws = [] for t in transpose: if find_output_node(model, t[1].name) is None: - old_ws.append(find_weight_index(model, t[1].name)) + old_ws.append(find_weight_index(model, t[1].name)) # noqa: PERF401 old_ws.sort(reverse=True) for w_i in old_ws: del model.graph.initializer[w_i] diff --git a/orttraining/tools/scripts/opset12_model_transform.py b/orttraining/tools/scripts/opset12_model_transform.py index 
e8c2263a39c32..cda82b41b50bb 100644 --- a/orttraining/tools/scripts/opset12_model_transform.py +++ b/orttraining/tools/scripts/opset12_model_transform.py @@ -34,7 +34,7 @@ def find_input_node(model, arg): for node in model.graph.node: for output in node.output: if output == arg: - result.append(node) + result.append(node) # noqa: PERF401 return result[0] if len(result) == 1 else None diff --git a/orttraining/tools/scripts/performance_investigation.py b/orttraining/tools/scripts/performance_investigation.py index c8550a4d73c49..dfda008f6d3c4 100644 --- a/orttraining/tools/scripts/performance_investigation.py +++ b/orttraining/tools/scripts/performance_investigation.py @@ -30,11 +30,11 @@ def process_file(onnx_file): if node.op_type == "ATen": for attr in node.attribute: if attr.name == "operator": - aten_ops.append(f"{node.name}: {attr.s.decode('utf-8')}") + aten_ops.append(f"{node.name}: {attr.s.decode('utf-8')}") # noqa: PERF401 if node.op_type == "PythonOp": for attr in node.attribute: if attr.name == "name": - python_ops.append(f"{node.name}: {attr.s.decode('utf-8')}") + python_ops.append(f"{node.name}: {attr.s.decode('utf-8')}") # noqa: PERF401 # Look for stand-alone Dropout node in *_execution_model_.onnx graph. # Examine whether it should be fused with surrounding Add ops into BiasDropout node. diff --git a/orttraining/tools/scripts/pipeline_model_split.py b/orttraining/tools/scripts/pipeline_model_split.py index d1ae9dd22bf00..fb13463f1b623 100644 --- a/orttraining/tools/scripts/pipeline_model_split.py +++ b/orttraining/tools/scripts/pipeline_model_split.py @@ -49,7 +49,7 @@ def split_graph(model, split_edge_groups): element_types.append(1) for info in model.graph.value_info: if info.name == id: - output_shapes.append(info.type) + output_shapes.append(info.type) # noqa: PERF401 send_input_signal_name = "send_input_signal" + str(cut_index) send_signal = model.graph.input.add() @@ -279,14 +279,14 @@ def generate_subgraph(model, start_nodes, identity_node_list): # remove added identity node before copy to subgraph identity_node_index = [] for n in identity_node_list: - identity_node_index.append(get_identity_index_for_deleting(main_graph.graph.node, n)) + identity_node_index.append(get_identity_index_for_deleting(main_graph.graph.node, n)) # noqa: PERF401 identity_node_index.sort(reverse=True) for i in reversed(range(len(main_graph.graph.node))): try: if i in identity_node_index: del main_graph.graph.node[i] - except Exception: + except Exception: # noqa: PERF203 print("error deleting identity node", i) all_visited_nodes = [] @@ -316,19 +316,19 @@ def generate_subgraph(model, start_nodes, identity_node_list): # gather visited nodes visited_nodes = [] for n in visited0: - visited_nodes.append(get_index(main_graph.graph.node, n)) + visited_nodes.append(get_index(main_graph.graph.node, n)) # noqa: PERF401 visited_nodes.sort(reverse=True) # gather visited inputs visited_inputs = [] for n in inputs0: - visited_inputs.append(get_index(main_graph.graph.input, n)) + visited_inputs.append(get_index(main_graph.graph.input, n)) # noqa: PERF401 visited_inputs.sort(reverse=True) # gather visited outputs visited_outputs = [] for n in outputs0: - visited_outputs.append(get_index(main_graph.graph.output, n)) + visited_outputs.append(get_index(main_graph.graph.output, n)) # noqa: PERF401 visited_outputs.sort(reverse=True) for i in reversed(range(len(main_graph.graph.node))): @@ -337,7 +337,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.node[i] else: del 
main_graph.graph.node[i] - except Exception: + except Exception: # noqa: PERF203 print("error deleting node", i) for i in reversed(range(len(main_graph.graph.input))): @@ -346,7 +346,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.input[i] else: del main_graph.graph.input[i] - except Exception: + except Exception: # noqa: PERF203 print("error deleting inputs", i) for i in reversed(range(len(main_graph.graph.output))): @@ -355,7 +355,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.output[i] else: del main_graph.graph.output[i] - except Exception: + except Exception: # noqa: PERF203 print("error deleting outputs ", i) print("model", str(model_count), " length ", len(subgraph.graph.node)) diff --git a/pyproject.toml b/pyproject.toml index 34dd42e5f09f5..dde001a1761ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,20 +45,22 @@ reportMissingImports = false # NOTE: Do not create an exclude list. Edit .lintrunner.toml instead target-version = "py38" select = [ + "B", # flake8-bugbear "E", # pycodestyle "F", # Pyflakes - "W", # pycodestyle - "B", # flake8-bugbear - "N", # pep8-naming "ISC", # flake8-implicit-str-concat - "YTT", # flake8-2020 + "N", # pep8-naming + "NPY", # numpy + "PERF", # Perflint + "PLC", # pylint conventions + "PLE", # pylint errors + "PLW", # pylint warnings "RUF", # Ruff-specific rules "SIM", # flake8-simplify + "T10", # flake8-debugger "UP", # pyupgrade - "PLE", # pylint errors - "PLW", # pylint warnings - "PLC", # pylint conventions - "NPY", # numpy + "W", # pycodestyle + "YTT", # flake8-2020 ] # NOTE: Refrain from growing the ignore list unless for exceptional cases. # Always include a comment to explain why. diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index 7a19ee1edd86b..00bec6f91a271 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -1,9 +1,9 @@ # This file is auto updated by dependabot lintrunner-adapters>=0.8.0 # RUFF, RUFF-FIX -ruff==0.0.261 +ruff==0.0.278 # BLACK-ISORT -black==23.3.0 +black==23.7.0 isort==5.12.0 # PYLINT pylint==2.17.2 diff --git a/tools/android_custom_build/build_custom_android_package.py b/tools/android_custom_build/build_custom_android_package.py index 2e1d092aa0d7f..aa57cf341942c 100755 --- a/tools/android_custom_build/build_custom_android_package.py +++ b/tools/android_custom_build/build_custom_android_package.py @@ -120,7 +120,8 @@ def main(): "--file", str(SCRIPT_DIR / "Dockerfile"), *docker_build_image_args, - ] + [str(SCRIPT_DIR)] + str(SCRIPT_DIR), + ] run(docker_build_image_cmd) @@ -154,7 +155,10 @@ def main(): # enable use of Ctrl-C to stop when running interactively docker_run_interactive_args = ["-it"] if sys.stdin.isatty() else [] - docker_container_build_cmd = [args.docker_path, "run", *docker_run_interactive_args] + [ + docker_container_build_cmd = [ + args.docker_path, + "run", + *docker_run_interactive_args, f"--name={args.docker_container_name}" if args.docker_container_name is not None else "--rm", f"--volume={working_dir}:/workspace/shared", args.docker_image_tag, diff --git a/tools/ci_build/compile_triton.py b/tools/ci_build/compile_triton.py index eb476da058643..c1119aad49ae8 100644 --- a/tools/ci_build/compile_triton.py +++ b/tools/ci_build/compile_triton.py @@ -102,7 +102,7 @@ def convert_and_save(metadata, header_file, out_dir, out_obj_file): # convert constants constants = [] for k, v in m["constants"].items(): - constants.append(f'{{ "{k}", {str(v)}}}') + constants.append(f'{{ 
"{k}", {v!s}}}') meta_ele.append(f"{{ { ', '.join(constants) } }}") c_metadata.append(f"{{ { ', '.join(meta_ele) } }}") diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index 2bab9e61c7b06..2ce1764c96327 100755 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -86,7 +86,7 @@ def main(): manylinux_build_scripts_folder = Path(args.manylinux_src) / "docker" / "build_scripts" dest = Path(args.context) / "build_scripts" if dest.exists(): - log.info(f"Deleting: {str(dest)}") + log.info(f"Deleting: {dest!s}") shutil.rmtree(str(dest)) shutil.copytree(str(manylinux_build_scripts_folder), str(dest)) src_entrypoint_file = str(Path(args.manylinux_src) / "docker" / "manylinux-entrypoint") diff --git a/tools/ci_build/github/js/validate-npm-packages.py b/tools/ci_build/github/js/validate-npm-packages.py index f2118e825e8e9..b009330764973 100644 --- a/tools/ci_build/github/js/validate-npm-packages.py +++ b/tools/ci_build/github/js/validate-npm-packages.py @@ -110,7 +110,7 @@ print("====== output environment variables ======") print(f"##vso[task.setvariable variable=ORT_COMMON_FROM]{ort_common_from}") -if tag == "latest" or tag == "" or tag == "rc": # noqa: PLC1901 +if tag == "latest" or tag == "" or tag == "rc": if not RELEASE_NODE or not RELEASE_WEB or not RELEASE_REACT_NATIVE: raise Exception("@latest or @rc build must release all packages (node, web, react-native)") if count_ort_node_common_tgz != 1: @@ -137,7 +137,7 @@ print(f"ort_web_ver={ort_web_ver}") print(f"ort_react_native_ver={ort_react_native_ver}") -if tag == "latest" or tag == "": # noqa: PLC1901 +if tag == "latest" or tag == "": print("Publishing @latest ...") if not source_branch.startswith("refs/heads/rel-"): raise Exception('@latest build must publish from source branch "refs/heads/rel-*"') @@ -164,5 +164,5 @@ and "+" not in ort_web_ver.replace("-rev", "") and "+" not in ort_react_native_ver.replace("-rev", "") ): - if tag != "latest" and tag != "": # noqa: PLC1901 + if tag != "latest" and tag != "": raise Exception("default version without decorator can only be published in @latest tag") diff --git a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py index acca4fb13c45a..3cb74ac2d271a 100644 --- a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py +++ b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py @@ -69,7 +69,7 @@ def write_to_db(binary_size_data, args): branch_name = os.environ.get("BUILD_SOURCEBRANCHNAME", "main") rows = [] for row in binary_size_data: - rows.append( + rows.append( # noqa: PERF401 [ now_str, args.build_id, diff --git a/tools/ci_build/patch_manylinux.py b/tools/ci_build/patch_manylinux.py index 525f2b2f30c1e..0d1cb37cc40ac 100644 --- a/tools/ci_build/patch_manylinux.py +++ b/tools/ci_build/patch_manylinux.py @@ -41,7 +41,7 @@ def main(): manylinux_build_scripts_folder = Path(args.manylinux_src) / "docker" / "build_scripts" dest = Path(args.context) / "build_scripts" if dest.exists(): - log.info(f"Deleting: {str(dest)}") + log.info(f"Deleting: {dest!s}") shutil.rmtree(str(dest)) shutil.copytree(str(manylinux_build_scripts_folder), str(dest)) diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py index cc64775ae158d..09ff7c49552fc 100644 --- a/tools/doc/rename_folders.py +++ b/tools/doc/rename_folders.py @@ -16,7 +16,7 @@ def rename_folder(root): for r, dirs, _files in os.walk(root): for name in dirs: if name.startswith("_"): 
- found.append((r, name)) + found.append((r, name)) # noqa: PERF401 renamed = [] for r, name in found: into = name.lstrip("_") diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index e6d47597697d6..a4e00b92823cd 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -87,7 +87,9 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, if child.name == "onnxruntime-android" or child.name == "onnxruntime-training-android": for child_file in child.iterdir(): if child_file.suffix in [".aar"]: - files_list.append('') + files_list.append( # noqa: PERF401 + '' + ) if child.name == "onnxruntime-ios-xcframework": files_list.append('') # noqa: ISC001 @@ -720,7 +722,7 @@ def generate_files(line_list, args): ngraph_list_path = os.path.join(openvino_path, "deployment_tools\\ngraph\\lib\\") for ngraph_element in os.listdir(ngraph_list_path): if ngraph_element.endswith("dll"): - files_list.append( + files_list.append( # noqa: PERF401 " Text def should_render_domain(domain, domain_filter): # type: (Text) -> bool - if domain in (ONNX_DOMAIN, ONNX_ML_DOMAIN) or domain == "" or domain == "ai.onnx.ml": # noqa: PLC1901 + if domain in (ONNX_DOMAIN, ONNX_ML_DOMAIN) or domain == "" or domain == "ai.onnx.ml": return False if domain_filter and domain not in domain_filter: diff --git a/tools/python/gen_opkernel_doc.py b/tools/python/gen_opkernel_doc.py index 1075ed8192fdd..2d0d16cf9a0de 100644 --- a/tools/python/gen_opkernel_doc.py +++ b/tools/python/gen_opkernel_doc.py @@ -150,7 +150,7 @@ def main(output_path: pathlib.Path, provider_filter: [str]): tnameindex += 1 tclist = [] for tc in sorted(tcset): - tclist.append(tc) + tclist.append(tc) # noqa: PERF402 fout.write("**" + tname + "** = " + format_type_constraints(tclist)) if tnameindex < len(typemap): fout.write("
") diff --git a/tools/python/onnx2tfevents.py b/tools/python/onnx2tfevents.py index adf6ded4b56b3..cf49db7b876f9 100644 --- a/tools/python/onnx2tfevents.py +++ b/tools/python/onnx2tfevents.py @@ -117,7 +117,7 @@ def _add_io_node(node, type): for node in graph.node: _attr = [] for s in node.attribute: - _attr.append(" = ".join([str(f[1]) for f in s.ListFields()])) + _attr.append(" = ".join([str(f[1]) for f in s.ListFields()])) # noqa: PERF401 attr = ", ".join(_attr).encode(encoding="utf_8") shape_proto = None elem_type = 0 @@ -154,7 +154,7 @@ class TransformerBase(ABC): the dependency between it and existing transformers. """ - _TRANSFORMERS = [] + _TRANSFORMERS = [] # noqa: RUF012 @classmethod def register_transformer(cls, klass): @@ -328,10 +328,10 @@ def transform(self, graph: GraphProto) -> None: if len([output for output in node.output if len(output) > 0]) > 1: idx = self.ops.get(node.op_type, 0) self.ops[node.op_type] = idx + 1 - new_output = f"{get_prefix(node.output[0])}{node.op_type}_{str(idx)}_output" + new_output = f"{get_prefix(node.output[0])}{node.op_type}_{idx!s}_output" for output in node.output: if len(output) > 0: - new_nodes.append(helper.make_node("ListUnpack", [new_output], [output])) + new_nodes.append(helper.make_node("ListUnpack", [new_output], [output])) # noqa: PERF401 node.ClearField("output") node.output.extend([new_output]) if len(new_nodes) > 0: diff --git a/tools/python/ort_test_dir_utils.py b/tools/python/ort_test_dir_utils.py index 5cb01267300eb..2fc4921a7bb67 100644 --- a/tools/python/ort_test_dir_utils.py +++ b/tools/python/ort_test_dir_utils.py @@ -239,7 +239,7 @@ def run_test_dir(model_or_dir): output_names = list(expected_outputs.keys()) # handle case where there's a single expected output file but no name in it (empty string for name) # e.g. ONNX test models 20190729\opset8\tf_mobilenet_v2_1.4_224 - if len(output_names) == 1 and output_names[0] == "": # noqa: PLC1901 + if len(output_names) == 1 and output_names[0] == "": output_names = [o.name for o in sess.get_outputs()] assert len(output_names) == 1, "There should be single output_name." expected_outputs[output_names[0]] = expected_outputs[""] diff --git a/tools/python/util/convert_onnx_models_to_ort.py b/tools/python/util/convert_onnx_models_to_ort.py index 18bba78661796..d8329fca3c7bd 100644 --- a/tools/python/util/convert_onnx_models_to_ort.py +++ b/tools/python/util/convert_onnx_models_to_ort.py @@ -165,7 +165,7 @@ def is_model_file_to_convert(file_path: pathlib.Path): # new_size = os.path.getsize(ort_target_path) # print("Serialized {} to {}. 
Sizes: orig={} new={} diff={} new:old={:.4f}:1.0".format( # onnx_target_path, ort_target_path, orig_size, new_size, new_size - orig_size, new_size / orig_size)) - except Exception as e: + except Exception as e: # noqa: PERF203 print(f"Error converting {model}: {e}") if not allow_conversion_failures: raise diff --git a/tools/python/util/mobile_helpers/usability_checker.py b/tools/python/util/mobile_helpers/usability_checker.py index e93b4bd6986e0..f8b0bfe707ead 100644 --- a/tools/python/util/mobile_helpers/usability_checker.py +++ b/tools/python/util/mobile_helpers/usability_checker.py @@ -8,6 +8,7 @@ import tempfile from collections import deque from enum import IntEnum +from typing import Optional import onnx @@ -188,7 +189,7 @@ def check_partitioning( graph: onnx.GraphProto, supported_ops_checker: _SupportedOpsChecker, require_fixed_input_sizes: bool = False, - value_info: dict = None, + value_info: Optional[dict] = None, ): """ Estimate the partitions the graph will be split into for nodes that is_node_supported_fn returns true for. @@ -356,13 +357,13 @@ def close_group(): return info -def _check_ep_partitioning(model, supported_ops_config, value_info: dict = None): +def _check_ep_partitioning(model, supported_ops_config, value_info: Optional[dict] = None): supported_ops = _SupportedOpsChecker(supported_ops_config) partition_info = check_partitioning(model.graph, supported_ops, value_info is not None, value_info) return partition_info -def check_nnapi_partitions(model, value_info: dict = None): +def check_nnapi_partitions(model, value_info: Optional[dict] = None): # if we're running in the ORT python package the file should be local. otherwise assume we're running from the # ORT repo script_dir = pathlib.Path(__file__).parent @@ -376,7 +377,7 @@ def check_nnapi_partitions(model, value_info: dict = None): return _check_ep_partitioning(model, config_path, value_info) -def check_coreml_partitions(model, value_info: dict = None): +def check_coreml_partitions(model, value_info: Optional[dict] = None): # if we're running in the ORT python package the file should be local. otherwise assume we're running from the # ORT repo script_dir = pathlib.Path(__file__).parent @@ -390,7 +391,7 @@ def check_coreml_partitions(model, value_info: dict = None): return _check_ep_partitioning(model, config_path, value_info) -def check_shapes(graph: onnx.GraphProto, logger: logging.Logger = None): +def check_shapes(graph: onnx.GraphProto, logger: Optional[logging.Logger] = None): """ Check the shapes of graph inputs, values and graph outputs to determine if they have static or dynamic sizes. NNAPI and CoreML do not support dynamically sized values. @@ -522,7 +523,7 @@ def check_ep(ep_name, checker_func): return nnapi_suitability != PartitioningInfo.TryWithEP.NO or coreml_suitability != PartitioningInfo.TryWithEP.NO -def analyze_model(model_path: pathlib.Path, skip_optimize: bool = False, logger: logging.Logger = None): +def analyze_model(model_path: pathlib.Path, skip_optimize: bool = False, logger: Optional[logging.Logger] = None): """ Analyze the provided model to determine if it's likely to work well with the NNAPI or CoreML Execution Providers :param model_path: Model to analyze. 
diff --git a/tools/python/util/onnx_model_utils.py b/tools/python/util/onnx_model_utils.py index d2205385522e8..e662d1623f8bd 100644 --- a/tools/python/util/onnx_model_utils.py +++ b/tools/python/util/onnx_model_utils.py @@ -3,6 +3,7 @@ import logging import pathlib +from typing import Optional import onnx from onnx import version_converter @@ -59,7 +60,10 @@ def get_opsets_imported(model: onnx.ModelProto): def update_onnx_opset( - model_path: pathlib.Path, opset: int, out_path: pathlib.Path = None, logger: logging.Logger = None + model_path: pathlib.Path, + opset: int, + out_path: Optional[pathlib.Path] = None, + logger: Optional[logging.Logger] = None, ): """ Helper to update the opset of a model using onnx version_converter. Target opset must be greater than current opset. diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py index f38bdeae75974..08966f3d7bbb5 100644 --- a/tools/python/util/ort_format_model/operator_type_usage_processors.py +++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py @@ -205,7 +205,7 @@ def get_cpp_entry(self): domain = _ort_constant_for_domain(self.domain) for i in sorted(self._input_types.keys()): if self._input_types[i]: - entries.append( + entries.append( # noqa: PERF401 "ORT_SPECIFY_OP_KERNEL_ARG_ALLOWED_TYPES({}, {}, Input, {}, {});".format( domain, self.optype, i, ", ".join(sorted(self._input_types[i])) ) @@ -213,7 +213,7 @@ def get_cpp_entry(self): for o in sorted(self._output_types.keys()): if self._output_types[o]: - entries.append( + entries.append( # noqa: PERF401 "ORT_SPECIFY_OP_KERNEL_ARG_ALLOWED_TYPES({}, {}, Output, {}, {});".format( domain, self.optype, o, ", ".join(sorted(self._output_types[o])) ) @@ -637,7 +637,7 @@ class GloballyAllowedTypesOpTypeImplFilter(OpTypeImplFilterInterface): Operator implementation filter which uses globally allowed types. """ - _valid_allowed_types = set(FbsTypeInfo.tensordatatype_to_string.values()) + _valid_allowed_types = set(FbsTypeInfo.tensordatatype_to_string.values()) # noqa: RUF012 def __init__(self, globally_allowed_types: typing.Set[str]): self._operator_processors = _create_operator_type_usage_processors() diff --git a/tools/python/util/ort_format_model/types.py b/tools/python/util/ort_format_model/types.py index 5d4f76969e5ea..ffeda6b2e7607 100644 --- a/tools/python/util/ort_format_model/types.py +++ b/tools/python/util/ort_format_model/types.py @@ -6,7 +6,7 @@ class FbsTypeInfo: "Class to provide conversion between ORT flatbuffers schema values and C++ types" - tensordatatype_to_string = { + tensordatatype_to_string = { # noqa: RUF012 fbs.TensorDataType.TensorDataType.FLOAT: "float", fbs.TensorDataType.TensorDataType.UINT8: "uint8_t", fbs.TensorDataType.TensorDataType.INT8: "int8_t", From 193415a1625abc8f955771c7cd8dd33650e93d4e Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Sat, 22 Jul 2023 04:13:01 +0800 Subject: [PATCH 18/34] [js/webgpu] reuse buffer for GpuDataManager (#16746) ### Description Allocating new GPUBuffer in every session.run is not efficient. We should make it only happen in the first run. In the following runs, we should try to reuse those buffers. ### Motivation and Context - This PR is for performance. See mobilenetv2 becomes 9.58 ms from 12.9 ms. 
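For illustration, here is a minimal, self-contained sketch of the size-bucketed free-list idea described above. The names (`BufferPool`, `PooledBuffer`, `acquire`, `refresh`, `dispose`, `createBuffer`) are hypothetical stand-ins, not the actual `GpuDataManager` API; the real change below keys its `freeBuffers` map by the normalized buffer size in much the same way.

```ts
// Hypothetical sketch of the reuse strategy (not the actual GpuDataManager code):
// a pool that hands out buffers bucketed by size and recycles them between runs.

interface PooledBuffer {
  size: number;      // byte size, used as the bucket key
  destroy(): void;   // releases the underlying GPU memory
}

class BufferPool {
  // released buffers waiting to be reused, keyed by size
  private freeBuffers = new Map<number, PooledBuffer[]>();
  // buffers handed out during the current run
  private pending: PooledBuffer[] = [];

  constructor(private createBuffer: (size: number) => PooledBuffer) {}

  // Reuse a free buffer of the same size if one exists; otherwise allocate a new one.
  acquire(size: number): PooledBuffer {
    const bucket = this.freeBuffers.get(size);
    const buffer = bucket && bucket.length > 0 ? bucket.pop()! : this.createBuffer(size);
    this.pending.push(buffer);
    return buffer;
  }

  // After a run finishes, park pending buffers in the free list instead of destroying them.
  refresh(): void {
    for (const buffer of this.pending) {
      let bucket = this.freeBuffers.get(buffer.size);
      if (!bucket) {
        bucket = [];
        this.freeBuffers.set(buffer.size, bucket);
      }
      bucket.push(buffer);
    }
    this.pending = [];
  }

  // When the session is released, actually free everything.
  dispose(): void {
    for (const bucket of this.freeBuffers.values()) {
      for (const buffer of bucket) {
        buffer.destroy();
      }
    }
    this.freeBuffers.clear();
  }
}

// Usage: the second run reuses the buffer allocated in the first run.
const pool = new BufferPool(size => ({ size, destroy: () => { /* free GPU memory */ } }));
const first = pool.acquire(1024);
pool.refresh();
const second = pool.acquire(1024);
console.log(first === second); // true: no new allocation on the second run
```

The trade-off is that recycled buffers are held until `dispose()` rather than being returned to the GPU immediately; upload staging buffers are not pooled in the actual change since they are only needed at session creation time.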
--- .../lib/wasm/jsep/webgpu/gpu-data-manager.ts | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts index 526ae68ac916b..784b9a1d54e3e 100644 --- a/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts +++ b/js/web/lib/wasm/jsep/webgpu/gpu-data-manager.ts @@ -45,6 +45,11 @@ export interface GpuDataManager { * actually released. */ refreshPendingBuffers(): void; + + /** + * destroy all gpu buffers. Call this when the session.release is called. + */ + dispose(): void; } interface StorageCacheValue { @@ -76,9 +81,12 @@ class GpuDataManagerImpl implements GpuDataManager { // pending buffers for computing private buffersPending: GPUBuffer[]; - constructor(private backend: WebGpuBackend /* , private reuseBuffer: boolean */) { + private freeBuffers: Map; + + constructor(private backend: WebGpuBackend) { this.storageCache = new Map(); this.downloadCache = new Map(); + this.freeBuffers = new Map(); this.buffersForUploadingPending = []; this.buffersPending = []; } @@ -144,15 +152,20 @@ class GpuDataManagerImpl implements GpuDataManager { // eslint-disable-next-line no-bitwise create(size: number, usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST): GpuData { - // !!! - // !!! IMPORTANT: TODO: whether we should keep the storage buffer every time, or always create new ones. - // !!! This need to be figured out by performance test results. - // !!! - const bufferSize = calcNormalizedBufferSize(size); - // create gpu buffer - const gpuBuffer = this.backend.device.createBuffer({size: bufferSize, usage}); + let gpuBuffer; + let buffers = this.freeBuffers.get(bufferSize); + if (!buffers) { + buffers = []; + this.freeBuffers.set(bufferSize, buffers); + } + if (buffers.length > 0) { + gpuBuffer = buffers.pop() as GPUBuffer; + } else { + // create gpu buffer + gpuBuffer = this.backend.device.createBuffer({size: bufferSize, usage}); + } const gpuData = {id: createNewGpuDataId(), type: GpuDataType.default, buffer: gpuBuffer}; this.storageCache.set(gpuData.id, {gpuData, originalSize: size}); @@ -223,11 +236,31 @@ class GpuDataManagerImpl implements GpuDataManager { refreshPendingBuffers(): void { for (const buffer of this.buffersForUploadingPending) { + // upload buffer is only useful in the session creation time. So we don't need to reuse them in session running. buffer.destroy(); } + this.buffersForUploadingPending = []; for (const buffer of this.buffersPending) { - buffer.destroy(); + // Put the pending buffer to freeBuffers list instead of really destroying it for buffer reusing. + this.freeBuffers.get(buffer.size)!.push(buffer); } + this.buffersPending = []; + } + + dispose() { + this.freeBuffers.forEach((buffers) => { + buffers.forEach(buffer => { + buffer.destroy(); + }); + }); + + this.storageCache.forEach((storage) => { + storage.gpuData.buffer.destroy(); + }); + + this.storageCache = new Map(); + this.downloadCache = new Map(); + this.freeBuffers = new Map(); } } From 210d29b40e3eb23b47751f3b4809616a9d825c33 Mon Sep 17 00:00:00 2001 From: Arthur Islamov Date: Sat, 22 Jul 2023 01:21:37 +0400 Subject: [PATCH 19/34] Allow --build_wasm on a mac system (#16761) ### Description Changes allow downloading prebuilt protoc compiler when building WebAssebly version on mac systems. 
Otherwise it tries to build a js/wasm version of protoc and throws an error while executing it: "protoc.js permission denied" ### Motivation and Context I need to switch between my main working computer and a PC to make changes to WebAssebly build. Would like not to do that anymore. --- cmake/deps.txt | 1 + cmake/external/onnxruntime_external_deps.cmake | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 63bdf21332e69..7f8bbd377d4a7 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -32,6 +32,7 @@ protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.1 protoc_linux_x64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-linux-x86_64.zip;338462004aa5be9fba45b35b5b4be43f69b47a90 protoc_linux_x86;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-linux-x86_32.zip;61fdbe7d6360e065ec6fea23bca2cca673115fb8 protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-linux-aarch_64.zip;df9d45470b0b8cf939dd2f0ec6b88e9cafc4d617 +protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/1787867f6183f056420e532eec640cba25efafea.zip;e43e80781560c5ab404a4da20f34d846f5f5d101 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13 diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 4cb3182c6301e..b30a935952ca8 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -107,7 +107,7 @@ FetchContent_Declare( ) # Download a protoc binary from Internet if needed -if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE AND NOT CMAKE_OSX_ARCHITECTURES) +if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE) # This part of code is only for users' convenience. The code couldn't handle all cases. Users always can manually # download protoc from Protobuf's Github release page and pass the local path to the ONNX_CUSTOM_PROTOC_EXECUTABLE # variable. 
@@ -141,6 +141,14 @@ if(CMAKE_CROSSCOMPILING AND NOT ONNX_CUSTOM_PROTOC_EXECUTABLE AND NOT CMAKE_OSX_ set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) endif() + elseif (CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND CMAKE_HOST_SYSTEM_NAME STREQUAL "Darwin") + FetchContent_Declare(protoc_binary URL ${DEP_URL_protoc_mac_universal} URL_HASH SHA1=${DEP_SHA1_protoc_mac_universal}) + FetchContent_Populate(protoc_binary) + if(protoc_binary_SOURCE_DIR) + message("Use prebuilt protoc") + set(ONNX_CUSTOM_PROTOC_EXECUTABLE ${protoc_binary_SOURCE_DIR}/bin/protoc) + set(PROTOC_EXECUTABLE ${ONNX_CUSTOM_PROTOC_EXECUTABLE}) + endif() endif() endif() From 488544b79ac7eb12672f5eca3181c9ff8d709828 Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Fri, 21 Jul 2023 21:48:36 -0400 Subject: [PATCH 20/34] [MIGraphX EP] Fix CopyTensorAsync and add guards for stream sync CopyTensors (#16787) Add compile guards to gate functionality based on MIGRAPHX_STREAM_SYNC for adding the following - remove excess hipStreamSyncronize to nullstream on CopyTensor calls - Add proper call for stream synchronized CopyTensorAsync for DeviceToHost case Without this change subsequent CopyTensorAsync() calls will fail for cards that don't use pinned memory thus causing hipMemcpy() calls to occur before certain kernel operations occur. ![image](https://github.com/microsoft/onnxruntime/assets/107195283/4915c18a-fb2d-40c9-a50e-a7c6613c324b) becomes ![image](https://github.com/microsoft/onnxruntime/assets/107195283/f661acf4-e2af-4c9a-b26a-30fca339cf1d) --------- Co-authored-by: Ted Themistokleous --- onnxruntime/core/providers/migraphx/gpu_data_transfer.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc index 1a5c3c6d912fd..72193ef6268c1 100644 --- a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc @@ -24,17 +24,14 @@ common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const // Copy only if the two addresses are different. if (dst_data != src_data) { HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToDevice)); - HIP_CALL_THROW(hipStreamSynchronize(nullptr)); } } else { // copy from other CPU memory to GPU, this is blocking HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); - HIP_CALL_THROW(hipStreamSynchronize(nullptr)); // TODO: still need stream sync? since already blocking } } else if (src_device.Type() == OrtDevice::GPU) { // copying from GPU to CPU memory, this is blocking HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); - HIP_CALL_THROW(hipStreamSynchronize(nullptr)); // TODO: still need stream sync? 
since already blocking } else { // copying between cpu memory memcpy(dst_data, src_data, bytes); @@ -63,6 +60,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); } } else if (src_device.Type() == OrtDevice::GPU) { +#ifndef MIGRAPHX_STREAM_SYNC if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) { // copying from GPU to pinned memory, this is non-blocking HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast(stream.GetHandle()))); @@ -70,6 +68,9 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, // copying from GPU to CPU memory, this is blocking HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); } +#else + HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast(stream.GetHandle()))); +#endif } else { // copying between cpu memory memcpy(dst_data, src_data, bytes); From dafe11839eb740cb6a8e169286c9a5aa8f9b9a12 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 22 Jul 2023 13:36:38 -0700 Subject: [PATCH 21/34] Bump word-wrap from 1.2.3 to 1.2.4 in /js (#16754) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [word-wrap](https://github.com/jonschlinkert/word-wrap) from 1.2.3 to 1.2.4.

Release notes (1.2.4), sourced from word-wrap's releases:

What's Changed / New Contributors
Full Changelog: https://github.com/jonschlinkert/word-wrap/compare/1.2.3...1.2.4

Commits
  • f64b188 run verb to generate README
  • 03ea082 Merge pull request #42 from jonschlinkert/chore/publish-workflow
  • 420dce9 Merge pull request #41 from jonschlinkert/fix/CVE-2023-26115-2
  • bfa694e Update .github/workflows/publish.yml
  • ace0b3c chore: bump version to 1.2.4
  • 6fd7275 chore: add publish workflow
  • 30d6daf chore: fix test
  • 655929c chore: remove package-lock
  • 49e08bb chore: added an additional testcase
  • 9f62693 fix: cve 2023-26115
  • Additional commits viewable in compare view

Dependabot will merge this PR once it's up-to-date and CI passes on it, as requested by @fs-eire.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package-lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/js/package-lock.json b/js/package-lock.json index deb97d1a076c7..16e415a889831 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -5603,9 +5603,9 @@ "dev": true }, "node_modules/word-wrap": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", - "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==", + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.4.tgz", + "integrity": "sha512-2V81OA4ugVo5pRo46hAoD2ivUJx8jXmWXfUkY4KFNw0hEptvN0QfH3K4nHiwzGeKl5rFKedV48QVoqYavy4YpA==", "dev": true, "engines": { "node": ">=0.10.0" @@ -9895,9 +9895,9 @@ "dev": true }, "word-wrap": { - "version": "1.2.3", - "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz", - "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==", + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.4.tgz", + "integrity": "sha512-2V81OA4ugVo5pRo46hAoD2ivUJx8jXmWXfUkY4KFNw0hEptvN0QfH3K4nHiwzGeKl5rFKedV48QVoqYavy4YpA==", "dev": true }, "worker-loader": { From b92f02ad488acac1ad1ee7bf7ba870c93fffc74a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 22 Jul 2023 13:36:49 -0700 Subject: [PATCH 22/34] Bump word-wrap from 1.2.3 to 1.2.4 in /js/react_native (#16755) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [word-wrap](https://github.com/jonschlinkert/word-wrap) from 1.2.3 to 1.2.4.
Release notes (1.2.4), sourced from word-wrap's releases:

What's Changed / New Contributors
Full Changelog: https://github.com/jonschlinkert/word-wrap/compare/1.2.3...1.2.4

Commits
  • f64b188 run verb to generate README
  • 03ea082 Merge pull request #42 from jonschlinkert/chore/publish-workflow
  • 420dce9 Merge pull request #41 from jonschlinkert/fix/CVE-2023-26115-2
  • bfa694e Update .github/workflows/publish.yml
  • ace0b3c chore: bump version to 1.2.4
  • 6fd7275 chore: add publish workflow
  • 30d6daf chore: fix test
  • 655929c chore: remove package-lock
  • 49e08bb chore: added an additional testcase
  • 9f62693 fix: cve 2023-26115
  • Additional commits viewable in compare view

Dependabot will merge this PR once CI passes on it, as requested by @fs-eire.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/react_native/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock index 29e8e007f2406..f93f795557821 100644 --- a/js/react_native/yarn.lock +++ b/js/react_native/yarn.lock @@ -6640,9 +6640,9 @@ which@^2.0.1, which@^2.0.2: isexe "^2.0.0" word-wrap@~1.2.3: - version "1.2.3" - resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.3.tgz#610636f6b1f703891bd34771ccb17fb93b47079c" - integrity sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ== + version "1.2.4" + resolved "https://registry.yarnpkg.com/word-wrap/-/word-wrap-1.2.4.tgz#cb4b50ec9aca570abd1f52f33cd45b6c61739a9f" + integrity sha512-2V81OA4ugVo5pRo46hAoD2ivUJx8jXmWXfUkY4KFNw0hEptvN0QfH3K4nHiwzGeKl5rFKedV48QVoqYavy4YpA== wrap-ansi@^6.2.0: version "6.2.0" From 3252ff2cb7e9e29d400f4c3ed0fb123f2806e7df Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Sun, 23 Jul 2023 10:07:21 +0800 Subject: [PATCH 23/34] Change DML GPU pool in Windows GPU workflow use Visual Studio 2022 (#16784) ### Description 1. use the pool with VS2022 2. upgrade System.Memory to 4.5.5 ### Motivation and Context Solve the build error while using VS2022: `[Failure] Msbuild failed when processing the file 'D:\a\_work\1\s\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' with message: Method not found: 'System.ReadOnlySpan`1 Microsoft.IO.Path.GetFileName(System.ReadOnlySpan`1)'` Ref: https://stackoverflow.com/questions/73399777/azure-build-failing-due-to-method-not-found-system-readonlyspan1char-micros --- ...rosoft.ML.OnnxRuntime.InferenceSample.Forms.Android.csproj | 4 ++-- .../Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS.csproj | 4 ++-- .../Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj | 4 ++-- .../Microsoft.ML.OnnxRuntime.Tests.Droid.csproj | 4 ++-- .../Microsoft.ML.OnnxRuntime.Tests.iOS.csproj | 4 ++-- tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android.csproj b/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android.csproj index ec9e60710fedd..5fa0349e0f9a3 100644 --- a/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android.csproj +++ b/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.Android.csproj @@ -100,7 +100,7 @@
- + libs\arm64-v8a\libonnxruntime.so @@ -115,4 +115,4 @@ - \ No newline at end of file + diff --git a/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS.csproj b/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS.csproj index ce261096c1a00..1b50a2842f242 100644 --- a/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS.csproj +++ b/csharp/sample/InferenceSample/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS/Microsoft.ML.OnnxRuntime.InferenceSample.Forms.iOS.csproj @@ -147,7 +147,7 @@ - + Framework True @@ -160,4 +160,4 @@ - \ No newline at end of file + diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 3c9f5cf6743a7..29ccf55f081d5 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -331,13 +331,13 @@ - + - + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj index 020a4745e260f..11855032584a3 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Droid/Microsoft.ML.OnnxRuntime.Tests.Droid.csproj @@ -151,7 +151,7 @@ - + libs\arm64-v8a\libonnxruntime.so @@ -173,4 +173,4 @@ --> - \ No newline at end of file + diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj index 6df666f25cd4f..352de5db00920 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.iOS/Microsoft.ML.OnnxRuntime.Tests.iOS.csproj @@ -103,7 +103,7 @@ 5.0.0.2083 - + 0.22.2 @@ -205,4 +205,4 @@ - \ No newline at end of file + diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index ab0a8f8f00660..6f4386df60b24 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -69,7 +69,7 @@ stages: - stage: dml dependsOn: [] jobs: - - template: templates/win-ci-vs-2019.yml + - template: templates/jobs/win-ci-vs-2022-job.yml parameters: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env.bat @@ -82,7 +82,7 @@ stages: RunStaticCodeAnalysis: false ORT_EP_NAME: DML WITH_CACHE: true - MachinePool: onnxruntime-Win2019-GPU-dml-A10 + MachinePool: onnxruntime-Win2022-GPU-dml-A10 - stage: kernelDocumentation dependsOn: [] From 40277b7f372986e3ab280ea199b6fb5a21b502e4 Mon Sep 17 00:00:00 2001 From: pengwa Date: Sun, 23 Jul 2023 15:02:09 +0800 Subject: [PATCH 24/34] Fix orttraining-linux-gpu-ci-pipeline - LargeSizeTensorUInt64Index tests (#16820) ### Disable large index tests due to limited GPU mem Recently following two tests fail due to GPU mem not enough, not sure what else program running using GPU as well. So disable them for now to unblock the required CI. 
``` 1: [ FAILED ] 2 tests, listed below: 1: [ FAILED ] CrossEntropyTest.SoftmaxCrossEntropyLossInternal_LargeSizeTensorUInt64Index 1: [ FAILED ] CrossEntropyTest.SoftmaxCrossEntropyLossInternalGrad_LargeSizeTensorUInt64Index 2023-07-23T02:15:39.7559251Z 1: [ RUN ] CrossEntropyTest.SoftmaxCrossEntropyLossInternal_LargeSizeTensorUInt64Index 2023-07-23T02:16:53.0904576Z 1: 2023-07-23 02:16:53.089586592 [E:onnxruntime:SoftmaxCrossEntropyLossInternal, sequential_executor.cc:514 ExecuteKernel] Non-zero status code returned while running SoftmaxCrossEntropyLossInternal node. Name:'node1' Status Message: /onnxruntime_src/onnxruntime/core/framework/bfc_arena.cc:376 void* **onnxruntime::BFCArena::AllocateRawInternal(size_t, bool, onnxruntime::Stream*, bool, onnxruntime::WaitNotificationFn) Failed to allocate memory for requested buffer of size 4294973440** 2023-07-23T02:16:53.0905775Z 1: 2023-07-23T02:16:53.0906087Z 1: /onnxruntime_src/onnxruntime/test/providers/base_tester.cc:323: Failure 2023-07-23T02:16:53.0906698Z 1: Expected equality of these values: 2023-07-23T02:16:53.0907086Z 1: expect_result 2023-07-23T02:16:53.0907564Z 1: Which is: 4-byte object <00-00 00-00> 2023-07-23T02:16:53.0973055Z 1: ExpectResult::kExpectFailure 2023-07-23T02:16:53.0973984Z 1: Which is: 4-byte object <01-00 00-00> 2023-07-23T02:16:53.0975375Z 1: Run failed but expected success: Non-zero status code returned while running SoftmaxCrossEntropyLossInternal node. Name:'node1' Status Message: /onnxruntime_src/onnxruntime/core/framework/bfc_arena.cc:376 void* onnxruntime::BFCArena::AllocateRawInternal(size_t, bool, onnxruntime::Stream*, bool, onnxruntime::WaitNotificationFn) Failed to allocate memory for requested buffer of size 4294973440 2023-07-23T02:16:53.0976198Z 1: 2023-07-23T02:16:53.0976483Z 1: Google Test trace: 2023-07-23T02:16:53.0976818Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 8910 2023-07-23T02:16:53.0977229Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 8910 2023-07-23T02:16:53.0977639Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 2345 2023-07-23T02:16:53.0978035Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 5678 2023-07-23T02:16:53.0978441Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 1234 2023-07-23T02:16:53.1303810Z 1: /onnxruntime_src/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc:443: Failure 2023-07-23T02:16:53.1304644Z 1: Expected equality of these values: 2023-07-23T02:16:53.1304974Z 1: ret.first 2023-07-23T02:16:53.1305685Z 1: Which is: 4-byte object <04-00 00-00> 2023-07-23T02:16:53.1306030Z 1: COMPARE_RESULT::SUCCESS 2023-07-23T02:16:53.1306414Z 1: Which is: 4-byte object <00-00 00-00> 2023-07-23T02:16:53.1306754Z 1: Unsupported compare with CompareOrtValueNumerals. 
2023-07-23T02:16:53.1307487Z 1: Google Test trace: 2023-07-23T02:16:53.1307848Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 8910 2023-07-23T02:16:53.1308252Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 8910 2023-07-23T02:16:53.1308652Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 2345 2023-07-23T02:16:53.1309068Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 5678 2023-07-23T02:16:53.1309460Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 1234 2023-07-23T02:16:53.1309889Z 1: /onnxruntime_src/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc:443: Failure 2023-07-23T02:16:53.1310239Z 1: Expected equality of these values: 2023-07-23T02:16:53.1310527Z 1: ret.first 2023-07-23T02:16:53.1310893Z 1: Which is: 4-byte object <04-00 00-00> 2023-07-23T02:16:53.1311208Z 1: COMPARE_RESULT::SUCCESS 2023-07-23T02:16:53.1311600Z 1: Which is: 4-byte object <00-00 00-00> 2023-07-23T02:16:53.1311921Z 1: Unsupported compare with CompareOrtValueNumerals. 2023-07-23T02:16:53.1312229Z 1: Google Test trace: 2023-07-23T02:16:53.1312556Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 8910 2023-07-23T02:16:53.1312951Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 8910 2023-07-23T02:16:53.1313362Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 2345 2023-07-23T02:16:53.1313749Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 5678 2023-07-23T02:16:53.1314156Z 1: /onnxruntime_src/onnxruntime/test/common/random_generator.h:49: ORT test random seed: 1234 2023-07-23T02:16:53.4476437Z 1: [ FAILED ] CrossEntropyTest.SoftmaxCrossEntropyLossInternal_LargeSizeTensorUInt64Index (73692 ms) ``` ### Motivation and Context --- .../orttraining/test/training_ops/cuda/cross_entropy_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc index a80e07a2950df..d9800ce0e0d3e 100644 --- a/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/cross_entropy_test.cc @@ -641,7 +641,7 @@ TEST(CrossEntropyTest, DISABLED_SoftmaxCrossEntropyLoss_LargeSizeTensor) { #ifndef _WIN32 // Disable the large size tests for Windows because it is too slow, running on Linux would be enough. // This test requires lots of memory, currently, it can run with 16GB V100 GPU. -TEST(CrossEntropyTest, SoftmaxCrossEntropyLossInternal_LargeSizeTensorUInt64Index) { +TEST(CrossEntropyTest, DISABLED_SoftmaxCrossEntropyLossInternal_LargeSizeTensorUInt64Index) { // The element count is bigger than the upper limit of int32_t. constexpr int64_t bsz = 419431; constexpr int64_t vocab_size = 5120; @@ -1073,7 +1073,7 @@ TEST(CrossEntropyTest, SoftmaxCrossEntropyLossInternalGrad_TinySizeTensorFloatIn #ifndef _WIN32 // Disable the large size tests for Windows because it is too slow, running on Linux would be enough. // This test requires lots of memory, currently, it can run with 16GB V100 GPU. 
-TEST(CrossEntropyTest, SoftmaxCrossEntropyLossInternalGrad_LargeSizeTensorUInt64Index) { +TEST(CrossEntropyTest, DISABLED_SoftmaxCrossEntropyLossInternalGrad_LargeSizeTensorUInt64Index) { // The element count is bigger than the upper limit of int32_t. constexpr int64_t bsz = 419431; constexpr int64_t vocab_size = 5120; From 21ef14476b714c3c77a2051ea220df15a8527c5a Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Sun, 23 Jul 2023 16:16:17 -0700 Subject: [PATCH 25/34] Bug fix for nested control flow ops for TRT EP (#16343) Current TRT EP can support model which has nested control flow ops (multiple level subgraphs). But it fails at a case where the subgraph has outer scope value that is defined several levels up in the top-level graph, in this case, the outer scope value is the input of the top-level graph. The outer scope values are not properly handled during TRT EP's subgraph reconstruction stage and fails at `graph.resolve()`. The way ORT gets capability from EPs is a bottom-up approach meaning inner most subgraph gets handled first. TRT EP reconstructs each subgraph level by level and following modifications are made to fix the outer scope values issue: - `SetGraphOuterScopeValuesAndInputs()` and `SetAllGraphInputs()` are added to handle outer scope values and add those values as graph inputs if needed in order to make `graph.resolve()` happy. - Change to use `GetNodeArgIncludingParentGraphs` so that when creating the fused TRT node for some subgraphs in` Graph::CreateFusedSubGraphNode()`, it can get the NodeArgs for outer scope values from top-level graph. This PR fixes https://github.com/microsoft/onnxruntime/issues/16217 --- onnxruntime/core/graph/graph.cc | 8 +- .../shared_library/provider_interfaces.h | 9 + .../shared_library/provider_wrappedtypes.h | 9 + .../tensorrt/tensorrt_execution_provider.cc | 31 +++ .../tensorrt/tensorrt_execution_provider.h | 46 ++++ .../tensorrt_execution_provider_helper.cc | 230 ++++++++++++++++ .../core/session/provider_bridge_ort.cc | 9 + ...time_test_python_nested_control_flow_op.py | 259 ++++++++++++++++++ tools/ci_build/build.py | 5 + 9 files changed, 604 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc create mode 100644 onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index d75a7a519254e..33056006410c8 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3878,13 +3878,17 @@ Node& Graph::CreateFusedSubGraphNode(const IndexedSubGraph& sub_graph, const std int cur_idx = 0; for (const auto& arg_name : func_meta_def->inputs) { - input_args.push_back(GetNodeArg(arg_name)); + // In some cases, it needs to get the NodeArgs from ancestors. + // For example, if the subgraph we are going to build is the subgraph of the original graph + // and the NodeArgs of the outer scope values are defined in the top-level original graph. + input_args.push_back(GetNodeArgIncludingParentGraphs(arg_name)); input_indexes[arg_name] = cur_idx++; } cur_idx = 0; for (const auto& arg_name : func_meta_def->outputs) { - output_args.push_back(GetNodeArg(arg_name)); + // In some cases, it needs to get the NodeArgs from ancestors. 
+ output_args.push_back(GetNodeArgIncludingParentGraphs(arg_name)); output_indexes[arg_name] = cur_idx++; } diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index fa6ad26cc248a..27226005a9c0b 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -658,6 +658,8 @@ struct ProviderHost { virtual std::unique_ptr Node__OutputEdgesEnd(const Node* p) noexcept = 0; virtual void Node__ForEachDef(const Node* p, std::function func, bool include_missing_optional_defs) = 0; + virtual const std::unordered_map>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0; + virtual std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0; // NodeArg virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0; @@ -695,6 +697,8 @@ struct ProviderHost { virtual std::unique_ptr Graph__ToGraphProto(const Graph* p) = 0; virtual NodeArg& Graph__GetOrCreateNodeArg(Graph* p, const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) = 0; + virtual void Graph__AddOuterScopeNodeArg(Graph* p, const std::string& name) = 0; + virtual void Graph__SetInputs(Graph* p, gsl::span inputs) = 0; virtual Status Graph__Resolve(Graph* p) = 0; virtual void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) = 0; @@ -708,10 +712,15 @@ struct ProviderHost { virtual const Node* Graph__ParentNode(const Graph* p) const = 0; virtual const Graph* Graph__ParentGraph(const Graph* p) const = 0; + virtual Graph* Graph__MutableParentGraph(Graph* p) = 0; virtual const std::string& Graph__Name(const Graph* p) const noexcept = 0; virtual const Path& Graph__ModelPath(const Graph* p) const = 0; virtual const std::vector& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept = 0; virtual bool Graph__IsSubgraph(const Graph* p) = 0; + virtual int Graph__MaxNodeIndex(const Graph* p) const noexcept = 0; + virtual Node* Graph__GetNode(Graph* p, NodeIndex node_index) noexcept = 0; + virtual const Node* Graph__GetNode(const Graph* p, NodeIndex node_index) const = 0; + virtual const NodeArg* Graph__GetNodeArg(const Graph* p, const std::string& name) const = 0; // GraphViewer virtual void GraphViewer__operator_delete(GraphViewer* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index e09c2c495886a..f0ab7869b7d50 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -653,6 +653,8 @@ struct Node final { EdgeConstIterator OutputEdgesEnd() const noexcept { return g_host->Node__OutputEdgesEnd(this); } void ForEachDef(std::function func, bool include_missing_optional_defs = false) const { g_host->Node__ForEachDef(this, func, std::move(include_missing_optional_defs)); } + const std::unordered_map>& GetAttributeNameToMutableSubgraphMap() { return g_host->Node__GetAttributeNameToMutableSubgraphMap(this); } + std::unordered_map> GetAttributeNameToSubgraphMap() const { return g_host->Node__GetAttributeNameToSubgraphMap(this); } PROVIDER_DISALLOW_ALL(Node) }; @@ -707,6 +709,8 @@ struct Graph final { std::unique_ptr ToGraphProto() const { return g_host->Graph__ToGraphProto(this); } NodeArg& GetOrCreateNodeArg(const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) { return 
g_host->Graph__GetOrCreateNodeArg(this, name, p_arg_type); } + void AddOuterScopeNodeArg(const std::string& name) { g_host->Graph__AddOuterScopeNodeArg(this, name); } + void SetInputs(gsl::span inputs) { g_host->Graph__SetInputs(this, inputs); } Status Resolve() { return g_host->Graph__Resolve(this); } void AddInitializedTensor(const ONNX_NAMESPACE::TensorProto& tensor) { return g_host->Graph__AddInitializedTensor(this, tensor); } @@ -721,10 +725,15 @@ struct Graph final { const Node* ParentNode() const { return g_host->Graph__ParentNode(this); } const Graph* ParentGraph() const { return g_host->Graph__ParentGraph(this); } + Graph* MutableParentGraph() { return g_host->Graph__MutableParentGraph(this); } const std::string& Name() const noexcept { return g_host->Graph__Name(this); } const Path& ModelPath() const { return g_host->Graph__ModelPath(this); } const std::vector& GetInputsIncludingInitializers() const noexcept { return g_host->Graph__GetInputsIncludingInitializers(this); } bool IsSubgraph() const { return g_host->Graph__IsSubgraph(this); } + int MaxNodeIndex() const noexcept { return g_host->Graph__MaxNodeIndex(this); } + const Node* GetNode(NodeIndex node_index) const noexcept { return g_host->Graph__GetNode(this, node_index); } + Node* GetNode(NodeIndex node_index) noexcept { return g_host->Graph__GetNode(this, node_index); } + const NodeArg* GetNodeArg(const std::string& name) const { return g_host->Graph__GetNodeArg(this, name); } PROVIDER_DISALLOW_ALL(Graph) }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 6142b6d393d7e..1567628a66829 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1283,6 +1283,7 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect } else { auto model_build = graph.CreateModel(*GetLogger()); auto& graph_build = model_build->MainGraph(); + bool has_control_flow_op = false; // Add node and node args // If node output is also parent graph output, the output will be added to the @@ -1321,6 +1322,10 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect } } + if (control_flow_op_set_.find(node->OpType()) != control_flow_op_set_.end()) { + has_control_flow_op = true; + } + // If the node has subgraph, it's possible that the ORT graph of that subgraph and the GraphProto in the node attributes are not in sync because of graph optimization. // Therefore, we need to force GraphProto attributes to be updated in order to get the valid GraphProto. 
if (node->GetAttributes().size() > 0) { @@ -1345,6 +1350,13 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect } } + if (has_control_flow_op) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Handle outer scope values for the subgraph " << graph_build.Name(); + BuildSubGraphContext(graph_build); + SetGraphOuterScopeValuesAndInputs(graph_build, graph.GetGraph()); + SetAllGraphInputs(graph_build); + } + ORT_ENFORCE(graph_build.Resolve().IsOK()); // Add parent graph output to the subgraph @@ -1657,6 +1669,20 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, std::iota(std::begin(subgraph_nodes_vector), std::end(subgraph_nodes_vector), 0); SubGraphCollection_t parser_subgraph_nodes_vector = {{subgraph_nodes_vector, false}}; bool subgraph_early_termination = false; + + // Another subgraph of "If" control flow has been parsed by GetCapability before and all subgraph's nodes assigned to TRT EP. + if (AllNodesAssignedToSpecificEP(*sub_graph_veiwer, kTensorrtExecutionProvider)) { + all_subgraphs_are_supported = true; + break; + } + // Another subgraph of "If" control flow has been parsed by GetCapability and not all subgraph's nodes assigned to TRT EP. + // (Note: GetExecutionProviderType() returns "" meaning node has not yet been assigned to any EPs) + else if (!AllNodesAssignedToSpecificEP(*sub_graph_veiwer, "")) { + all_subgraphs_are_supported = false; + break; + } + + // Another subgraph of "If" control flow has not yet been parsed by GetCapability. subgraph_supported_nodes_vector = GetSupportedList(parser_subgraph_nodes_vector, 0, max_partition_iterations_, *sub_graph_veiwer, &subgraph_early_termination); all_subgraphs_are_supported = IsSubGraphFullySupported(subgraph_supported_nodes_vector, number_of_ort_subgraph_nodes); break; @@ -1677,6 +1703,9 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, } } LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider"; + + // The context map is only used during EP compile time, release it to save memory space. + subgraph_context_map_.clear(); return result; } } @@ -1700,6 +1729,8 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs; } + // The context map is only used during EP compile time, release it to save memory space. + subgraph_context_map_.clear(); return result; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 56eda7ad83537..13c5eff08bc46 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -138,6 +138,15 @@ struct TensorrtFuncState { bool cuda_graph_enable = 0; }; +// Holds important information for building valid ORT graph. +struct SubGraphContext { + std::unordered_set output_args; + std::unordered_map inputs_and_initializers; + std::unordered_map manually_added_graph_inputs; +}; + +using SubGraphContextMap = std::unordered_map>; + // Logical device representation. class TensorrtExecutionProvider : public IExecutionProvider { public: @@ -224,6 +233,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. 
std::unordered_set control_flow_op_set_ = {"If", "Loop", "Scan"}; + mutable std::unordered_map> subgraph_context_map_; std::unordered_map> parsers_; std::unordered_map> engines_; std::unordered_map> contexts_; @@ -273,6 +283,42 @@ class TensorrtExecutionProvider : public IExecutionProvider { /**Check whether all the nodes of subgraph are supported*/ bool IsSubGraphFullySupported(SubGraphCollection_t supported_nodes_vector, const int number_of_ort_nodes) const; + /** + * Set inputs, initializers and outputs for all subgraphs during TensorrtExecutionProvider::GetSupportedList() + * and save those information in subgraph context data structure. It's useful for building a valid graph and + * make Graph::Resolve() happy especially when dealing with nested control-flow op graph. + */ + void BuildSubGraphContext(const Graph& build_graph) const; + + /** + * Set outer scope values for subgraphs and add thoes values as top-level graph's inputs if needed. + */ + void SetGraphOuterScopeValuesAndInputs(Graph& build_graph, const Graph& graph) const; + + /** + * If ORT TRT manually sets graph input in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(), + * we have to manully set all the graph inputs in order to pass Graph::Resolve(). + */ + void SetAllGraphInputs(Graph& graph) const; + + /** + * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage + * Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this fuction again. + */ + bool IsInputInitializerOrOutput(const Graph& graph, const std::string& name, bool check_ancestors) const; + + /** + * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage + * Graph::ResolveContext::IsOuterScopeValue(). We have to implement this fuction again. + */ + bool IsOuterScopeValue(const Graph& graph, const std::string& name) const; + + /** + * The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage + * Graph::ResolveContext::IsLocalValue(). We have to implement this fuction again. + */ + bool IsLocalValue(const Graph& graph, const std::string& name) const; + bool IsGraphCaptureAllowed() const; void CaptureBegin(); void CaptureEnd(); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc new file mode 100644 index 0000000000000..ecc72b1c65476 --- /dev/null +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_helper.cc @@ -0,0 +1,230 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/shared_library/provider_api.h" +#include "tensorrt_execution_provider.h" +#include + +namespace onnxruntime { + +// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage +// Graph::ResolveContext::IsInputInitializerOrOutput(). We have to implement this fuction again. +bool TensorrtExecutionProvider::IsInputInitializerOrOutput(const Graph& graph, + const std::string& name, + bool check_ancestors) const { + const Graph* parent_graph = nullptr; + return IsLocalValue(graph, name) || + (check_ancestors && (parent_graph = graph.ParentGraph()) != nullptr && + IsInputInitializerOrOutput(*parent_graph, name, check_ancestors)); +} + +// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage +// Graph::ResolveContext::IsOuterScopeValue(). We have to implement this function again. 
+bool TensorrtExecutionProvider::IsOuterScopeValue(const Graph& graph, + const std::string& name) const { + const Graph* parent_graph = nullptr; + return (parent_graph = graph.ParentGraph()) != nullptr && + IsInputInitializerOrOutput(*parent_graph, name, true); +} + +// The newly-built graph has not yet being resolved by Graph::Resolve(), so we can't leverage +// Graph::ResolveContext::IsLocalValue(). We have to implement this function again. +bool TensorrtExecutionProvider::IsLocalValue(const Graph& graph, + const std::string& name) const { + if (subgraph_context_map_.find(graph.Name()) == subgraph_context_map_.end()) { + return false; + } + SubGraphContext* context = subgraph_context_map_.at(graph.Name()).get(); + return context->output_args.find(name) != context->output_args.cend() || + context->inputs_and_initializers.find(name) != context->inputs_and_initializers.cend(); +} + +/** + * Set inputs, initializers and outputs for all subgraphs during TensorrtExecutionProvider::GetSupportedList() + * and save those information in subgraph context data structure. It's useful for building a valid graph and + * make Graph::Resolve() happy especially when dealing with nested control-flow op graph. + */ +void TensorrtExecutionProvider::BuildSubGraphContext(const Graph& graph) const { + // Iterate all the nodes and recurse into inner most subgraph first + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + auto subgraph_map = node->GetAttributeNameToSubgraphMap(); + for (auto& entry : subgraph_map) { + const Graph* subgraph = entry.second; + BuildSubGraphContext(*subgraph); + } + } + + // Subgraph context has been built before, no need to do it again + if (subgraph_context_map_.find(graph.Name()) != subgraph_context_map_.end()) { + return; + } + + subgraph_context_map_.emplace(graph.Name(), std::make_unique()); + SubGraphContext* context = subgraph_context_map_.at(graph.Name()).get(); + + // Collect all nodes' outputs and nodes' name + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + for (const auto& output : node->OutputDefs()) { + context->output_args.insert(output->Name()); + } + } + + // Go thru all node's inputs + for (int i = 0; i < graph.MaxNodeIndex(); ++i) { + auto node = graph.GetNode(i); + if (node == nullptr) { + continue; + } + + for (const auto& input : node->InputDefs()) { + if (context->output_args.find(input->Name()) != context->output_args.end()) { + continue; + } + // This input arg is not the output of another node so must come from either a graph input or an initializer. + context->inputs_and_initializers[input->Name()] = input; + } + } +} + +// Set outer scope values for subgraphs and add thoes values as top-level graph's inputs if needed. 
+void TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(Graph& graph_build, + const Graph& graph) const { + // Iterate all the nodes and recurse into inner most subgraph first for both newly built graph and original graph + for (int i = 0; i < graph_build.MaxNodeIndex(); ++i) { + auto graph_build_node = graph_build.GetNode(i); + if (graph_build_node == nullptr) { + continue; + } + + auto graph_build_map = graph_build_node->GetAttributeNameToMutableSubgraphMap(); + std::unordered_map> subgraph_map; + const Node* graph_node = nullptr; + + // Find corresponding original graph node's subgraphs + for (int j = 0; j < graph.MaxNodeIndex(); ++j) { + if (graph.GetNode(j) && graph.GetNode(j)->Name() == graph_build_node->Name()) { + graph_node = graph.GetNode(j); + subgraph_map = graph_node->GetAttributeNameToSubgraphMap(); + break; + } + } + + for (auto& entry : graph_build_map) { + auto attr_name = entry.first; + Graph* subgraph_build = entry.second; + if (subgraph_map.find(attr_name) != subgraph_map.end()) { + // recurse into subgraph + const Graph* subgraph = subgraph_map.at(attr_name); + SetGraphOuterScopeValuesAndInputs(*subgraph_build, *subgraph); + } + } + } + + // Start from the inner most subgraph first and check whether its outer scope values are existed in the + // newly built graph. If not, we need to add those outer scope values as explicit inputs to the top-level + // of newly built graph. + if (graph_build.ParentNode()) { + auto top_level_graph = &graph_build; + while (top_level_graph->MutableParentGraph()) { + top_level_graph = top_level_graph->MutableParentGraph(); + } + if (subgraph_context_map_.find(top_level_graph->Name()) == subgraph_context_map_.end()) { + LOGS_DEFAULT(ERROR) << "[TensorRT EP] Can't find top-level graph context. \ + Please check BuildSubGraphContext() has built the graph context correctly."; + return; + } + + SubGraphContext* context = subgraph_context_map_.at(top_level_graph->Name()).get(); + + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Subgraph name is " << graph_build.Name(); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Its parent node is " << graph.ParentNode()->Name(); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Its parent node's implicit inputs:"; + + // Iterate all the implicit inputs to set outer scope value for the newly built subgraph + for (const auto& input : graph.ParentNode()->ImplicitInputDefs()) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name(); + + // The node arg in parent node's implicit inputs could be used for parent node's other subgraph, for example + // "If" op has two subgraphs. So we need to make sure that the node arg is used in current subgraph only. + // (GetNodeArg searches for specific node arg in all node args in the graph) + if (graph_build.GetNodeArg(input->Name())) { + graph_build.AddOuterScopeNodeArg(input->Name()); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is used in this subgraph"; + + if (context && + (context->manually_added_graph_inputs.find(input->Name()) != context->manually_added_graph_inputs.end())) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << input->Name() << " is already been added as an explicit input to graph"; + continue; + } + + // Handle the case where this outer scope value is not existed in any outer scope levels of the + // newly built graph (the newly built graph is the subgraph of the original graph). Need to add + // the outer scope value as an explicit input to the top-level of newly built graph. 
+ if (!IsOuterScopeValue(graph_build, input->Name())) { + const auto& name = input->Name(); + auto graph_inputs_including_initializers = top_level_graph->GetInputsIncludingInitializers(); + auto added_graph_input = std::find_if(graph_inputs_including_initializers.begin(), + graph_inputs_including_initializers.end(), + [&name](const NodeArg* entry) { return entry->Name() == name; }); + + if (added_graph_input == graph_inputs_including_initializers.end()) { + if (context) { + auto type_proto = ONNX_NAMESPACE::TypeProto::Create(); + type_proto->copy_from(input->TypeAsProto()); + auto& n_input = top_level_graph->GetOrCreateNodeArg(name, type_proto.get()); + context->manually_added_graph_inputs[n_input.Name()] = &n_input; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] \t" << n_input.Name() << " is added as an explicit input into the newly built graph"; + } + } + } + } + } + } +} + +// If ORT TRT manually sets graph input in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(), +// we have to manully set all the graph inputs in order to pass Graph::Resolve() +void TensorrtExecutionProvider::SetAllGraphInputs(Graph& graph) const { + // If ORT TRT doesn't manully set graph input in TensorrtExecutionProvider::SetGraphOuterScopeValuesAndInputs(), + // Graph::Resolve() will help set graph inputs in Graph::SetGraphInputsOutputs(), so no need to set graph inputs here. + if (subgraph_context_map_.find(graph.Name()) == subgraph_context_map_.end() || + subgraph_context_map_[graph.Name()].get()->manually_added_graph_inputs.size() == 0) { + return; + } + + SubGraphContext* context = subgraph_context_map_[graph.Name()].get(); + std::vector graph_inputs_including_initializers; + std::unordered_set graph_inputs_including_initializers_set; + + for (const auto& entry : context->inputs_and_initializers) { + graph_inputs_including_initializers.push_back(entry.second); + graph_inputs_including_initializers_set.insert(entry.first); + } + + for (const auto& entry : context->manually_added_graph_inputs) { + if (graph_inputs_including_initializers_set.find(entry.first) == graph_inputs_including_initializers_set.end()) { + graph_inputs_including_initializers.push_back(entry.second); + graph_inputs_including_initializers_set.insert(entry.first); + } + } + + for (const auto& node_arg : graph.GetInputsIncludingInitializers()) { + if (graph_inputs_including_initializers_set.find(node_arg->Name()) == graph_inputs_including_initializers_set.end()) { + graph_inputs_including_initializers.push_back(node_arg); + graph_inputs_including_initializers_set.insert(node_arg->Name()); + } + } + + graph.SetInputs(graph_inputs_including_initializers); +} +} // namespace onnxruntime diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 4003f984beda7..be549f5e665a0 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -763,6 +763,8 @@ struct ProviderHostImpl : ProviderHost { std::unique_ptr Node__OutputEdgesEnd(const Node* p) noexcept override { return std::make_unique(p->OutputEdgesEnd()); } void Node__ForEachDef(const Node* p, std::function func, bool include_missing_optional_defs) override { p->ForEachDef(func, std::move(include_missing_optional_defs)); } + const std::unordered_map>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) noexcept override { return p->GetAttributeNameToMutableSubgraphMap(); } + std::unordered_map> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return 
p->GetAttributeNameToSubgraphMap(); } // NodeArg (wrapped) const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); } @@ -803,8 +805,10 @@ struct ProviderHostImpl : ProviderHost { // Graph (wrapped) std::unique_ptr Graph__CreateGraphViewer(const Graph* p) override { return std::make_unique(*p); } std::unique_ptr Graph__ToGraphProto(const Graph* p) override { return std::make_unique(p->ToGraphProto()); } + void Graph__SetInputs(Graph* p, gsl::span inputs) override { p->SetInputs(inputs); } NodeArg& Graph__GetOrCreateNodeArg(Graph* p, const std::string& name, const ONNX_NAMESPACE::TypeProto* p_arg_type) override { return p->GetOrCreateNodeArg(name, p_arg_type); } + void Graph__AddOuterScopeNodeArg(Graph* p, const std::string& name) override { p->AddOuterScopeNodeArg(name); } Status Graph__Resolve(Graph* p) override { return p->Resolve(); } void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); } @@ -820,10 +824,15 @@ struct ProviderHostImpl : ProviderHost { const Node* Graph__ParentNode(const Graph* p) const override { return p->ParentNode(); } const Graph* Graph__ParentGraph(const Graph* p) const override { return p->ParentGraph(); } + Graph* Graph__MutableParentGraph(Graph* p) override { return p->MutableParentGraph(); } const std::string& Graph__Name(const Graph* p) const noexcept override { return p->Name(); } const Path& Graph__ModelPath(const Graph* p) const override { return p->ModelPath(); } const std::vector& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept override { return p->GetInputsIncludingInitializers(); } bool Graph__IsSubgraph(const Graph* p) override { return p->IsSubgraph(); } + int Graph__MaxNodeIndex(const Graph* p) const noexcept override { return p->MaxNodeIndex(); } + Node* Graph__GetNode(Graph* p, NodeIndex node_index) noexcept override { return p->GetNode(node_index); } + const Node* Graph__GetNode(const Graph* p, NodeIndex node_index) const override { return p->GetNode(node_index); } + const NodeArg* Graph__GetNodeArg(const Graph* p, const std::string& name) const override { return p->GetNodeArg(name); } // GraphViewer (wrapped) void GraphViewer__operator_delete(GraphViewer* p) override { delete p; } diff --git a/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py new file mode 100644 index 0000000000000..bf354ad9f9e10 --- /dev/null +++ b/onnxruntime/test/python/onnxruntime_test_python_nested_control_flow_op.py @@ -0,0 +1,259 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import unittest +from copy import deepcopy +from typing import Optional, Sequence, Tuple + +import numpy as np +from onnx import ModelProto, NodeProto, TensorProto, ValueInfoProto, checker, helper + +import onnxruntime as ort + + +def make_vi_like(vi: ValueInfoProto, name: str) -> ValueInfoProto: + """Makes a copy of `vi` with a new name.""" + new_vi = deepcopy(vi) + new_vi.name = name + return new_vi + + +def make_optional_tensor_value_info(name: str, elem_type: int, shape: Sequence[int]) -> ValueInfoProto: + """Makes a `ValueInfoProto` with optional type.""" + tensor_type_proto = helper.make_tensor_type_proto( + elem_type=elem_type, + shape=shape, + ) + opt_type_proto = helper.make_optional_type_proto(tensor_type_proto) + + vi = helper.make_tensor_value_info(name, elem_type, shape) + vi.type.CopyFrom(opt_type_proto) + return vi + + +def make_optional_vi(vi: ValueInfoProto, name: Optional[str] = None) -> ValueInfoProto: + """Makes a copy of `vi` with optional type.""" + name = name or vi.name + ".opt" + vi_type = vi.type.tensor_type + vi_shape = [d.dim_param if d.dim_param else d.dim_value for d in vi_type.shape.dim] + opt_vi = make_optional_tensor_value_info(name, vi_type.elem_type, vi_shape) + return opt_vi + + +def make_const(vi: ValueInfoProto, name: str, value: int = 0) -> Tuple[ValueInfoProto, NodeProto, TensorProto]: + """Creates a constant 1D tensor from `vi`.""" + const_vi = make_vi_like(vi, name) + const_shape = [d.dim_value for d in vi.type.tensor_type.shape.dim] + const_shape_tensor = helper.make_tensor(f"{name}.shape", TensorProto.INT64, [len(const_shape)], const_shape) + const_fill = helper.make_tensor(f"{name}.const.value", const_vi.type.tensor_type.elem_type, [1], [value]) + const_node = helper.make_node( + "ConstantOfShape", + inputs=[const_shape_tensor.name], + outputs=[const_vi.name], + name=f"ConstantOfShape.{name}", + value=const_fill, + ) + return const_vi, const_node, const_shape_tensor + + +# This is a three-layer nested control flow ops model. +# The innermost subgraphs have the outer scope values that are the inputs, x2 and x3, of the top-level graph. +def make_opt_nested_greater_or_equal() -> ModelProto: + """ + Creates a nested graph with (`optional(x1)`, `x2`, x3`) tensor inputs. + + `x3` is similar to an optional input with default value of -1. 
+ """ + # Inputs/outputs + x1_vi = helper.make_tensor_value_info("x1", TensorProto.FLOAT, [1, 2]) + x2_vi = helper.make_tensor_value_info("x2", TensorProto.FLOAT, [1, 2]) + x3_vi = helper.make_tensor_value_info("x3", TensorProto.FLOAT, [1]) + y_vi = helper.make_tensor_value_info("y", TensorProto.FLOAT, [1, 2]) + opt_x1_vi = make_optional_vi(x1_vi, name="x1.opt") + + # Add `x1` and `x2` subgraph + y1_vi = make_vi_like(y_vi, "x1.add.x2") + input_get_elem_node = helper.make_node( + "OptionalGetElement", + inputs=[opt_x1_vi.name], + outputs=[x1_vi.name], + name="OptionalGetElement.Input", + ) + add_node = helper.make_node("Add", inputs=[x1_vi.name, x2_vi.name], outputs=[y1_vi.name], name="Add_Op") + add_x1_x2_subgraph = helper.make_graph( + [input_get_elem_node, add_node], + name="add-x1-x2-subgraph", + inputs=[], + outputs=[y1_vi], + value_info=[opt_x1_vi, x1_vi, x2_vi], + ) + + # Add `x2` and const subgraph + y2_vi = make_vi_like(y_vi, "x2.add.const") + const_vi, const_node, const_shape_tensor = make_const(x1_vi, "x1.const", value=1) + add_const_node = helper.make_node( + "Add", + inputs=[const_vi.name, x2_vi.name], + outputs=[y2_vi.name], + name="Add_Const", + ) + add_x2_const_subgraph = helper.make_graph( + [const_node, add_const_node], + name="add-x2-const-subgraph", + inputs=[], + outputs=[y2_vi], + value_info=[const_vi, x2_vi], + initializer=[const_shape_tensor], + ) + + # Add `x3` and const subgraph + add_const_out_vi = make_vi_like(x3_vi, "out.1") + y3_vi = make_vi_like(y_vi, "x3.add.const") + const_vi, const_node, const_shape_tensor = make_const(x3_vi, "x3.const", value=2) + add_const_node = helper.make_node( + "Add", + inputs=[const_vi.name, x3_vi.name], + outputs=[add_const_out_vi.name], + name="Add_Const.1", + ) + expand_shape = helper.make_tensor(f"{add_const_out_vi}.shape", TensorProto.INT64, [2], [1, 2]) + expand_node = helper.make_node( + "Expand", + inputs=[add_const_out_vi.name, expand_shape.name], + outputs=[y3_vi.name], + name="Expand.out", + ) + add_x3_const_subgraph = helper.make_graph( + [const_node, add_const_node, expand_node], + name="add-x3-const-subgraph", + inputs=[], + outputs=[y3_vi], + value_info=[x3_vi, const_vi, add_const_out_vi], + initializer=[const_shape_tensor, expand_shape], + ) + + # Subgraph flow based on `x3` value + y3_if_vi = make_vi_like(y_vi, "x3.if.out") + x3_eq_vi, x3_const_node, x3_const_shape_tensor = make_const(x3_vi, "x3.equal", value=0) + x3_ge_vi = helper.make_tensor_value_info( + "x3_ge", + TensorProto.BOOL, + shape=[1], + ) + x3_ge_node = helper.make_node( + "GreaterOrEqual", + inputs=[x3_vi.name, x3_eq_vi.name], + outputs=[x3_ge_vi.name], + name="GreaterOrEqual.Target", + ) + x3_has_elem_vi = helper.make_tensor_value_info( + "x3_has_elem", + TensorProto.BOOL, + shape=[], # scalar + ) + x3_has_elem_node = helper.make_node( + "Squeeze", + inputs=[x3_ge_vi.name], + outputs=[x3_has_elem_vi.name], + name="Squeeze.x3", + ) + if_input_node = helper.make_node( + "If", + inputs=[x3_has_elem_vi.name], # condition + outputs=[y3_if_vi.name], + name="If.OptionalHasElement.x3", + then_branch=add_x3_const_subgraph, + else_branch=add_x2_const_subgraph, + ) + x3_subgraph = helper.make_graph( + [x3_const_node, x3_ge_node, x3_has_elem_node, if_input_node], + name="x3-subgraph", + inputs=[], + outputs=[y3_if_vi], + value_info=[x3_vi, x3_eq_vi, x3_ge_vi, x3_has_elem_vi], + initializer=[x3_const_shape_tensor], + ) + + # Construct main graph + x1_has_elem_vi = helper.make_tensor_value_info( + "x1_has_elem", + TensorProto.BOOL, + shape=[], # scalar + ) + 
x1_has_elem_node = helper.make_node( + "OptionalHasElement", + inputs=[opt_x1_vi.name], + outputs=[x1_has_elem_vi.name], + name="OptionalHasElement.x1", + ) + if_input_node = helper.make_node( + "If", + inputs=[x1_has_elem_vi.name], # condition + outputs=[y_vi.name], + name="If.OptionalHasElement.x1", + then_branch=add_x1_x2_subgraph, + else_branch=x3_subgraph, + ) + graph = helper.make_graph( + [x1_has_elem_node, if_input_node], + "opt-graph", + [opt_x1_vi, x2_vi, x3_vi], + [y_vi], + value_info=[x1_has_elem_vi], + ) + + m = helper.make_model( + graph, + opset_imports=[ + helper.make_opsetid("", 15), + ], + ) + + checker.check_model(m, full_check=True) + + return m + + +def test_nested_optional_greater_or_equal(use_trt: bool = False) -> None: + m = make_opt_nested_greater_or_equal() + + providers = ["CUDAExecutionProvider"] + if use_trt: + providers.insert(0, "TensorrtExecutionProvider") + session = ort.InferenceSession( + m.SerializeToString(), + providers=providers, + ) + + x1_name, x2_name, x3_name = (i.name for i in m.graph.input) + session.run( + [m.graph.output[0].name], + { + x1_name: None, + x2_name: np.ones((1, 2), dtype=np.float32), + x3_name: np.array([-1], dtype=np.float32), + }, + ) + + return + + +# ORT has a similar unit test Test3LayerNestedSubgraph where this 3-layer nested graph consumes the same initializer in different subgraphs. +# However, this unit test is slightly different. This is also a 3-layer nested graph but consumes the outer scope values (which are the inputs +# of the top-level graph) in different subgraphs. +class TestNestedControlFlowOpsGraph(unittest.TestCase): + # We currently only test CUDA/TRT EP due to users only raise this issue when using CUDA/TRT EP. + @unittest.skipIf( + "TensorrtExecutionProvider" not in ort.get_available_providers() + and "CUDAExecutionProvider" not in ort.get_available_providers(), + reason="Test CUDA/TRT EP only", + ) + def test_3_level_control_flow_ops_graph(self): + if "CUDAExecutionProvider" in ort.get_available_providers(): + test_nested_optional_greater_or_equal(use_trt=False) + if "TensorrtExecutionProvider" in ort.get_available_providers(): + test_nested_optional_greater_or_equal(use_trt=True) + + +if __name__ == "__main__": + unittest.main(module=__name__, buffer=True) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 401c791bae6ac..85a3488fa8b60 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1788,6 +1788,11 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): if not args.disable_ml_ops and not args.use_tensorrt: run_subprocess([sys.executable, "onnxruntime_test_python_mlops.py"], cwd=cwd, dll_path=dll_path) + if args.use_tensorrt: + run_subprocess( + [sys.executable, "onnxruntime_test_python_nested_control_flow_op.py"], cwd=cwd, dll_path=dll_path + ) + try: import onnx # noqa: F401 From 8ede2f139e72bcdd1092c06314e1eb2b9f37775e Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Mon, 24 Jul 2023 13:57:48 +0800 Subject: [PATCH 26/34] [ROCm] Optimize ROCm CI pipeline 2 (#16691) - Set `KERNEL_EXPLORER_TEST_USE_CUPY=1` to replace numpy with cupy on kernel explorer test. 
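  Roughly, the flag switches which array module the tests use to generate their inputs; a minimal sketch of that pattern is below (the helper name and exact logic are illustrative assumptions, not the actual kernel explorer test code):

```python
import os

import numpy as np


def get_array_module():
    # KERNEL_EXPLORER_TEST_USE_CUPY=1 makes the tests allocate and fill their
    # input tensors with cupy (GPU) instead of numpy (CPU).
    if os.environ.get("KERNEL_EXPLORER_TEST_USE_CUPY", "0") == "1":
        import cupy as cp

        return cp
    return np


xp = get_array_module()
data = xp.random.rand(1024, 1024).astype(xp.float32)  # device array under cupy, host array under numpy
```

  Generating the data with cupy keeps it on the GPU, which is presumably why the CPU utilization drops in the comparison below.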
KERNEL_EXPLORER_TEST_USE_CUPY=0 The CPU utilization is shown as below: ![image](https://github.com/microsoft/onnxruntime/assets/94887879/91724b78-0b4e-4cbd-ad88-83cad9976472) KERNEL_EXPLORER_TEST_USE_CUPY=1 The CPU utilization is shown as below: ![image](https://github.com/microsoft/onnxruntime/assets/94887879/58239911-667c-4d5f-bb78-deca60d0266f) - Use `Bash@3`. - Update shell script. --- .../orttraining-pai-ci-pipeline.yml | 21 ++++---- .../migraphx-ci-pipeline-env.Dockerfile | 3 +- tools/ci_build/github/pai/pai_clean_device.sh | 53 +++++++++++-------- .../pai/pai_huggingface_bert_large_test.sh | 21 +++++--- .../pai/rocm-ci-pipeline-env.Dockerfile | 4 +- 5 files changed, 60 insertions(+), 42 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 4e4073ae8496a..1295f841420ef 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -133,12 +133,11 @@ jobs: DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimetrainingrocm-cibuild-rocm$(RocmVersion)-test - - task: CmdLine@2 + - task: Bash@3 inputs: - script: |- - echo "Select agent: $(Agent.Name), GPU: $HIP_VISIBLE_DEVICES, render: $DRIVER_RENDER" - bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES - workingDirectory: $(Build.SourcesDirectory) + targetType: filePath + filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh + arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER displayName: 'Check ROCm Environment' - task: CmdLine@2 @@ -182,6 +181,7 @@ jobs: set -ex; \ export KERNEL_EXPLORER_BUILD_DIR=/build/$(BuildConfig); \ export KERNEL_EXPLORER_BATCHED_GEMM_MAX_BATCH_SIZE=8; \ + export KERNEL_EXPLORER_TEST_USE_CUPY=1; \ pytest /onnxruntime_src/onnxruntime/python/tools/kernel_explorer/ -n 8 --reruns 1 --durations=100" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run kernel explorer tests' @@ -206,7 +206,7 @@ jobs: set -ex; \ export PYTHONPATH=/build/$(BuildConfig); \ python -m onnxruntime.training.ortmodule.torch_cpp_extensions.install; \ - bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh $(RocmVersion)" + bash /onnxruntime_src/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh -v $(RocmVersion)" workingDirectory: $(Build.SourcesDirectory) displayName: 'Run Python Hugging-Face BERT-L test' condition: succeededOrFailed() @@ -250,11 +250,12 @@ jobs: displayName: 'Run orttraining_ortmodule_tests.py' condition: succeededOrFailed() - - task: CmdLine@2 + + - task: Bash@3 inputs: - script: |- - bash $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh $(Agent.Name) $HIP_VISIBLE_DEVICES - workingDirectory: $(Build.SourcesDirectory) + targetType: filePath + filePath: $(Build.SourcesDirectory)/tools/ci_build/github/pai/pai_clean_device.sh + arguments: -n $(Agent.Name) -d $HIP_VISIBLE_DEVICES -r $DRIVER_RENDER displayName: 'Clean ROCm Environment' condition: always() diff --git a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile index 90e9731a35998..d3bca26875c07 100644 --- a/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/linux/docker/migraphx-ci-pipeline-env.Dockerfile @@ 
-6,7 +6,8 @@ ARG MIGRAPHX_VERSION=rocm-5.5.0 ENV DEBIAN_FRONTEND noninteractive ENV MIGRAPHX_DISABLE_FAST_GELU=1 -RUN apt-get clean && apt-get update -y && apt-get upgrade -y && apt-get install -y locales unzip +RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && \ + apt-get install -y locales unzip && apt-get clean -y RUN locale-gen en_US.UTF-8 RUN update-locale LANG=en_US.UTF-8 ENV LC_ALL C.UTF-8 diff --git a/tools/ci_build/github/pai/pai_clean_device.sh b/tools/ci_build/github/pai/pai_clean_device.sh index 4ebbcf492040d..98b680d4f465c 100755 --- a/tools/ci_build/github/pai/pai_clean_device.sh +++ b/tools/ci_build/github/pai/pai_clean_device.sh @@ -1,38 +1,47 @@ #!/bin/bash set -ex -agentName=$1 -target_device=$2 -echo "agent name $agentName" -echo "agent target device : $target_device" +usage() { echo "Usage: $0 [-n ] [-d ] [-r ]" 1>&2; exit 1; } + +while getopts "n:d:r:" parameter_Option +do case "${parameter_Option}" +in +n) AGENT_NAME=${OPTARG};; +d) TARGET_DEVICE=${OPTARG};; +r) DRIVER_RENDER=${OPTARG};; +*) usage ;; +esac +done + +echo "Agent Name: $AGENT_NAME, Target Device: $TARGET_DEVICE, Driver Render: $DRIVER_RENDER" -echo -e "\n ---- rocm-smi" +echo -e "\n ---- Execute rocm-smi" rocm-smi -echo -e "\n ---- rocm-smi --showpids" +echo -e "\n ---- Execute rocm-smi --showpids" rocm-smi --showpids -echo -e "\n ---- rocm-smi --showpidgpus" +echo -e "\n ---- Execute rocm-smi --showpidgpus" rocm-smi --showpidgpus -echo -e "\n ---- rocm-smi --showpids detail" +echo -e "\n ---- Execute rocm-smi --showpids detail" rocm-smi --showpids | awk '$1 ~/[0-9]+/{if((NR>6)) {print $1}}' | xargs -I {} ps {} -echo -e "\n ---- rocm-smi --showmeminfo" +echo -e "\n ---- Execute rocm-smi --showmeminfo" rocm-smi --showmeminfo vram vis_vram gtt -echo -e "\n ---- Clean up the process that is using the target device" -gpu_details=$(rocm-smi --showpidgpus) -pid_lines=$(echo "$gpu_details" | grep -n "DRM device" | cut -d ":" -f 1) -pid_lines_array=($pid_lines) - -for ((i = 0; i < ${#pid_lines_array[@]}; i++)); do - pid_line=${pid_lines_array[$i]} - pid=$(echo "$gpu_details" | awk '{print $2}' | sed -n "${pid_line}p") - gpu_line=$((pid_line + 1)) - pid_gpu=$(echo "$gpu_details" | sed -n "${gpu_line}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g') - if [ "$pid_gpu" == "$target_device" ]; then - echo "kill pid: $pid, gpu: $pid_gpu" - kill -9 $pid +echo -e "\n ---- Clean up processes that use the target device $TARGET_DEVICE" +GPU_USED_BY_PIDS=$(rocm-smi --showpidgpus) +PID_NUMBERS_LINES=$(echo "$GPU_USED_BY_PIDS" | grep -n "DRM device" | cut -d ":" -f 1) +PID_NUMBERS_LINES_ARRAY=($PID_NUMBERS_LINES) + +for ((i = 0; i < ${#PID_NUMBERS_LINES_ARRAY[@]}; i++)); do + PID_NUMBER_LINE=${PID_NUMBERS_LINES_ARRAY[$i]} + PID_NUMBER=$(echo "$GPU_USED_BY_PIDS" | awk '{print $2}' | sed -n "${PID_NUMBER_LINE}p") + GPU_USED_BY_PID_LINE=$((PID_NUMBER_LINE + 1)) + GPU_USED_BY_PID=$(echo "$GPU_USED_BY_PIDS" | sed -n "${GPU_USED_BY_PID_LINE}p" | sed -e 's/^[ ]*//g' | sed -e 's/[ ]*$//g') + if [ "$GPU_USED_BY_PID" == "$TARGET_DEVICE" ]; then + echo "kill pid: $PID_NUMBER, using gpu: $GPU_USED_BY_PID" + kill -9 "$PID_NUMBER" fi done diff --git a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh index 3b8e828fe94dd..fb4dbeb2e73d3 100644 --- a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh +++ b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh @@ -2,15 +2,22 @@ set -ex -rocm_version=$1 -mi200_gpus=$(rocm-smi 
--showproductname | grep -c "MI250" | xargs) +usage() { echo "Usage: $0 [-v ]" 1>&2; exit 1; } -echo "mi200_gpus: $mi200_gpus" +while getopts "v:" parameter_Option +do case "${parameter_Option}" +in +v) ROCM_VERSION=${OPTARG};; +*) usage ;; +esac +done -if [ "$mi200_gpus" -gt "0" ]; then - result_file=ci-mi200.huggingface.bert-large-rocm${rocm_version}.json +MI200_DEVICE_NUMBERS=$(rocm-smi --showproductname | grep -c "MI250" | xargs) + +if [ "$MI200_DEVICE_NUMBERS" -gt "0" ]; then + RESULT_FILE=ci-mi200.huggingface.bert-large-rocm${ROCM_VERSION}.json else - result_file=ci-mi100.huggingface.bert-large-rocm${rocm_version}.json + RESULT_FILE=ci-mi100.huggingface.bert-large-rocm${ROCM_VERSION}.json fi python \ @@ -33,4 +40,4 @@ cat ci-pipeline-actual.json python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \ ci-pipeline-actual.json \ - /onnxruntime_src/orttraining/tools/ci_test/results/${result_file} + /onnxruntime_src/orttraining/tools/ci_test/results/"$RESULT_FILE" diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index 540aeaf351e82..7540856913e27 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -1,6 +1,6 @@ -FROM rocm/pytorch:rocm5.5_ubuntu20.04_py3.8_pytorch_1.13.1 +FROM rocm/cupy:rocm5.5.0_ubuntu20.04_py3.8_pytorch2.0.0_cupy13.0.0 -RUN apt-get update -y && apt-get upgrade -y +RUN apt-get update -y && apt-get upgrade -y && apt-get autoremove -y && apt-get clean -y WORKDIR /stage From 5d17bcd776f326506360187b48808c2c005dbc94 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Tue, 25 Jul 2023 00:08:53 +0800 Subject: [PATCH 27/34] [WebNN EP] Support Greater and Less ops (#16782) --- onnxruntime/core/providers/webnn/builders/helper.h | 2 ++ .../providers/webnn/builders/impl/logical_op_builder.cc | 6 ++++++ .../core/providers/webnn/builders/op_builder_factory.cc | 2 ++ 3 files changed, 10 insertions(+) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 93d36e7761545..f5bb86787d688 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -153,6 +153,7 @@ static const InlinedHashMap op_map = { {"Gemm", "gemm"}, {"GlobalAveragePool", "averagePool2d"}, {"GlobalMaxPool", "maxPool2d"}, + {"Greater", "greater"}, {"GroupNormalization", "meanVarianceNormalization"}, {"HardSigmoid", "hardSigmoid"}, {"HardSwish", "hardSwish"}, @@ -160,6 +161,7 @@ static const InlinedHashMap op_map = { {"InstanceNormalization", "meanVarianceNormalization"}, {"LayerNormalization", "meanVarianceNormalization"}, {"LeakyRelu", "leakyRelu"}, + {"Less", "lesser"}, {"MatMul", "matmul"}, {"MaxPool", "maxPool2d"}, {"Mul", "mul"}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc index ef3b7a60d1497..7cc513db14aed 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc @@ -33,6 +33,10 @@ Status LogicalOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons emscripten::val output = emscripten::val::object(); if (op_type == "Equal") { output = model_builder.GetBuilder().call("equal", input0, input1); + } else if (op_type == "Greater") { + output = model_builder.GetBuilder().call("greater", input0, 
input1); + } else if (op_type == "Less") { + output = model_builder.GetBuilder().call("lesser", input0, input1); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "LogicalOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); @@ -49,6 +53,8 @@ void CreateLogicalOpBuilder(const std::string& op_type, OpBuilderRegistrations& static std::vector op_types = { "Equal", + "Greater", + "Less", }; op_registrations.builders.push_back(std::make_unique()); diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index 2536ae0ae44d6..4ff87ade85f4b 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -97,6 +97,8 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { { // Logical CreateLogicalOpBuilder("Equal", op_registrations); + CreateLogicalOpBuilder("Greater", op_registrations); + CreateLogicalOpBuilder("Less", op_registrations); } { // Normalization From d8d8349a1bbb6099eb2a7a1d30dff3d819363cc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20M=C3=BCller?= <44298237+gedoensmax@users.noreply.github.com> Date: Mon, 24 Jul 2023 15:17:11 -0700 Subject: [PATCH 28/34] fix: add missing nullptr of SessionOptions V2 (#16794) /builds/devtechproviz/dl/ort-builder/onnxruntime/onnxruntime/python/onnxruntime_pybind_state.cc:388:14: error: missing initializer for member 'OrtTensorRTProviderOptionsV2::trt_cuda_graph_enable' [-Werror=missing-field-initializers] 388 | 0}; | ### Description ### Motivation and Context --- onnxruntime/python/onnxruntime_pybind_state.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index da1266ec1d3ba..12b020f32b22f 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -385,6 +385,7 @@ std::unique_ptr CreateExecutionProviderInstance( nullptr, nullptr, nullptr, + nullptr, 0}; for (auto option : it->second) { if (option.first == "device_id") { From 4b6d9fa8518f5dd81c0887ca7ef274f847b47bf4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 16:13:59 -0700 Subject: [PATCH 29/34] Bump actions/deploy-pages from 1 to 2 (#16402) --- .github/workflows/publish-gh-pages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-gh-pages.yml b/.github/workflows/publish-gh-pages.yml index 5ddb1e3bb03d1..0f1e4cf840511 100644 --- a/.github/workflows/publish-gh-pages.yml +++ b/.github/workflows/publish-gh-pages.yml @@ -98,4 +98,4 @@ jobs: steps: - name: Deploy to GitHub Pages id: deployment - uses: actions/deploy-pages@v1 + uses: actions/deploy-pages@v2 From b0279b14d88ed8e70372c5b336ecaa273b207133 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 24 Jul 2023 16:54:01 -0700 Subject: [PATCH 30/34] [DORT] Enable Dynamic Shape in DORT and Use Different InferenceSession's when Inputs Are Not Compatible (#16753) Sometimes, ONNX exporter generates rank- or shape-dependent sub-graphs. Thus, error could occur when running the ONNX model with different inputs. This PR ([78e736d](https://github.com/microsoft/onnxruntime/pull/16753/commits/78e736d857fa168653bab10afa1d1aefa4abbfba)) addresses this problem by - if needed, exporting multiple ONNX models with different inputs for the same GraphModule. 
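  (The case that triggers multiple exports is one compiled function being called with inputs of different ranks; a minimal sketch, adapted from the `make_aot_ort` docstring and the new test added in this PR:)

```python
import torch
from onnxruntime.training.torchdynamo.register_backend import make_aot_ort

local_aot_ort, ort_backend = make_aot_ort(dynamic=True)


@torch._dynamo.optimize(local_aot_ort, dynamic=True)
def f(x: torch.Tensor):
    return x.sigmoid() + x


# Inputs with different ranks hit the same GraphModule; the backend either reuses a
# cached ONNX model whose input schema matches or exports a new one.
for shape in [(2,), (2, 3), (2, 3, 4)]:
    f(torch.rand(shape))
```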
- implementing a naive mechanism to determine of existing ONNX models (and the associated InferenceSession) can be reused. On the other hand, in the second commit [b5a9b5f](https://github.com/microsoft/onnxruntime/pull/16753/commits/b5a9b5f849b139ba287dc3d2545d66c982a9def3), this PR also enables dynamic shapes in DORT by - passing dynamic_shapes = True to exporter (see how DEFAULT_DYNAMIC_BACKEND is created) - calling torch._dynamo.optimize(dynamic_ort_aot, dynamic=True) (see how dynamic_ort_aot is created). --- .../training/torchdynamo/ort_backend.py | 243 +++++++++++++----- .../training/torchdynamo/register_backend.py | 60 ++++- .../test/python/orttraining_test_dort.py | 140 +++++++++- .../orttraining_test_dort_custom_ops.py | 40 ++- 4 files changed, 405 insertions(+), 78 deletions(-) diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py index 701a4d6ebfc67..4843e552c5305 100644 --- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py @@ -5,7 +5,7 @@ import dataclasses import logging -from typing import Any, Dict, Mapping, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Union import numpy as np import onnx @@ -45,6 +45,19 @@ torch.bool: np.bool_, } +_ONNX_ELEMENT_TYPE_TO_TORCH_DTYPE = { + 1: torch.float32, + 2: torch.uint8, + 3: torch.int8, + 5: torch.int16, + 6: torch.int32, + 7: torch.int64, + 9: torch.bool, + 10: torch.float16, +} + +_TORCH_DTYPE_TO_ONNX_ELEMENT_TYPE = {value: key for key, value in _ONNX_ELEMENT_TYPE_TO_TORCH_DTYPE.items()} + def _nvtx_range_push(name: str): """If PyTorch is installed with CUDA support, this starts NVTX range. @@ -198,22 +211,37 @@ def _infer_ep_from_device(*args) -> Tuple[str, ...]: return tuple(eps) -def _infer_ep_from_graph_module(graph_module: torch.fx.GraphModule) -> Tuple[str, ...]: - """Return the first valid device (i.e., GPU or CPU) among outputs of this torch.fx.GraphModule.""" +def _extract_graph_module_inputs(graph_module: torch.fx.GraphModule) -> Tuple[Any, ...]: + placeholders = [] + for node in graph_module.graph.nodes: + if node.op == "placeholder": + if hasattr(node, "meta") and "val" in node.meta: + assert isinstance(node.meta["val"], torch.Tensor) + placeholders.append(node) + + +def _extract_graph_module_outputs(graph_module: torch.fx.GraphModule) -> Any: + """Collect "val" fields from outputs metadata in this torch.fx.GraphModule.""" for node in graph_module.graph.nodes: if node.op == "output": # Output node is unique. Let's retrieve output values from # this node's input list. And then just return. - flattened_output_args, _ = _pytree.tree_flatten(node.args) - output_args = [] - for output_arg in flattened_output_args: - if hasattr(output_arg, "meta") and "val" in output_arg.meta: - # Select outputs with "val" information. Without "val", - # it's not possible access output_arg.meta["val"].device. 
- output_args.append(output_arg.meta["val"]) # noqa: PERF401 - return _infer_ep_from_device(*output_args) - graph_module_str = graph_module.print_readable(print_output=False) - raise ValueError(f"No output node is found in graph_module: {graph_module_str}") + return node.args[0] + raise ValueError("No output node found in this torch.fx.GraphModule.") + + +def _infer_ep_from_graph_module(graph_module: torch.fx.GraphModule) -> Tuple[str, ...]: + """Return the all valid devices (i.e., GPU or CPU) among outputs of this torch.fx.GraphModule.""" + flattened_output_args, _ = _pytree.tree_flatten(_extract_graph_module_outputs(graph_module)) + # Output arguments with example value (type: torch.Tensor) in the `graph_module`. + selected_output_args = [ + output_arg.meta["val"] + for output_arg in flattened_output_args + # output_arg must have tensor for its device information. + # Otherwise, skip it. + if (hasattr(output_arg, "meta") and "val" in output_arg.meta) + ] + return _infer_ep_from_device(*selected_output_args) def _sort_eps(eps: Tuple[str, ...]) -> Tuple[str, ...]: @@ -335,28 +363,87 @@ def _assert_allclose_with_detailed_error_message( ) -@dataclasses.dataclass -class OrtExecutionInfo: +class OrtExecutionInfoPerSession: """Information required to execute torch.fx.GraphModule using onnxruntime.InferenceSession""" + def __init__( + self, + session: onnxruntime.InferenceSession, + input_names: Tuple[str, ...], + input_value_infos: Tuple[onnx.ValueInfoProto, ...], + output_names: Tuple[str, ...], + output_value_infos: Tuple[onnx.ValueInfoProto, ...], + input_devices: Tuple[ORTC.OrtDevice, ...], # type: ignore + output_devices: Tuple[ORTC.OrtDevice, ...], # type: ignore + example_outputs: Union[Tuple[torch.Tensor, ...], torch.Tensor], + ): + # Carrier of ONNX model and its executor. + self.session: onnxruntime.InferenceSession = session + # For the ONNX model stored in self.session, self.input_names[i] is the + # name of the i-th positional input. + self.input_names: Tuple[str, ...] = input_names + # self.input_name[i]'s type information is stored in self.input_value_infos[i]. + self.input_value_infos: Tuple[onnx.ValueInfoProto, ...] = input_value_infos + # Similar to self.input_names, but for outputs. + self.output_names: Tuple[str, ...] = output_names + # Similar to self.input_value_infos but for outputs. + self.output_value_infos: Tuple[onnx.ValueInfoProto, ...] = output_value_infos + # For the ONNX model stored in self.session, self.input_devices[i] is the + # i-th positional input's device. + self.input_devices: Tuple[ORTC.OrtDevice, ...] = input_devices # type: ignore + # Similar to self.input_devices, but for outputs. + self.output_devices: Tuple[ORTC.OrtDevice, ...] = output_devices # type: ignore + # This is the outputs of executing the original torch.fx.GraphModule with example inputs + # (i.e., args passed into OrtBackend._ort_acclerated_call). + self.example_outputs: Union[Tuple[torch.Tensor, ...], torch.Tensor] = example_outputs + + def is_supported(self, *args): + # Compare the args and the input schema in ONNX model and + # return the first match. 
+ if len(args) != len(self.input_value_infos): + return False + for arg, value_info in zip(args, self.input_value_infos): + if not isinstance(arg, torch.Tensor): + return False + onnx_dtype = _TORCH_DTYPE_TO_ONNX_ELEMENT_TYPE[arg.dtype] + if onnx_dtype != value_info.type.tensor_type.elem_type: + return False + for dim, onnx_dim in zip(arg.shape, value_info.type.tensor_type.shape.dim): + if isinstance(dim, int) and (onnx_dim.dim_value == dim or onnx_dim.dim_param): + continue + elif isinstance(dim, torch.SymInt) and onnx_dim.dim_param: + continue + else: + return False + return True + + +@dataclasses.dataclass +class OrtExecutionInfoForAllGraphModules: def __init__(self): - # session self.sessions[mod] is created for computing the graph in mod. - self.sessions: Dict[torch.fx.GraphModule, onnxruntime.InferenceSession] = {} - # self.input_names[mod] contains all input names in the ONNX model exported from mod. - # self.input_names[mod][i] is the name of the i-th positional input of the graph in mod. - self.input_names: Dict[torch.fx.GraphModule, Tuple[str, ...]] = {} - # Similar to self.input_names, but for outputs of the graph. - self.output_names: Dict[torch.fx.GraphModule, Tuple[str, ...]] = {} - # self.input_devices[mod] contains devices of inputs fed to mod.forward (excluding self). - # self.input_devices[mod][i] is the i-th positional input's device. - self.input_devices: Dict[torch.fx.GraphModule, Tuple[ORTC.OrtDevice, ...]] = {} # type: ignore - # Similar to self.input_devices, but for outputs of the graph. - self.output_devices: Dict[torch.fx.GraphModule, Tuple[ORTC.OrtDevice, ...]] = {} # type: ignore - # This is a debug flag. When True, this backend will compare its - self.assert_allclose_to_baseline: bool = False - # We need example outputs to determine output schema of ORT run. - # self.example_outputs[mod] is the outputs of mod.forward(*self.example_inputs[mod]). - self.example_outputs: Dict[torch.fx.GraphModule, Union[Tuple[torch.Tensor, ...], torch.Tensor]] = {} + # All sessions (and their related information) created by exporting the same GraphModule + # with different inputs. + self.execution_info_per_graph_module: Dict[torch.fx.GraphModule, List[OrtExecutionInfoPerSession]] = {} + + def search_reusable_session_execution_info(self, graph_module: torch.fx.GraphModule, *args): + if graph_module not in self.execution_info_per_graph_module: + return None + # All execution information for ONNX models exported from the same `graph_module` + # with different inputs. + candidates = self.execution_info_per_graph_module[graph_module] + + for candidate in candidates: + if candidate.is_supported(*args): + # Returns the first session that accepts this input schema. + return candidate + # No reusable session found. 
+ return None + + def cache_session_execution_info(self, graph_module: torch.fx.GraphModule, info: OrtExecutionInfoPerSession): + if graph_module not in self.execution_info_per_graph_module: + self.execution_info_per_graph_module[graph_module] = [info] + else: + self.execution_info_per_graph_module[graph_module].append(info) class OrtBackend: @@ -409,8 +496,23 @@ def __init__( self._supported_ops = OrtOperatorSupport(support_dict, extra_support_dict) # TODO: this is a naive implementation of cache without proper guard self._partitioner_cache: Dict[torch.fx.GraphModule, torch.fx.GraphModule] = {} - # TODO: this is a naive implementation of cache without proper guard, this will only work for identical inputs - self._ort_execution_info = OrtExecutionInfo() + # Conceptually, this filed is a 2-layer dictionary + # GraphModule 0 + # ONNX Model 0 (with ORT InferenceSession and related information. type: OrtExecutionInfoPerSession) + # ONNX Model 1 + # ... + # GraphModule 1 + # ONNX Model 2 (with ORT InferenceSession and related information. type: OrtExecutionInfoPerSession) + # ONNX Model 3 + # ... + # ... + # , which caches all previous compilation result so that we can reuse them. + # ONNX Model 0 and 1 are exported from the same GraphModule 0 but with different inputs + # (e.g., tensors with different ranks). GraphModule 0 and GraphModule 1 are different + # graphs captured by Dynamo and sent to OrtBackend.compile. + self._all_ort_execution_info = OrtExecutionInfoForAllGraphModules() + + self._assert_allclose_to_baseline = False self.ep = ep self.session_options = session_options @@ -426,14 +528,16 @@ def __init__( self.preallocate_output = preallocate_output def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwargs): - if graph_module in self._ort_execution_info.sessions: - # We have seen this graph before, so we can use cached objects including session. - onnx_session = self._ort_execution_info.sessions[graph_module] - input_names = self._ort_execution_info.input_names[graph_module] - output_names = self._ort_execution_info.output_names[graph_module] - input_devices = self._ort_execution_info.input_devices[graph_module] - output_devices = self._ort_execution_info.output_devices[graph_module] - prim_outputs = self._ort_execution_info.example_outputs[graph_module] + cached_execution_info_per_session = self._all_ort_execution_info.search_reusable_session_execution_info( + graph_module, *args + ) + if cached_execution_info_per_session: + onnx_session = cached_execution_info_per_session.session + input_names = cached_execution_info_per_session.input_names + output_names = cached_execution_info_per_session.output_names + input_devices = cached_execution_info_per_session.input_devices + output_devices = cached_execution_info_per_session.output_devices + prim_outputs = cached_execution_info_per_session.example_outputs else: # It's first time seeing such as graph. Let's make a new session # (type: onnxruntime.InferenceSession) for it. @@ -445,17 +549,32 @@ def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwar # # WARNING: The downstream code should not change prim_outputs and # this backend should always produces output with schema identical to prim_outputs'. - try: - prim_outputs = FakeTensorProp(graph_module).propagate(*args, **kwargs) - except Exception: - logger.info(f"FakeTensorProb failed for {graph_module}") - # When FakeTensorProp fails, it is not possible to preallocate output buffers - # because the output shapes are not inferred. 
+ + if self.resolved_onnx_exporter_options.dynamic_shapes: + # No pre-allocation when dynamic shape is enabled. self.preallocate_output = False + extracted_outputs = _extract_graph_module_outputs(graph_module) + + def maybe_map_to_meta_val(value): + if hasattr(value, "meta") and "val" in value.meta: + # Select outputs with "val" information. Without "val", + # it's not possible access output_arg.meta["val"].device. + return value.meta["val"] + else: + return value + + prim_outputs = _pytree.tree_map(maybe_map_to_meta_val, extracted_outputs) + else: + try: + prim_outputs = FakeTensorProp(graph_module).propagate(*args, **kwargs) + except Exception: + logger.info(f"FakeTensorProb failed for {graph_module}") + # When FakeTensorProp fails, it is not possible to preallocate output buffers + # because the output shapes are not inferred. + self.preallocate_output = False - # rethrow FakeTensorProb failure because it is not yet currently handled. - raise - self._ort_execution_info.example_outputs[graph_module] = prim_outputs + # rethrow FakeTensorProb failure because it is not yet currently handled. + raise from torch.onnx._internal.fx import fx_onnx_interpreter @@ -500,7 +619,6 @@ def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwar onnx_session = _create_onnx_session(onnx_proto, selected_eps, self.session_options) # Cache ORT session. It's reused for the same "graph_module". - self._ort_execution_info.sessions[graph_module] = onnx_session # Generate ONNX model and extract its input and output names. onnx_model = _create_onnx_model(onnx_proto) # TODO(wechi): ORT session should provide a API to extract @@ -515,10 +633,19 @@ def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwar output_devices = _get_onnx_devices(prim_outputs) else: output_devices = _get_onnx_devices((prim_outputs,)) - self._ort_execution_info.input_names[graph_module] = input_names - self._ort_execution_info.output_names[graph_module] = output_names - self._ort_execution_info.input_devices[graph_module] = input_devices - self._ort_execution_info.output_devices[graph_module] = output_devices + + execution_info_per_session = OrtExecutionInfoPerSession( + session=onnx_session, + input_names=input_names, + input_value_infos=tuple(input for input in onnx_model.graph.input), + output_names=output_names, + output_value_infos=tuple(output for output in onnx_model.graph.output), + input_devices=input_devices, + output_devices=output_devices, + example_outputs=prim_outputs, + ) + + self._all_ort_execution_info.cache_session_execution_info(graph_module, execution_info_per_session) if isinstance(prim_outputs, tuple): assert all(isinstance(elem, torch.Tensor) for elem in prim_outputs) @@ -536,7 +663,7 @@ def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwar self.preallocate_output, ) _nvtx_range_pop() - if self._ort_execution_info.assert_allclose_to_baseline: + if self._assert_allclose_to_baseline: # Compute baseline. baseline_outputs = torch._prims.executor.execute(graph_module, *args, executor="aten") # Ensure every output tensor is close to the corresponding baseline. @@ -559,7 +686,7 @@ def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwar self.preallocate_output, ) assert len(onnx_outputs) == 1 - if self._ort_execution_info.assert_allclose_to_baseline: + if self._assert_allclose_to_baseline: # Compute baseline. 
baseline_outputs = torch._prims.executor.execute(graph_module, *args, executor="aten") # Ensure output tensor is close to the corresponding baseline. diff --git a/orttraining/orttraining/python/training/torchdynamo/register_backend.py b/orttraining/orttraining/python/training/torchdynamo/register_backend.py index 9030c6f8fb86d..3a49e85ab836d 100644 --- a/orttraining/orttraining/python/training/torchdynamo/register_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/register_backend.py @@ -5,14 +5,37 @@ from functorch.compile import min_cut_rematerialization_partition from torch._dynamo.backends.common import aot_autograd +from torch.onnx._internal.exporter import ExportOptions from .ort_backend import OrtBackend -# This should be the underlying compiler for ALL graphs if -# the user uses ORT to accelerate PyTorch via Dynamo. -# By using a global compiler for all graphs, cached compilation -# results can be reused when encountering the identical graphs. -DEFAULT_BACKEND = OrtBackend() + +def make_aot_ort(dynamic: bool = True): + """Wrap OrtBackend as PyTorch's AOT compiler. + + Example usages: + import torch + from onnxruntime.training.torchdynamo.register_backend import make_aot_ort + use_dynamic = True + local_aot_ort, _ = make_aot_ort(dynamic = use_dynamic) + + @torch._dynamo.optimize(local_aot_ort, dynamic=use_dynamic) + def foo(x: torch.Tensor): + return torch.sigmoid(x) + + x = torch.rand(2, 2, dtype=torch.float) + torch.testing.assert_close(torch.sigmoid(x), foo(x)) + """ + ort_backend = OrtBackend(onnx_exporter_options=ExportOptions(dynamic_shapes=dynamic)) + return ( + aot_autograd( + fw_compiler=ort_backend, + partition_fn=min_cut_rematerialization_partition, + decompositions=ort_backend.resolved_onnx_exporter_options.decomposition_table, + ), + ort_backend, + ) + # Wrap ORT as a compiler in Dynamo for training (i.e., when .backward is called). # @@ -28,12 +51,24 @@ # compiled_model = torch._dynamo.optimize(aot_ort)(model) # result = compiled_model(torch.rand(2, 2, dtype=torch.float) # result.sum().backward() +# +# DEFAULT_BACKEND should be the underlying compiler for ALL graphs if +# the user uses ORT to accelerate PyTorch via Dynamo. +# By using a global compiler for all graphs, cached compilation +# results can be reused when encountering the identical graphs. +aot_ort, DEFAULT_BACKEND = make_aot_ort(dynamic=False) -aot_ort = aot_autograd( - fw_compiler=DEFAULT_BACKEND, - partition_fn=min_cut_rematerialization_partition, - decompositions=DEFAULT_BACKEND.resolved_onnx_exporter_options.decomposition_table, -) +# Similar to aot_ort but should be used with +# torch._dynamo.optimize(dynamic_aot_ort, dynamic=True) +# to enable dynamic shapes in ONNX graph. +# +# Similar to DEFAULT_BACKEND but DEFAULT_DYNAMIC_BACKEND enables dynamic shapes +# when exporting FX graph to ONNX. +# Note that this backend must be used with +# torch._dynamo.optimize(DEFAULT_DYNAMIC_BACKEND, dynamic=True) +# Without `dynamic=True`, the FX graph only contains static shapes, and results ONNX graph +# with static shapes. +dynamic_aot_ort, DEFAULT_DYNAMIC_BACKEND = make_aot_ort(dynamic=True) # Declare ORT as a compiler in Dynamo for inference (i.e., when .backward is NOT called). # @@ -47,3 +82,8 @@ # model = torch.nn.Linear(2, 2) # compiled_model = torch._dynamo.optimize(ort)(model) ort = DEFAULT_BACKEND + +# Similar to ort but should be used with +# torch._dynamo.optimize(dynamic_ort, dynamic=True) +# to enable dynamic shapes in ONNX graph. 
+dynamic_ort = DEFAULT_DYNAMIC_BACKEND diff --git a/orttraining/orttraining/test/python/orttraining_test_dort.py b/orttraining/orttraining/test/python/orttraining_test_dort.py index 6f4295d6b2a24..88d9c00984d3e 100644 --- a/orttraining/orttraining/test/python/orttraining_test_dort.py +++ b/orttraining/orttraining/test/python/orttraining_test_dort.py @@ -4,10 +4,13 @@ import unittest import torch +import torch._dynamo +import torch.onnx._internal.exporter from torch import nn from torch.nn import functional as F +from torch.utils import _pytree -from onnxruntime.training.torchdynamo.register_backend import aot_ort, ort +from onnxruntime.training.torchdynamo.register_backend import aot_ort, dynamic_aot_ort, make_aot_ort, ort class TestTorchDynamoOrt(unittest.TestCase): @@ -55,6 +58,141 @@ def run(fun, list_x): for _ in range(5): run_elementwise_model() + def test_dynamo_shape_model(self): + torch._dynamo.reset() + """Test DORT with a pure function.""" + + def run_elementwise_model(): + # A function to test DORT. + def elementwise_model(tensor_x: torch.Tensor): + tensor_y = tensor_x.sigmoid() + tensor_z = tensor_y + tensor_x + tensor_p = tensor_z * tensor_x + tensor_q = tensor_p.sigmoid() + return tensor_q + + # This function should only generate one graph and execute + # it for all inputs. + # With dynamic_shape=True, Dynamo sends FX graphs with dynamic + # shapes (e.g., batch size is a symbol "batch" instead of a fixed + # number) to OrtBackend.compile(...). + @torch._dynamo.optimize(dynamic_aot_ort, dynamic=True) + def optimized_elementwise_model(tensor_x: torch.Tensor): + return elementwise_model(tensor_x) + + def run(fun, seed: torch.Tensor): + tensor_x = seed.detach().clone().requires_grad_() + tensor_y = fun(tensor_x) + tensor_y.sum().backward() + return tensor_x, tensor_y, tensor_x.grad + + # Dimension changed. + for shape in [(2, 3), (3, 4)]: + seed = torch.rand(shape) + # Baseline. + tensor_x, tensor_y, tensor_x_grad = run(elementwise_model, seed) + # ORT result. + tensor_x_new, tensor_y_new, tensor_x_grad_new = run(optimized_elementwise_model, seed) + + torch.testing.assert_close(tensor_x, tensor_x_new) + torch.testing.assert_close(tensor_y, tensor_y_new) + torch.testing.assert_close(tensor_x_grad, tensor_x_grad_new) + + # Rank changed. + for shape in [(1,), (2,), (2, 3), (2, 3, 4)]: + seed = torch.rand(shape) + # Baseline. + tensor_x, tensor_y, tensor_x_grad = run(elementwise_model, seed) + # ORT result. + tensor_x_new, tensor_y_new, tensor_x_grad_new = run(optimized_elementwise_model, seed) + + torch.testing.assert_close(tensor_x, tensor_x_new) + torch.testing.assert_close(tensor_y, tensor_y_new) + torch.testing.assert_close(tensor_x_grad, tensor_x_grad_new) + + run_elementwise_model() + + def test_elementwise_model_with_dynamic_shapes_and_complicated_output_schema(self): + torch._dynamo.reset() + + def run_elementwise_model(): + # A function to test DORT. + def elementwise_model(tensor_x: torch.Tensor): + tensor_y = tensor_x.sigmoid() + tensor_z = tensor_y + tensor_x + tensor_p = tensor_z * tensor_x + tensor_q = tensor_p.sigmoid() + return (tensor_q, (tensor_y, tensor_z)) + + local_aot_ort, ort_backend = make_aot_ort(dynamic=True) + cached = ort_backend._all_ort_execution_info.execution_info_per_graph_module + # Before compilation, no graph is generated. + assert len(cached) == 0 + + # This function should only generate one graph and execute + # it for all inputs. 
+ # With dynamic_shape=True, Dynamo sends FX graphs with dynamic + # shapes (e.g., batch size is a symbol "batch" instead of a fixed + # number) to OrtBackend.compile(...). + @torch._dynamo.optimize(local_aot_ort, dynamic=True) + def optimized_elementwise_model(tensor_x: torch.Tensor): + return elementwise_model(tensor_x) + + def run(fun, seed: torch.Tensor): + tensor_x = seed.detach().clone().requires_grad_() + result = fun(tensor_x) + forward_outputs, _ = _pytree.tree_flatten(result) + result[0].sum().backward() + return (tensor_x, *forward_outputs, tensor_x.grad) + + # Dimension changed. + for shape in [(2, 3), (3, 4)]: + seed = torch.rand(shape) + # Baseline. + baseline_tensors = run(elementwise_model, seed) + # ORT result. + tensors = run(optimized_elementwise_model, seed) + + for tensor, baseline_tensor in zip(tensors, baseline_tensors): + torch.testing.assert_close(tensor, baseline_tensor) + + assert ( + len(cached.keys()) == 2 + ), "Should only see two GraphModules so far. One for forward and the other one for backward." + for value in cached.values(): + assert len(value) == 1, ( + "One GraphModule should only be mapped to one ONNX model since " + "dynamic shape is enabled and input tensor's rank is unchanged." + ) + + # Rank changed. + for shape in [(1,), (2,), (2, 3), (2, 3, 4)]: + seed = torch.rand(shape) + # Baseline. + baseline_tensors = run(elementwise_model, seed) + # ORT result. + tensors = run(optimized_elementwise_model, seed) + + for tensor, baseline_tensor in zip(tensors, baseline_tensors): + torch.testing.assert_close(tensor, baseline_tensor) + + # 4 GraphModule's respectively for + # - (1,) + # - (2,) + # - (2, 3), (3, 4) + # - (2, 3, 4) + # Because (1,) is treated as a special dimension in Dynamo, + # we can NOT merge (1,) and (2,). More specifically, their GraphModule's + # are hashed to different values. + # Another 4 GraphModule's for the corresponding backward passes. + assert len(cached.keys()) == 8 + for value in cached.values(): + # When dynamic shape is enabled, there should be only one ONNX model + # for inputs with the same rank. + assert len(value) == 1 + + run_elementwise_model() + def test_elementwise_model_for_inference(self): torch._dynamo.reset() diff --git a/orttraining/orttraining/test/python/orttraining_test_dort_custom_ops.py b/orttraining/orttraining/test/python/orttraining_test_dort_custom_ops.py index 9c18b0347fb98..338a5212f6385 100644 --- a/orttraining/orttraining/test/python/orttraining_test_dort_custom_ops.py +++ b/orttraining/orttraining/test/python/orttraining_test_dort_custom_ops.py @@ -11,6 +11,7 @@ from functorch.compile import min_cut_rematerialization_partition from torch._dynamo.backends.common import aot_autograd from torch.library import Library +from torch.onnx._internal.exporter import ExportOptions import onnxruntime from onnxruntime.training.torchdynamo.ort_backend import OrtBackend @@ -90,12 +91,22 @@ def create_onnxruntime_session_options(): session_options.register_custom_ops_library(custom_op_library_path) return session_options - def test_DORT_custom_ops(self): - torch._dynamo.reset() + def test_export_aten_mul_as_onnx_custom_op_and_run_ort(self): + """A Custom Operator Test for DORT - session_options = TestTorchDynamoOrtCustomOp.create_onnxruntime_session_options() + In this test, aten.mul.Tensor is exported to test.customop::CustomOpOne and + executed by ORT. + """ + torch._dynamo.reset() - ort_backend = OrtBackend(ep="CPUExecutionProvider", session_options=session_options) + # Create executor of ONNX model. 
+ # We will register a custom exporter for aten.mul.Tensor + # in the following step. + ort_backend = OrtBackend( + ep="CPUExecutionProvider", + session_options=TestTorchDynamoOrtCustomOp.create_onnxruntime_session_options(), + onnx_exporter_options=ExportOptions(dynamic_shapes=True), + ) # Register custom_exporter_for_aten_add_Tensor as "aten::mul.Tensor"'s # exporter. # Use custom_exporter_for_aten_add_Tensor.to_function_proto() to see @@ -107,6 +118,7 @@ def test_DORT_custom_ops(self): overload="Tensor", ) + # Wrap ORT executor as a Dynamo backend. aot_ort = aot_autograd( fw_compiler=ort_backend, partition_fn=min_cut_rematerialization_partition, @@ -126,7 +138,15 @@ def one_mul(tensor_x: torch.Tensor, tensor_y: torch.Tensor): result_ort = opt_mul(tensor_x, tensor_y) torch.testing.assert_close(result_ref, result_ort) - def test_dort_with_custom_torch_op_library(self): + def test_export_pytorch_custom_op_to_onnx_custom_op_and_run_ort(self): + """A Custom Operator Test. + + In this test, torch.ops.foo.bar.default is exported to + test.customop::CustomOpOne and executed by ORT. + + See test_export_aten_mul_as_onnx_custom_op_and_run_ort for mapping + official PyTorch operator (e.g., aten.mul.Tensor) to ONNX custom operator. + """ torch._dynamo.reset() foo_lib = Library("foo", "DEF") @@ -139,20 +159,22 @@ def bar_impl(self: torch.Tensor) -> torch.Tensor: foo_lib.impl(bar_name, bar_impl, "CompositeExplicitAutograd") - # TODO(wechi): Redesign API to expose this better. - - session_options = TestTorchDynamoOrtCustomOp.create_onnxruntime_session_options() - ort_backend = OrtBackend(ep="CPUExecutionProvider", session_options=session_options) + # Create executor of ONNX model. + ort_backend = OrtBackend( + ep="CPUExecutionProvider", session_options=TestTorchDynamoOrtCustomOp.create_onnxruntime_session_options() + ) # Allow torch.ops.foo.bar.default to be sent to DORT. # _support_dict tells Dynamo which ops to sent to DORT. ort_backend._supported_ops._support_dict.add(torch.ops.foo.bar.default) # Ask exporter to map "torch.ops.foo.bar" to # custom_exporter_for_foo_bar_default. + # TODO(wechi): Redesign API to expose this better. ort_backend.resolved_onnx_exporter_options.onnxfunction_dispatcher.onnx_registry.register_custom_op( function=custom_exporter_for_foo_bar_default, namespace="foo", op_name="bar", ) + # Wrap ORT executor as a Dynamo backend. aot_ort = aot_autograd( fw_compiler=ort_backend, partition_fn=min_cut_rematerialization_partition, From f2c0470436c39f4a6fb319c7c50e110b464bb7df Mon Sep 17 00:00:00 2001 From: pengwa Date: Tue, 25 Jul 2023 08:21:46 +0800 Subject: [PATCH 31/34] Fix slice upstream - Incompatible dimensions (#16818) ### Fix slice upstream - (MatMul) [ShapeInferenceError] Incompatible dimensions ``` 2023-07-22 14:58:16.918478478 [I:onnxruntime:Default, constant_sharing.cc:256 ApplyImpl] Total shared scalar initializer count: 10 2023-07-22 14:58:16.919494252 [W:onnxruntime:Default, graph.cc:108 MergeShapeInfo] Error merging shape info for output. 'onnx::Cast_424' source:{-1,31,-1,-1} target:{-1,32,-1,-1}. Falling back to lenient merge. 2023-07-22 14:58:16.921014114 [W:onnxruntime:Default, graph.cc:108 MergeShapeInfo] Error merging shape info for output. 'onnx::MatMul_425' source:{-1,31,-1,-1} target:{-1,32,-1,-1}. Falling back to lenient merge. 
Traceback (most recent call last):
  File "examples/onnxruntime/training/language-modeling/run_clm.py", line 594, in <module>
    main()
  File "examples/onnxruntime/training/language-modeling/run_clm.py", line 542, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/bert_ort/pengwa/optimum/optimum/onnxruntime/trainer.py", line 454, in train
    return inner_training_loop(
  File "/bert_ort/pengwa/optimum/optimum/onnxruntime/trainer.py", line 755, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/transformers/trainer.py", line 2735, in training_step
    loss = self.compute_loss(model, inputs)
  File "/bert_ort/pengwa/optimum/optimum/onnxruntime/trainer.py", line 363, in compute_loss
    return model_with_loss(dict_inputs, return_outputs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1724, in forward
    loss = self.module(*inputs, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_utils.py", line 384, in _forward
    return ortmodule._torch_module.forward(*inputs, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_utils.py", line 364, in _forward
    return torch_module_ort._execution_manager(torch_module_ort.is_training()).forward(*inputs, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_training_manager.py", line 345, in forward
    self._fallback_manager.handle_exception(
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_fallback.py", line 157, in handle_exception
    raise exception
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_training_manager.py", line 280, in forward
    self._build_graph(graph_transformer_config)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_logger.py", line 218, in wrapper
    result = func(graph_execution_manager, *args, **kwargs)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_training_manager.py", line 360, in _build_graph
    super()._build_graph(graph_transformer_config)
  File "/bert_ort/pengwa/py38/lib/python3.8/site-packages/onnxruntime/training/ortmodule/_graph_execution_manager.py", line 186, in _build_graph
    self._graph_builder.build(config)
RuntimeError: /bert_ort/pengwa/onnxruntime/orttraining/orttraining/python/orttraining_pybind_state.cc:823 onnxruntime::python::addObjectMethodsForTraining(pybind11::module&, onnxruntime::python::ExecutionProviderRegistrationFn):: [ONNXRuntimeError] : 1 : FAIL : Node (MatMul_403) Op (MatMul) [ShapeInferenceError] Incompatible dimensions
```

The upstream-gather compute optimizer previously assumed the `Slice` axis was carried as an `axis` attribute, but `Slice` takes its axes as the `axes` input, so the optimizer is changed to read and rewrite the `axes` input instead.
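For background on why the fix switches from an attribute to an input (a minimal sketch with `onnx.helper`, not code from this PR): unlike `Gather`, which keeps its axis in an `axis` attribute, `Slice` since opset 10 takes `starts`/`ends`/`axes`/`steps` as inputs, so retargeting the slicing axis means supplying a 1-D `axes` initializer rather than rewriting an attribute.

```python
from onnx import TensorProto, helper

# Gather keeps its axis as an attribute, so changing the axis is an attribute rewrite.
gather = helper.make_node("Gather", ["data", "indices"], ["gathered"], axis=1)

# Slice (opset >= 10) reads its axes from the 4th input instead, so the optimizer
# has to create a fresh 1-D axes initializer when the slicing axis changes.
new_axes = helper.make_tensor("slice_axes", TensorProto.INT64, dims=[1], vals=[1])
slice_node = helper.make_node("Slice", ["data", "starts", "ends", "slice_axes"], ["sliced"])
```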
### Motivation and Context --- .../compute_optimizer/shared_utils.cc | 4 +- .../compute_optimizer/upstream_gather.cc | 83 +++- .../upstream_gather_actors.h | 24 +- .../test/optimizer/compute_optimizer_test.cc | 375 ++++++++++++++---- 4 files changed, 397 insertions(+), 89 deletions(-) diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc index d076a9a1f3eec..913f3b6811183 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc @@ -184,7 +184,9 @@ NodeArg* CreateInitializerFromVector(Graph& graph, total_count *= dim; } - ORT_ENFORCE(total_count == static_cast(values.size())); + ORT_ENFORCE(total_count == static_cast(values.size()), + "The total count of dims does not match the size of values. ", + "total_count: ", total_count, " values.size(): ", values.size()); const_tensor.set_raw_data(values.data(), values.size() * sizeof(int64_t)); return &graph_utils::AddInitializer(graph, const_tensor); diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc index 9ad5edf4f21b1..094ea1e24dd92 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc @@ -138,20 +138,65 @@ SliceInfo UpStreamGatherGraphTransformer::PropagateSlicingForInput( std::to_string(!info.is_scalar_slice)); InlinedVector input_args; - input_args.reserve(slice_node.InputDefs().size()); + input_args.resize(slice_node.InputDefs().size()); + + int axis_input_index = -1; // -1 means axis is passed in attribute. + if (std::holds_alternative(info.axis_attr_name_or_input_index)) { + axis_input_index = std::get(info.axis_attr_name_or_input_index); + } + + auto create_axes_input = [&info, new_axis, &graph]() -> NodeArg* { + InlinedVector dims; + if (info.rank_of_axis_value == 1) { + dims.push_back(1); + } + return CreateInitializerFromVector(graph, dims, {new_axis}, graph.GenerateNodeArgName("axes")); + }; + // The first slice op's data input should be current_node's current_node_input_index-th input. // For some cases when rank changes, slice op's slice input should also be adapted. - input_args.push_back(current_node.MutableInputDefs()[current_node_input_index]); - for (size_t i = 1; i < slice_node.InputDefs().size(); ++i) { - input_args.push_back(slice_node.MutableInputDefs()[i]); + int i = 0; + for (; i < static_cast(slice_node.InputDefs().size()); ++i) { + if (i == info.GetDataInputIndex()) { + input_args[i] = current_node.MutableInputDefs()[current_node_input_index]; + } else if (axis_input_index != -1 && i == axis_input_index) { + if (info.non_negative_axis == new_axis) { + input_args[i] = slice_node.MutableInputDefs()[i]; + } else { + input_args[i] = create_axes_input(); + } + } else { + input_args[i] = slice_node.MutableInputDefs()[i]; + } + } + + // It is possible axes input is null. 
+ if (axis_input_index != -1 && info.non_negative_axis != new_axis) { + for (; i <= axis_input_index; ++i) { + if (i == axis_input_index) { + input_args.push_back(create_axes_input()); + } else { + NodeArg& empty_input = graph.GetOrCreateNodeArg("", nullptr); + input_args.push_back(&empty_input); + } + } } // Update the axis attribute if new_axis is not the same as the original slicing axis (which happens when data // layout got changed by Transpose or Reshape ops) onnxruntime::NodeAttributes attributes = slice_node.GetAttributes(); - if (info.non_negative_axis != new_axis) { - attributes[info.axis_attr_name] = - ONNX_NAMESPACE::MakeAttribute(info.axis_attr_name, static_cast(new_axis)); + + if (axis_input_index == -1 && info.non_negative_axis != new_axis) { + std::string attr_name = std::get(info.axis_attr_name_or_input_index); + if (info.rank_of_axis_value == 0) { + attributes[attr_name] = + ONNX_NAMESPACE::MakeAttribute(attr_name, static_cast(new_axis)); + } else if (info.rank_of_axis_value == 1) { + attributes[attr_name] = + ONNX_NAMESPACE::MakeAttribute(attr_name, std::vector{static_cast(new_axis)}); + } else { + ORT_THROW("Unexpected rank of axis attribute value: " + std::to_string(info.rank_of_axis_value)); + } } InlinedVector output_args; @@ -183,7 +228,8 @@ SliceInfo UpStreamGatherGraphTransformer::PropagateSlicingForInput( auto new_slice_out_arg = new_slice_node->MutableOutputDefs()[new_slice_output_index_to_connect]; UpdateSliceOutputShape(*new_slice_out_arg, new_axis, info.output_dim_on_axis); - auto new_slice_info = SliceInfo(graph, new_slice_node, info.is_scalar_slice, info.axis_attr_name, new_axis); + auto new_slice_info = SliceInfo(graph, new_slice_node, info.is_scalar_slice, info.axis_attr_name_or_input_index, + new_axis, info.rank_of_axis_value); new_slice_info.entry_node_name = info.entry_node_name; new_slice_info.entry_slice_arg_name = info.entry_slice_arg_name; return new_slice_info; @@ -263,7 +309,8 @@ std::optional IsSupportedGatherND(Graph& graph, Node& node, return std::nullopt; } - return SliceInfo(graph, &node, false, "batch_dims", static_cast(batch_dims), true); + return SliceInfo(graph, &node, false, "batch_dims", static_cast(batch_dims), + 0 /* rank of axis attribute value */, true); } std::optional IsSupportedGather(Graph& graph, Node& node, @@ -304,7 +351,7 @@ std::optional IsSupportedGather(Graph& graph, Node& node, } } - return SliceInfo(graph, &node, dim_size == 0, "axis", axis, true); + return SliceInfo(graph, &node, dim_size == 0, "axis", axis, 0 /* rank of axis attribute value */, true); } std::optional IsSupportedShrunkenGather(Graph& graph, Node& node, @@ -342,7 +389,7 @@ std::optional IsSupportedShrunkenGather(Graph& graph, Node& node, return std::nullopt; } - return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, true); + return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, 0 /* rank of axis attribute value */, true); } /** @@ -366,34 +413,37 @@ std::optional IsSupportedSlice(Graph& graph, Node& node, const NodeArg* axes_input = node.InputDefs().size() > 3 ? 
node.InputDefs()[3] : nullptr; if (data_input->Shape() == nullptr || starts_input->Shape() == nullptr || ends_input->Shape() == nullptr || - (axes_input && axes_input->Shape() == nullptr)) { + (axes_input && axes_input->Exists() && axes_input->Shape() == nullptr)) { LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to undefined shape."); return std::nullopt; } // Make sure starts/ends/axes/steps are all 1D tensors, since we only support single-dimension slicing. if (starts_input->Shape()->dim_size() != 1 || ends_input->Shape()->dim_size() != 1 || - (axes_input && axes_input->Shape()->dim_size() != 1)) { + (axes_input && axes_input->Exists() && axes_input->Shape()->dim_size() != 1)) { LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to unsupported dim size: " + std::to_string(starts_input->Shape()->dim_size()) + ", " + std::to_string(ends_input->Shape()->dim_size()) + ", " + - std::to_string(axes_input ? axes_input->Shape()->dim_size() : 0)); + std::to_string(axes_input && axes_input->Exists() ? axes_input->Shape()->dim_size() : 0)); return std::nullopt; } // Try to parse the 'axes' value. int axis = 0; - if (axes_input) { + if (axes_input && axes_input->Exists()) { InlinedVector axes_values; if (!graph_utils::IsConstantInitializer(graph, axes_input->Name()) || !optimizer_utils::AppendTensorFromInitializer(graph, *axes_input, axes_values, true) || axes_values.size() != 1) { + LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to unsupported axes value."); return std::nullopt; } axis = static_cast(axes_values[0]); } else { // If 'axes' is not specified, then it is [0, .., r-1], so we force data rank to be 1. if (data_input->Shape()->dim_size() != 1) { + LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to unsupported data rank: " + + std::to_string(data_input->Shape()->dim_size())); return std::nullopt; } } @@ -401,7 +451,8 @@ std::optional IsSupportedSlice(Graph& graph, Node& node, if (axis < 0) axis += data_input->Shape()->dim_size(); - return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, true); + return SliceInfo(graph, &node, false /*is_slice_scalar*/, 3 /* axis input index */, axis, + 1 /* rank of axes value */, true); } } // namespace diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather_actors.h b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather_actors.h index 514368cea16c7..f6715e4bb1f32 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather_actors.h +++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather_actors.h @@ -25,11 +25,21 @@ struct SliceInfo : public UpstreamOperatorInfoBase { public: SliceInfo(const Graph& graph, Node* slice_node, bool is_slice_scalar, - const std::string& slice_axis_attr_name, + std::variant axis_name_or_index, int slice_axis, + int rank_of_axis, bool is_entry_node_ptr = false) : UpstreamOperatorInfoBase(slice_node, is_entry_node_ptr), is_scalar_slice(is_slice_scalar) { - axis_attr_name = slice_axis_attr_name; + axis_attr_name_or_input_index = axis_name_or_index; + rank_of_axis_value = rank_of_axis; + + if (std::holds_alternative(axis_name_or_index)) { + int axis_input_index = std::get(axis_name_or_index); + ORT_ENFORCE(axis_input_index >= 0, "Axis input index is invalid"); + } + + ORT_ENFORCE(rank_of_axis_value == 0 || rank_of_axis_value == 1, "Rank of axis value is invalid: " + + std::to_string(rank_of_axis_value)); const NodeArg* input = node_ptr->InputDefs()[kSliceDataInputIndex_]; const NodeArg* output = 
node_ptr->OutputDefs()[kSliceOutputIndex_]; @@ -65,8 +75,16 @@ struct SliceInfo : public UpstreamOperatorInfoBase { } bool is_scalar_slice; // whether the slice is a scalar, if it is after Gather, the rank will be reduced by 1. - std::string axis_attr_name; + + // The index of the input that contains the axis value. If it is a string, then axis will be treated as an attribute. + std::variant axis_attr_name_or_input_index; + int non_negative_axis; // The axis to slice on + + // The rank of value for axis attribute. For example, for Gather, its axis attribute is a scalar, so the rank is 0. + // For Slice, its axes attribute is a 1D tensor, so the rank is 1. + int rank_of_axis_value; + std::string entry_slice_arg_name; int input_rank; // rank of the Gather data input tensor diff --git a/onnxruntime/test/optimizer/compute_optimizer_test.cc b/onnxruntime/test/optimizer/compute_optimizer_test.cc index 55a78648205e1..fe2a49577ca10 100644 --- a/onnxruntime/test/optimizer/compute_optimizer_test.cc +++ b/onnxruntime/test/optimizer/compute_optimizer_test.cc @@ -18,6 +18,7 @@ #include "core/common/span_utils.h" #include "core/framework/data_types.h" #include "core/framework/ort_value.h" +#include "core/framework/tensorprotoutils.h" #include "core/graph/graph_utils.h" #include "core/graph/graph_viewer.h" #include "core/graph/model.h" @@ -1597,93 +1598,329 @@ Test graph includes multiple equivalent subgraphs as below. Add an Identity node because currently we don't allow Slice generates graph output. */ TEST(ComputeOptimizerTests, SliceElementwiseOps_PropagationOnTwoBranches) { - const logging::Logger* logger = &logging::LoggingManager::DefaultLogger(); - InlinedVector starts_indices; - auto pre_graph_checker = [&starts_indices](Graph& graph) -> Status { - auto op_count_pre = CountOpsInGraph(graph); - TEST_RETURN_IF_NOT(op_count_pre.size() == 3U); - TEST_RETURN_IF_NOT(op_count_pre["Add"] == 1); - TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1); - TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1); + // 0: no input, 1: has input, 2: empty input + std::vector, std::vector, int, int, bool>> has_axes_and_has_steps_pairs{ + {std::nullopt, {4, 32, 256}, 0, 0, false}, // {axis, data_shape, has_axes, has_steps, expected to propagate} + {1, {4, 32, 256}, 1, 0, true}, + {1, {4, 32, 256}, 1, 1, true}, + {1, {4, 32, 256}, 1, 2, true}, + {std::nullopt, {4, 32, 256}, 2, 0, false}, + {std::nullopt, {4, 32, 256}, 2, 1, false}, + {std::nullopt, {4, 32, 256}, 2, 2, false}, + + {std::nullopt, {256}, 0, 0, true}, + {0, {256}, 1, 0, true}, + {0, {256}, 1, 1, true}, + {0, {256}, 1, 2, true}, + {std::nullopt, {256}, 2, 0, true}, + {std::nullopt, {256}, 2, 1, true}, + {std::nullopt, {256}, 2, 2, true}, + }; - for (Node& node : graph.Nodes()) { - if (node.OpType() == "Slice") { - TEST_RETURN_IF_NOT(starts_indices.empty()); - constexpr bool require_constant = true; - NodeArg* initializer_node_arg = graph.GetNodeArg(node.InputDefs()[1]->Name()); - TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, starts_indices, - require_constant)); + for (auto p : has_axes_and_has_steps_pairs) { + std::optional axis = std::get<0>(p); + std::vector data_shape = std::get<1>(p); + int has_axes = std::get<2>(p); + int has_steps = std::get<3>(p); + bool expected_to_propagate = std::get<4>(p); + + const logging::Logger* logger = &logging::LoggingManager::DefaultLogger(); + InlinedVector starts_indices; + auto pre_graph_checker = [&starts_indices](Graph& graph) -> Status { + auto op_count_pre = 
CountOpsInGraph(graph); + TEST_RETURN_IF_NOT(op_count_pre.size() == 3U); + TEST_RETURN_IF_NOT(op_count_pre["Add"] == 1); + TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1); + TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1); + + for (Node& node : graph.Nodes()) { + if (node.OpType() == "Slice") { + TEST_RETURN_IF_NOT(starts_indices.empty()); + constexpr bool require_constant = true; + NodeArg* initializer_node_arg = graph.GetNodeArg(node.InputDefs()[1]->Name()); + TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, starts_indices, + require_constant)); + } } - } - return Status::OK(); - }; + return Status::OK(); + }; - auto post_graph_checker = [&starts_indices](Graph& graph) { - auto op_count_post = CountOpsInGraph(graph); - TEST_RETURN_IF_NOT(op_count_post.size() == 3U); - TEST_RETURN_IF_NOT(op_count_post["Add"] == 1); - TEST_RETURN_IF_NOT(op_count_post["Slice"] == 2); - TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1); + auto post_graph_checker = [&starts_indices, expected_to_propagate](Graph& graph) { + auto op_count_post = CountOpsInGraph(graph); + TEST_RETURN_IF_NOT(op_count_post.size() == 3U); + TEST_RETURN_IF_NOT(op_count_post["Add"] == 1); + if (expected_to_propagate) { + TEST_RETURN_IF_NOT(op_count_post["Slice"] == 2); + } else { + TEST_RETURN_IF_NOT(op_count_post["Slice"] == 1); + } + TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1); + + for (Node& node : graph.Nodes()) { + if (node.OpType() == "Add") { + const auto& input_defs = node.InputDefs(); + + { + auto producer_node = graph.GetProducerNode(input_defs[0]->Name()); + + if (expected_to_propagate) { + TEST_RETURN_IF_NOT(producer_node != nullptr); + TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice"); + + InlinedVector values; + constexpr bool require_constant = true; + NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name()); + TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, + require_constant)); + for (size_t i = 0; i < values.size(); i++) { + TEST_RETURN_IF_NOT(values[i] == starts_indices[i]); + } + } else { + TEST_RETURN_IF_NOT(producer_node == nullptr); + } + } - for (Node& node : graph.Nodes()) { - if (node.OpType() == "Add") { - const auto& input_defs = node.InputDefs(); + { + auto producer_node = graph.GetProducerNode(input_defs[1]->Name()); + + if (expected_to_propagate) { + TEST_RETURN_IF_NOT(producer_node != nullptr); + TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice"); + + InlinedVector values; + constexpr bool require_constant = true; + NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name()); + TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, require_constant)); + for (size_t i = 0; i < values.size(); i++) { + TEST_RETURN_IF_NOT(values[i] == starts_indices[i]); + } + } else { + TEST_RETURN_IF_NOT(producer_node == nullptr); + } + } + } + } + return Status::OK(); + }; - { - auto producer_node = graph.GetProducerNode(input_defs[0]->Name()); - TEST_RETURN_IF_NOT(producer_node != nullptr); - TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice"); + auto build_test_case = [has_axes, has_steps, &data_shape, axis](ModelTestBuilder& builder) { + auto* input1_arg = builder.MakeInput(data_shape); + auto* input2_arg = builder.MakeInput(data_shape); + auto* add_out = builder.MakeIntermediate(); + builder.AddNode("Add", {input1_arg, input2_arg}, {add_out}); - InlinedVector values; + auto* starts_initializer 
= builder.MakeInitializer({1}, {0}); + auto* ends_initializer = builder.MakeInitializer({1}, {-1}); + + std::vector slice_inputs; + slice_inputs = {add_out, starts_initializer, ends_initializer}; + + NodeArg* axes_initializer = nullptr; + NodeArg* steps_initializer = nullptr; + if (has_axes == 0 && has_steps == 0) { + // nothing + } else if (has_axes == 1 && has_steps == 0) { + axes_initializer = builder.MakeInitializer({1}, {axis.value()}); + slice_inputs.push_back(axes_initializer); + } else if (has_axes == 1 && has_steps == 1) { + axes_initializer = builder.MakeInitializer({1}, {axis.value()}); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(steps_initializer); + } else if (has_axes == 1 && has_steps == 2) { + axes_initializer = builder.MakeInitializer({1}, {axis.value()}); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(steps_initializer); + } else if (has_axes == 2 && has_steps == 0) { + axes_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(axes_initializer); + } else if (has_axes == 2 && has_steps == 1) { + axes_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(steps_initializer); + } else if (has_axes == 2 && has_steps == 2) { + axes_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(steps_initializer); + } + + auto* slice_out = builder.MakeIntermediate(); + builder.AddNode("Slice", slice_inputs, + {slice_out}); + + auto* identity_out = builder.MakeOutput(); + builder.AddNode("Identity", {slice_out}, {identity_out}); + }; + + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer), + TransformerLevel::Level1, + 1, pre_graph_checker, post_graph_checker)); + } +} + +/* +Test graph includes multiple equivalent subgraphs as below. + graph input [2, 4, 32, 256] (float) + | + Transpose[perms=[0, 2, 1, 3]] + | + [2, 32, 4, 256] + | starts:(0) ends: (-1) axes: (1) steps: (1) + \ \ | / / + \ \ | / / + \ \ | / / + \ \ | / / + \ \ | / / + Slice + | + Identity + | + graph output [2, 31, 4, 256] (float) + +Add an Identity node because currently, we don't allow Slice generates graph output. 
+*/ +TEST(ComputeOptimizerTests, SliceTranspose_Propagation) { + // 0: no input, 1: has input, 2: empty input + std::vector> has_axes_and_has_steps_pairs{ + {0, 0, false}, // {has_axes, has_steps, expected to propagate} + {1, 0, true}, + {1, 1, true}, + {1, 2, true}, + {2, 0, false}, + {2, 1, false}, + {2, 2, false}, + }; + + for (auto p : has_axes_and_has_steps_pairs) { + int has_axes = std::get<0>(p); + int has_steps = std::get<1>(p); + bool expected_to_propagate = std::get<2>(p); + + const logging::Logger* logger = &logging::LoggingManager::DefaultLogger(); + InlinedVector starts_indices; + auto pre_graph_checker = [&starts_indices](Graph& graph) -> Status { + auto op_count_pre = CountOpsInGraph(graph); + TEST_RETURN_IF_NOT(op_count_pre.size() == 3U); + TEST_RETURN_IF_NOT(op_count_pre["Transpose"] == 1); + TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1); + TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1); + + for (Node& node : graph.Nodes()) { + if (node.OpType() == "Slice") { + TEST_RETURN_IF_NOT(starts_indices.empty()); constexpr bool require_constant = true; - NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name()); - TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, + NodeArg* initializer_node_arg = graph.GetNodeArg(node.InputDefs()[1]->Name()); + TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, starts_indices, require_constant)); - for (size_t i = 0; i < values.size(); i++) { - TEST_RETURN_IF_NOT(values[i] == starts_indices[i]); - } } + } + return Status::OK(); + }; - { - auto producer_node = graph.GetProducerNode(input_defs[1]->Name()); - TEST_RETURN_IF_NOT(producer_node != nullptr); - TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice"); + auto post_graph_checker = [&starts_indices, expected_to_propagate](Graph& graph) { + auto op_count_post = CountOpsInGraph(graph); - InlinedVector values; - constexpr bool require_constant = true; - NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name()); - TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, require_constant)); - for (size_t i = 0; i < values.size(); i++) { - TEST_RETURN_IF_NOT(values[i] == starts_indices[i]); + TEST_RETURN_IF_NOT(op_count_post.size() == 3U); + TEST_RETURN_IF_NOT(op_count_post["Transpose"] == 1); + TEST_RETURN_IF_NOT(op_count_post["Slice"] == 1); + TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1); + + for (Node& node : graph.Nodes()) { + if (node.OpType() == "Transpose") { + const auto& input_defs = node.InputDefs(); + + auto producer_node = graph.GetProducerNode(input_defs[0]->Name()); + if (expected_to_propagate) { + TEST_RETURN_IF_NOT(producer_node != nullptr); + TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice"); + + InlinedVector values; + constexpr bool require_constant = true; + NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name()); + TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, + require_constant)); + for (size_t i = 0; i < values.size(); i++) { + TEST_RETURN_IF_NOT(values[i] == starts_indices[i]); + } + + const ONNX_NAMESPACE::TensorShapeProto* slice_out_shape = producer_node->OutputDefs()[0]->Shape(); + TEST_RETURN_IF_NOT(slice_out_shape != nullptr); + TEST_RETURN_IF_NOT(slice_out_shape->dim_size() == 4); + TEST_RETURN_IF_NOT(utils::HasDimValue(slice_out_shape->dim(0)) && 
slice_out_shape->dim(0).dim_value() == 2); + TEST_RETURN_IF_NOT(utils::HasDimValue(slice_out_shape->dim(1)) && slice_out_shape->dim(1).dim_value() == 4); + TEST_RETURN_IF_NOT(utils::HasDimValue(slice_out_shape->dim(2)) && slice_out_shape->dim(2).dim_value() == 31); + TEST_RETURN_IF_NOT(utils::HasDimValue(slice_out_shape->dim(3)) && slice_out_shape->dim(3).dim_value() == 256); + } else { + TEST_RETURN_IF_NOT(producer_node == nullptr); } } } - } - return Status::OK(); - }; - auto build_test_case = [](ModelTestBuilder& builder) { - auto* input1_arg = builder.MakeInput({{4, 32, 256}}); - auto* input2_arg = builder.MakeInput({{4, 32, 256}}); - auto* add_out = builder.MakeIntermediate(); - builder.AddNode("Add", {input1_arg, input2_arg}, {add_out}); + return Status::OK(); + }; - auto* starts_initializer = builder.MakeInitializer({1}, {0}); - auto* ends_initializer = builder.MakeInitializer({1}, {-1}); - auto* axes_initializer = builder.MakeInitializer({1}, {1}); - auto* steps_initializer = builder.MakeInitializer({1}, {1}); - auto* slice_out = builder.MakeIntermediate(); - builder.AddNode("Slice", {add_out, starts_initializer, ends_initializer, axes_initializer, steps_initializer}, - {slice_out}); + auto build_test_case = [has_axes, has_steps](ModelTestBuilder& builder) { + auto* input1_arg = builder.MakeInput({{2, 4, 32, 256}}); + auto* trans_out = builder.MakeIntermediate(); + builder.AddNode("Transpose", {input1_arg}, {trans_out}) + .AddAttribute("perm", std::vector{0, 2, 1, 3}); + + std::vector slice_inputs; + + auto* starts_initializer = builder.MakeInitializer({1}, {0}); + auto* ends_initializer = builder.MakeInitializer({1}, {-1}); + + slice_inputs = {trans_out, starts_initializer, ends_initializer}; + + NodeArg* axes_initializer = nullptr; + NodeArg* steps_initializer = nullptr; + if (has_axes == 0 && has_steps == 0) { + // nothing + } else if (has_axes == 1 && has_steps == 0) { + axes_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(axes_initializer); + } else if (has_axes == 1 && has_steps == 1) { + axes_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(steps_initializer); + } else if (has_axes == 1 && has_steps == 2) { + axes_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(steps_initializer); + } else if (has_axes == 2 && has_steps == 0) { + axes_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(axes_initializer); + } else if (has_axes == 2 && has_steps == 1) { + axes_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeInitializer({1}, {1}); + slice_inputs.push_back(steps_initializer); + } else if (has_axes == 2 && has_steps == 2) { + axes_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(axes_initializer); + steps_initializer = builder.MakeEmptyInput(); + slice_inputs.push_back(steps_initializer); + } - auto* identity_out = builder.MakeOutput(); - builder.AddNode("Identity", {slice_out}, {identity_out}); - }; + auto* slice_out = builder.MakeIntermediate(); + builder.AddNode("Slice", slice_inputs, + {slice_out}); - std::unique_ptr transformer = std::make_unique(); - ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer), - TransformerLevel::Level1, - 1, pre_graph_checker, 
post_graph_checker)); + auto* identity_out = builder.MakeOutput(); + builder.AddNode("Identity", {slice_out}, {identity_out}); + }; + + std::unique_ptr transformer = std::make_unique(); + ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer), + TransformerLevel::Level1, + 1, pre_graph_checker, post_graph_checker)); + } } /* From 2e214d6e27cdac996d80cfa39a2de0364957f15e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 25 Jul 2023 08:35:52 +0800 Subject: [PATCH 32/34] Workaround to upgrade VS2022 for Windows ARM build (#16826) ### Description ### Motivation and Context It should be reverted when VS2022 is upgraded to 17.7 or above. ### Vefication https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=331401&view=logs&j=7517abfd-115a-5c61-78a0-7ba3c9e3a88d --- .pipelines/windowsai-steps.yml | 2 +- cmake/onnxruntime_mlas.cmake | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.pipelines/windowsai-steps.yml b/.pipelines/windowsai-steps.yml index a29e2fe6c8204..45ebf889c5da1 100644 --- a/.pipelines/windowsai-steps.yml +++ b/.pipelines/windowsai-steps.yml @@ -84,7 +84,7 @@ jobs: 7z x cmake-3.26.3-windows-x86_64.zip set PYTHONHOME=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools set PYTHONPATH=$(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools - $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\ctest.exe + $(Build.BinariesDirectory)\${{ parameters.PythonPackageName }}.3.9.7\tools\python.exe "$(Build.SourcesDirectory)\tools\ci_build\build.py" --build_dir $(Build.BinariesDirectory) --build_shared_lib --enable_onnx_tests --ms_experimental --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" --update --config RelWithDebInfo --enable_lto --use_telemetry --disable_rtti --enable_wcos $(BuildFlags) --cmake_extra_defines CMAKE_SYSTEM_VERSION=10.0.19041.0 --cmake_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\cmake.exe --ctest_path $(Build.BinariesDirectory)\cmake-3.26.3-windows-x86_64\bin\ctest.exe workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Generate cmake config' diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index db40dee554e2b..df869bca53156 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -125,6 +125,10 @@ function(setup_mlas_source_for_windows) target_sources(onnxruntime_mlas PRIVATE ${MLAS_SRC_DIR}/arm/sgemmc.cpp ) + # it should be removed after Visual Stuio is upgraded to 17.7 + if (MSVC) + add_compile_options("-d2SSAOptimizer-") + endif() elseif(onnxruntime_target_platform STREQUAL "x64") file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS From 8b30dc11d74c070e441f961bc92b58fdde7713fa Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 24 Jul 2023 23:11:53 -0700 Subject: [PATCH 33/34] Update run_CIs_for_external_pr.py to skip passed checks (#16808) ### Description Update 
run_CIs_for_external_pr.py to skip passed checks --- tools/python/run_CIs_for_external_pr.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index b472e9538e07e..beee4efc74c30 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -25,9 +25,9 @@ def parse_args(): return args -def run_gh_pr_command(command: typing.List[str]): +def run_gh_pr_command(command: typing.List[str], check=True): try: - return subprocess.run(["gh", "pr", *command], capture_output=True, text=True, check=True) + return subprocess.run(["gh", "pr", *command], capture_output=True, text=True, check=check) except subprocess.CalledProcessError as cpe: print(cpe) print(cpe.stderr) @@ -51,9 +51,23 @@ def main(): print(f"PR {pr_id} is not OPEN. Currently in state {pieces[1]}.") sys.exit(-1) + print("Check passed pipelines") + gh_out = run_gh_pr_command(["checks", pr_id, "--required"], check=False) + # output format is a tab separated list of columns: + # (pipeline name) "\t" (status) "\t" (ran time) "\t" (url) + checked_pipelines = [ + columns[0] + for columns in (line.strip().split("\t") for line in gh_out.stdout.split("\n")) + if len(columns) == 4 and columns[1] == "pass" + ] + print("Adding azp run commands") # Current pipelines. These change semi-frequently and may need updating. + # + # Note: there is no easy way to get the list for azp "required" pipelines before they starts. + # we need to maintain this list manually. + # pipelines = [ # windows "Windows ARM64 QNN CI Pipeline", @@ -80,6 +94,9 @@ def main(): "onnxruntime-binary-size-checks-ci-pipeline", ] + # remove pipelines that have already run successfully + pipelines = [p for p in pipelines if p not in checked_pipelines] + # azp run is limited to 10 pipelines at a time max_pipelines_per_comment = 10 start = 0 From f88f0d8e36abd2d29bce4b049b232609419e9f6c Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 25 Jul 2023 14:22:39 +0800 Subject: [PATCH 34/34] Upgrade 4 stages in nuget pipeline to VS2022 (#16825) ### Description ### Motivation and Context Continue upgrading to VS2022 ### Verfication https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=331377&view=results N.B. In practice, SDLNativeRules@3 doesn't support VS2019. 
--- .../c-api-noopenmp-packaging-pipelines.yml | 16 +++++----- .../azure-pipelines/templates/compliance.yml | 30 ++++++++----------- 2 files changed, 20 insertions(+), 26 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 7a175c7f80a9c..3604fb723429c 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -794,11 +794,11 @@ stages: - template: nuget/templates/dml-vs-2022.yml parameters: - AgentPool : 'onnxruntime-Win2019-GPU-dml-A10' + AgentPool : 'onnxruntime-Win2022-GPU-dml-A10' IsReleaseBuild: ${{ parameters.IsReleaseBuild }} ArtifactName: 'drop-nuget-dml' StageName: 'Windows_CI_GPU_DML_Dev' - BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" + BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" BuildArch: 'x64' msbuildArchitecture: 'amd64' EnvSetupScript: 'setup_env.bat' @@ -817,11 +817,11 @@ stages: - template: nuget/templates/dml-vs-2022.yml parameters: - AgentPool : 'onnxruntime-Win2019-GPU-dml-A10' + AgentPool : 'onnxruntime-Win2022-GPU-dml-A10' IsReleaseBuild: ${{ parameters.IsReleaseBuild }} ArtifactName: 'drop-win-dml-x86-zip' StageName: 'Windows_CI_GPU_DML_Dev_x86' - BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" + BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" BuildArch: 'x86' EnvSetupScript: 'setup_env_x86.bat' sln_platform: 'Win32' @@ -840,11 +840,11 @@ stages: - template: nuget/templates/dml-vs-2022.yml parameters: - AgentPool : 'onnxruntime-Win2019-GPU-dml-A10' + AgentPool : 'onnxruntime-Win2022-GPU-dml-A10' IsReleaseBuild: ${{ parameters.IsReleaseBuild }} ArtifactName: 'drop-win-dml-arm64-zip' StageName: 'Windows_CI_GPU_DML_Dev_arm64' - BuildCommand: --build_dir $(Build.BinariesDirectory) --arm64 --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" + BuildCommand: --build_dir $(Build.BinariesDirectory) --arm64 --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" BuildArch: 'x64' EnvSetupScript: 'setup_env.bat' sln_platform: 'arm64' @@ -863,11 +863,11 @@ stages: - template: nuget/templates/dml-vs-2022.yml parameters: - AgentPool : 'onnxruntime-Win-CPU-2019' + AgentPool : 'onnxruntime-Win-CPU-2022' IsReleaseBuild: ${{ parameters.IsReleaseBuild }} ArtifactName: 'drop-win-dml-arm-zip' StageName: 'Windows_CI_GPU_DML_Dev_arm' - BuildCommand: --build_dir $(Build.BinariesDirectory) --arm --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 16 2019" + BuildCommand: --build_dir $(Build.BinariesDirectory) --arm 
--skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" BuildArch: 'x64' EnvSetupScript: 'setup_env.bat' sln_platform: 'arm' diff --git a/tools/ci_build/github/azure-pipelines/templates/compliance.yml b/tools/ci_build/github/azure-pipelines/templates/compliance.yml index b0722cecdceb5..6f312db63413b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/compliance.yml +++ b/tools/ci_build/github/azure-pipelines/templates/compliance.yml @@ -31,36 +31,30 @@ steps: **/*.obj **/*.pdb **/*.dll -#Manually set msBuildCommandline so that we can also set CAExcludePath -- task: securedevelopmentteam.vss-secure-development-tools.build-task-prefast.SDLNativeRules@2 + +# Manually set msBuildCommandline so that we can also set CAExcludePath +- task: SDLNativeRules@3 displayName: 'Run the PREfast SDL Native Rules for MSBuild' inputs: userProvideBuildInfo: msBuildInfo msBuildArchitecture: x64 - ${{ if eq(parameters.vs2022, false)}}: - msBuildVersion: 16.0 - msBuildCommandline: '"C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln" /p:platform="${{parameters.msbuildPlatform}}" /p:configuration="RelWithDebInfo" /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - ${{ else }}: - msBuildVersion: 17.0 - msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln" /p:platform="${{parameters.msbuildPlatform}}" /p:configuration="RelWithDebInfo" /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' - continueOnError: true + msBuildVersion: 17.0 + msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\amd64\msbuild.exe" "$(Build.BinariesDirectory)\RelWithDebInfo\onnxruntime.sln" /p:platform="${{parameters.msbuildPlatform}}" /p:configuration="RelWithDebInfo" /p:CAExcludePath="$(Build.BinariesDirectory);$(Build.SourcesDirectory)\cmake;C:\program files (x86)" /p:VisualStudioVersion="17.0" /m /p:PreferredToolArchitecture=x64' + excludedPaths: '$(Build.SourcesDirectory)\b#$(Build.SourcesDirectory)\cmake#C:\program files' -- task: securedevelopmentteam.vss-secure-development-tools.build-task-report.SdtReport@1 +- task: SdtReport@2 displayName: 'Create Security Analysis Report' inputs: - BinSkim: true - BinSkimBreakOn: WarningAbove - CredScan: true SDLNativeRules: true -- task: securedevelopmentteam.vss-secure-development-tools.build-task-publishsecurityanalysislogs.PublishSecurityAnalysisLogs@2 +- task: PublishSecurityAnalysisLogs@3 displayName: 'Publish Security Analysis Logs' continueOnError: true -- task: securedevelopmentteam.vss-secure-development-tools.build-task-uploadtotsa.TSAUpload@1 +- task: TSAUpload@2 + displayName: 'TSA upload' condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - displayName: 'TSA Upload' inputs: - tsaVersion: TsaV2 - codeBaseName: 'onnxruntime_main' + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' continueOnError: true