From 559a21c7c30a82ca07a17409ea76d5793b0e942a Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 23 Mar 2023 09:36:51 -0700 Subject: [PATCH 01/20] Fixing CUDA12 build (#15135) Removing flags for CUDA architectures not supported in CUDA12 SDK Adding build flags for Hopper architecture, supported in CUDA12 SDK. --- cmake/CMakeLists.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ef44681626ae8..0bcf5fa38a6f4 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1272,10 +1272,11 @@ if (onnxruntime_USE_CUDA) if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_30,code=sm_30") # K series endif() - # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version. - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80 - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series - + if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12) + # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version. + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_37,code=sm_37") # K80 + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_50,code=sm_50") # M series + endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_52,code=sm_52") # M60 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") # P series set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70") # V series @@ -1283,6 +1284,9 @@ if (onnxruntime_USE_CUDA) if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_80,code=sm_80") # A series endif() + if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_90,code=sm_90") # H series + endif() endif() endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") From b82f94ac2e5306ddf525a540f4011ccff792b699 Mon Sep 17 00:00:00 2001 From: Faith Xu Date: Thu, 23 Mar 2023 09:39:04 -0700 Subject: [PATCH 02/20] Update labeler.yml for web (#15142) ### Description Adds a few addl tags for web --------- Co-authored-by: Nat Kershaw (MSFT) --- .github/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index aaa957ff07a6a..526d8a643e713 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -14,7 +14,7 @@ ep:tvm: '/\btvm\b/i' ep:VitisAI: '/\bvitis(?:ai)?\b/i' platform:jetson: '/\bjetson\b/i' platform:mobile: '/(\bobj(?:ective)?-?c\b|\bnnapi\b|\bcore-?ml\b|\bmobile\b|\bandroid\b|\bios\b|\bxamarin\b|\bmaui\b)/i' -platform:web: '/(\bwebgl\b|\bwasm\b)/i' +platform:web: '/(\bwebgl\b|\bweb-?gpu\b|\bwasm\b|\bonnxruntime-node\b|\bonnxruntime-web\b)/i' platform:windows: '/(\bwindows\b|\bwinrt\b|\bwinml\b)/i' model:transformer: '/(\bbert\b|\bgpt-?2\b|\bhugging-?face\b|\blong-?former\b|\bt5\b)/i' quantization: '/(is this a quantized model\?\n\nYes|\bquantization\b)/i' From 88a66a289bfc5cea3177736f11d80b3425e18b72 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 23 Mar 2023 09:45:16 -0700 Subject: [PATCH 03/20] Fix prune_graph and gpt attention fusion scripts (#15147) Fix two issues: (1) GPT attention fusion: get_parent could return None when the input is initializer, add a check (2) ONNX node could have optional inputs 
and outputs. During prune_graph, we shall exclude empty inputs/outputs. Here we exclude "" from output_name_to_node and input_name_to_nodes. Add an option allow_remove_graph_inputs in prune_graph --- .../transformers/fusion_gpt_attention.py | 8 +-- .../python/tools/transformers/onnx_model.py | 50 +++++++++++-------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py index 7fe3257950568..b8a1cbb9f2044 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py @@ -43,7 +43,7 @@ def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): # | # {present} gather = self.model.get_parent(concat_v, 0, output_name_to_node) - if gather.op_type != "Gather": + if gather is None or gather.op_type != "Gather": logger.debug("match_past_pattern_1: expect Gather for past") return None @@ -53,7 +53,7 @@ def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): past = gather.input[0] parent = self.model.get_parent(concat_k, 0, output_name_to_node) - if parent.op_type == "Gather": + if parent and parent.op_type == "Gather": gather_past_k = parent else: past_k_nodes = self.model.match_parent_path(concat_k, ["Transpose", "Gather"], [0, 0]) @@ -95,12 +95,12 @@ def match_past_pattern_2(self, concat_k, concat_v, output_name_to_node): # {present} # squeeze = self.model.get_parent(concat_v, 0, output_name_to_node) - if squeeze.op_type != "Squeeze": + if squeeze is None or squeeze.op_type != "Squeeze": logger.debug("match_past_pattern_2: expect Squeeze as parent of concat_v") return None split = self.model.get_parent(squeeze, 0, output_name_to_node) - if split.op_type != "Split": + if split is None or split.op_type != "Split": logger.debug("match_past_pattern_2: expect Split for past path") return None diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index cce2cbe5a44c7..7cfc6d355c7a3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -49,17 +49,19 @@ def input_name_to_nodes(self): input_name_to_nodes = {} for node in self.nodes(): for input_name in node.input: - if input_name not in input_name_to_nodes: - input_name_to_nodes[input_name] = [node] - else: - input_name_to_nodes[input_name].append(node) + if input_name: # could be empty when it is optional + if input_name not in input_name_to_nodes: + input_name_to_nodes[input_name] = [node] + else: + input_name_to_nodes[input_name].append(node) return input_name_to_nodes def output_name_to_node(self): output_name_to_node = {} for node in self.nodes(): for output_name in node.output: - output_name_to_node[output_name] = node + if output_name: # could be empty when it is optional + output_name_to_node[output_name] = node return output_name_to_node def nodes(self): @@ -599,7 +601,7 @@ def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs): Defaults to True. keep_io_types (Union[bool, List[str]], optional): boolean or a list of float32 input/output names. If True, model inputs/outputs should be left as float32. - Defaults to False. + Defaults to True. op_block_list (List[str], optional): List of operator types to leave as float32. Defaults to None, which will use `float16.DEFAULT_OP_BLOCK_LIST`. 
node_block_list (List[str], optional): List of node names to leave as float32. Defaults to None. @@ -751,14 +753,18 @@ def remove_unused_constant(self): if len(unused_nodes) > 0: logger.debug(f"Removed unused constant nodes: {len(unused_nodes)}") - def prune_graph(self, outputs=None): + def prune_graph(self, outputs=None, allow_remove_graph_inputs=True): """ - Prune graph to keep only required outputs. It removes unnecessary inputs and nodes. - Nodes are not linked (directly or indirectly) to any required output will be removed. + Prune graph to keep only required outputs. It removes unnecessary nodes that are not linked + (directly or indirectly) to any required output. + + There is also an option to remove graph inputs that are not used to generate any required output. Args: outputs (list): a list of graph outputs to retain. If it is None, all graph outputs will be kept. + allow_remove_graph_inputs (bool): allow remove graph inputs. """ + if len(self.graphs()) > 1: logger.debug("Skip prune_graph since graph has subgraph") return @@ -793,13 +799,14 @@ def prune_graph(self, outputs=None): self.model.graph.output.remove(output) # remove inputs not used by any node. - input_name_to_nodes = self.input_name_to_nodes() input_to_remove = [] - for input in self.model.graph.input: - if input.name not in input_name_to_nodes: - input_to_remove.append(input) - for input in input_to_remove: - self.model.graph.input.remove(input) + if allow_remove_graph_inputs: + input_name_to_nodes = self.input_name_to_nodes() + for input in self.model.graph.input: + if input.name not in input_name_to_nodes: + input_to_remove.append(input) + for input in input_to_remove: + self.model.graph.input.remove(input) if input_to_remove or output_to_remove or nodes_to_remove: removed = [] @@ -813,7 +820,7 @@ def prune_graph(self, outputs=None): self.update_graph() - def update_graph(self, verbose=False): + def update_graph(self, verbose=False, allow_remove_graph_inputs=False): graph = self.model.graph remaining_input_names = [] @@ -831,11 +838,12 @@ def update_graph(self, verbose=False): # remove graph input that is not used inputs_to_remove = [] - for input in graph.input: - if input.name not in remaining_input_names: - inputs_to_remove.append(input) - for input in inputs_to_remove: - graph.input.remove(input) + if allow_remove_graph_inputs: + for input in graph.input: + if input.name not in remaining_input_names: + inputs_to_remove.append(input) + for input in inputs_to_remove: + graph.input.remove(input) names_to_remove = [input.name for input in inputs_to_remove] logger.debug(f"remove {len(inputs_to_remove)} unused inputs: {names_to_remove}") From 910fc09de2af277dc54e01f99de3b6e4fc11c746 Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Thu, 23 Mar 2023 10:04:22 -0700 Subject: [PATCH 04/20] Using standard layernorm cuda kernel for skiplayernorm. (#15076) * Current SkipLayernorm did not using stable algo and cause correctness issue. * Enrich existing layernorm kernel to accept bias and residual. * Tune standard layernorm threads.y according to elements and device property. * Remove existing skiplayernorm cuda implementation. 
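For reference, after this change SkipLayerNorm is computed as a standard layer normalization over the sum of input, residual (skip), and bias. A minimal NumPy sketch of that equivalence, with illustrative names that are not the kernel's API:

```python
import numpy as np

def skip_layer_norm_reference(x, skip, gamma, beta=None, bias=None, eps=1e-12):
    # Pre-norm sum: input + skip (+ bias); this also corresponds to the optional
    # skip_input_bias_add_output of the fused op.
    added = x + skip
    if bias is not None:
        added = added + bias                      # bias broadcasts over the hidden axis
    # Standard layer norm over the last (hidden) axis; the CUDA kernel computes the
    # same statistics with a Welford-style reduction.
    mu = added.mean(axis=-1, keepdims=True)
    var = added.var(axis=-1, keepdims=True)
    out = (added - mu) / np.sqrt(var + eps)
    out = out * gamma + (beta if beta is not None else 0.0)
    return out, added

# Toy shapes: (batch, sequence, hidden)
x = np.random.randn(2, 4, 8).astype(np.float32)
skip = np.random.randn(2, 4, 8).astype(np.float32)
bias = np.random.randn(8).astype(np.float32)
gamma = np.ones(8, dtype=np.float32)
beta = np.zeros(8, dtype=np.float32)
y, pre_norm_sum = skip_layer_norm_reference(x, skip, gamma, beta, bias)
```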
--- .../contrib_ops/cuda/bert/skip_layer_norm.cc | 40 +-- .../cuda/bert/skip_layer_norm_impl.cu | 241 ------------------ .../cuda/bert/skip_layer_norm_impl.h | 28 -- .../core/providers/cuda/nn/layer_norm.cc | 1 + .../core/providers/cuda/nn/layer_norm_impl.cu | 116 +++++++-- .../core/providers/cuda/nn/layer_norm_impl.h | 5 +- .../test/contrib_ops/layer_norm_test.cc | 10 + .../test/contrib_ops/skiplayernorm_op_test.cc | 44 ++-- 8 files changed, 153 insertions(+), 332 deletions(-) delete mode 100644 onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu delete mode 100644 onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc index 53da91420e0c4..d2f7d974be85e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc @@ -2,8 +2,8 @@ // Licensed under the MIT License. #include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/nn/layer_norm_impl.h" #include "skip_layer_norm.h" -#include "skip_layer_norm_impl.h" namespace onnxruntime { namespace contrib { @@ -106,25 +106,29 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const } } - int sequence_length = static_cast(input_dims[1]); - int hidden_size = static_cast(input_dims[2]); - int64_t element_count = input_dims[0] * sequence_length * hidden_size; - size_t element_size = sizeof(T); - typedef typename ToCudaType::MappedType CudaT; + int sequence_length = gsl::narrow_cast(input_dims[1]); + int hidden_size = gsl::narrow_cast(input_dims[2]); + int row_count = gsl::narrow_cast(input_dims[0] * sequence_length); - return LaunchSkipLayerNormKernel( + typedef typename ToCudaType::MappedType CudaT; + HostApplyLayerNorm( + GetDeviceProp(), Stream(ctx), - reinterpret_cast(output->MutableData()), - skip_input_bias_add_output != nullptr ? reinterpret_cast(skip_input_bias_add_output->MutableData()) : nullptr, - reinterpret_cast(input->Data()), - reinterpret_cast(skip->Data()), - reinterpret_cast(gamma->Data()), - (beta != nullptr) ? reinterpret_cast(beta->Data()) : nullptr, - (bias != nullptr) ? reinterpret_cast(bias->Data()) : nullptr, - epsilon_, - hidden_size, - static_cast(element_count), - element_size); + reinterpret_cast(output->MutableData()), // Y_data + nullptr, // mean_data + nullptr, // inv_var_data + reinterpret_cast(input->Data()), // X_data + row_count, // n1 + hidden_size, // n2 + (double)epsilon_, // epsilon + reinterpret_cast(gamma->Data()), // gamma + (beta != nullptr) ? reinterpret_cast(beta->Data()) : nullptr, // beta + reinterpret_cast(skip->Data()), // skip or residual to add + (bias != nullptr) ? reinterpret_cast(bias->Data()) : nullptr, // bias to add + skip_input_bias_add_output != nullptr ? 
reinterpret_cast(skip_input_bias_add_output->MutableData()) : nullptr); + + CUDA_RETURN_IF_ERROR(cudaGetLastError()); + return Status::OK(); } } // namespace cuda diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu deleted file mode 100644 index 0a815f7f4923a..0000000000000 --- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.cu +++ /dev/null @@ -1,241 +0,0 @@ -/* - The implementation of this file is based on skipLayerNorm plugin in TensorRT demo: - https://github.com/NVIDIA/TensorRT/tree/release/5.1/demo/BERT/ - -Copyright 2019 NVIDIA Corporation - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -// Modifications: Add SkipLayerNormKernelVec to -// leverage vectorized load/write. -// and templatize ComputeSkipLayerNorm for different -// data types. -// Copyright (c) Advanced Micro Devices, Inc. All rights reserved. -// Licensed under the MIT License. - -#include "contrib_ops/cuda/bert/layer_norm.cuh" -#include "contrib_ops/cuda/bert/skip_layer_norm_impl.h" -#include - -namespace onnxruntime { -namespace contrib { -namespace cuda { - -namespace { -template -T maybe2half(float x); - -template <> -float maybe2half(float x) { - return x; -} - -template <> -half maybe2half(float x) { - return __float2half_rn(x); -} - -// Using only power of 2 numbers will lead to waste of compute for same size such as 768, which is a very common case -// in BERT. Ideally we can step by wrap_size * num_unroll, but listing too many steps will cause long compile time. 
-constexpr int kSizes[] = {32, 64, 128, 384, 768, 1024, 2048}; -constexpr int kMinBlockSize = 32; -constexpr int kMaxBlockSize = 256; - -int NextSize(int x) { - size_t len = sizeof(kSizes) / sizeof(kSizes[0]); - for (size_t i = 0; i < len; ++i) { - if (x <= kSizes[i]) { - return kSizes[i]; - } - } - return kSizes[len - 1]; -} - -template -bool CanVectorized(T* output, T* skip_input_bias_add_output, const T* input, const T* skip, const T* gamma, - const T* beta, const T* bias, const int ld, const int next_size) { - constexpr int alignment = std::alignment_of>::value; - return ld % NumUnroll == 0 && reinterpret_cast(output) % alignment == 0 && - reinterpret_cast(skip_input_bias_add_output) % alignment == 0 && - reinterpret_cast(input) % alignment == 0 && reinterpret_cast(skip) % alignment == 0 && - reinterpret_cast(gamma) % alignment == 0 && reinterpret_cast(beta) % alignment == 0 && - reinterpret_cast(bias) % alignment == 0 && next_size / NumUnroll >= kMinBlockSize && - next_size / NumUnroll <= kMaxBlockSize; -} -} // namespace - -template -__global__ void SkipLayerNormKernel( - const int ld, const T* input, const T* skip, - const T* beta, const T* gamma, const T* bias, - const T epsilon, T* output, T* skip_input_bias_add_output) { - const T reverse_ld = T(1.f / ld); - const int offset = blockIdx.x * ld; - - KeyValuePairSum pair_sum; - // reduce x and x^2 - cub::KeyValuePair thread_data(0, 0); - - for (int i = threadIdx.x; i < ld; i += TPB) { - const int idx = offset + i; - - const T val = (bias == nullptr) ? input[idx] + skip[idx] : input[idx] + skip[idx] + bias[i]; - const T rldval = reverse_ld * val; - thread_data = pair_sum(thread_data, cub::KeyValuePair(rldval, rldval * val)); - - if (skip_input_bias_add_output != nullptr) { - skip_input_bias_add_output[idx] = val; - } - - output[idx] = val; - } - if (Simplified) { - SimplifiedLayerNorm(thread_data.value, ld, offset, gamma, epsilon, output); - return; - } - LayerNorm(thread_data, ld, offset, beta, gamma, epsilon, output); -} - -// Vectorized kernel -template -__global__ void SkipLayerNormKernelSmall( - const int ld, const T* input, const T* skip, const T* beta, const T* gamma, - const T* bias, const T epsilon, T* output, T* skip_input_bias_add_output, - bool hasBias, bool hasSkipInputBiasAdditionOutput) { - const T rld = T(1.f / ld); - const int idx = blockIdx.x * ld + threadIdx.x * ILP; // grid_size = n / ld - - using VecT = aligned_vector; - - T input_v[ILP], skip_v[ILP], bias_v[ILP], skip_input_bias_add_output_v[ILP]; - - VecT* input_val = reinterpret_cast(&input_v); - *input_val = *reinterpret_cast(&input[idx]); - - VecT* skip_val = reinterpret_cast(&skip_v); - *skip_val = *reinterpret_cast(&skip[idx]); - - if (hasBias) { - VecT* bias_val = reinterpret_cast(&bias_v); - *bias_val = *reinterpret_cast(&bias[threadIdx.x * ILP]); - } - - cub::KeyValuePair thread_data(T(0.f), T(0.f)); - - if (ILP * threadIdx.x < ld) { - T rldval_sum = T(0.f); - T rldvalsq_sum = T(0.f); -#pragma unroll - for (int i = 0; i < ILP; i++) { - input_v[i] += hasBias ? 
skip_v[i] + bias_v[i] : skip_v[i]; - - if (hasSkipInputBiasAdditionOutput) { - skip_input_bias_add_output_v[i] = input_v[i]; - } - - const T rldval = rld * input_v[i]; - rldval_sum += rldval; - rldvalsq_sum += rldval * input_v[i]; - } - - if (hasSkipInputBiasAdditionOutput) { - *(reinterpret_cast(&skip_input_bias_add_output[idx])) = *reinterpret_cast(&skip_input_bias_add_output_v); - } - - thread_data = cub::KeyValuePair(rldval_sum, rldvalsq_sum); - } - - if (Simplified) { - SimplifiedLayerNormSmall(input_v, thread_data.value, ld, idx, gamma, epsilon, output); - return; - } - LayerNormSmall(input_v, thread_data, ld, idx, beta, gamma, epsilon, output); -} - -template -Status LaunchSkipLayerNormKernel( - cudaStream_t stream, T* output, T* skip_input_bias_add_output, const T* input, const T* skip, const T* gamma, - const T* beta, const T* bias, float epsilon, const int ld, const int element_count, - size_t element_size) { - // this must be true because n is the total size of the tensor - assert(element_count % ld == 0); - bool hasBias = (bias == nullptr) ? false : true; - bool hasSkipInputBiasAdditionOutput = (skip_input_bias_add_output == nullptr) ? false : true; - - const int next_size = NextSize(ld); - const int grid_size = element_count / ld; - bool flag_vec2 = - CanVectorized(output, skip_input_bias_add_output, input, skip, gamma, beta, bias, ld, next_size); - bool flag_vec4 = - CanVectorized(output, skip_input_bias_add_output, input, skip, gamma, beta, bias, ld, next_size); - - switch (next_size) { -#define LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL(num_unroll) \ - SkipLayerNormKernelSmall \ - <<>>(ld, input, skip, beta, gamma, bias, maybe2half(epsilon), output, \ - skip_input_bias_add_output, hasBias, hasSkipInputBiasAdditionOutput) -#define LAUNCH_SKIP_LAYER_NORM_KERNEL() \ - SkipLayerNormKernel<<>>( \ - ld, input, skip, beta, gamma, bias, maybe2half(epsilon), output, skip_input_bias_add_output) -#define CASE_NEXT_SIZE(next_size_value) \ - case next_size_value: { \ - if (flag_vec4) { \ - constexpr int block_size = next_size_value / 4; \ - LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL(4); \ - } else if (flag_vec2) { \ - constexpr int block_size = next_size_value / 2; \ - LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL(2); \ - } else { \ - if (next_size_value <= kMaxBlockSize) { \ - constexpr int block_size = next_size_value; \ - LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL(1); \ - } else { \ - LAUNCH_SKIP_LAYER_NORM_KERNEL(); \ - } \ - } \ - } break - CASE_NEXT_SIZE(kSizes[0]); - CASE_NEXT_SIZE(kSizes[1]); - CASE_NEXT_SIZE(kSizes[2]); - CASE_NEXT_SIZE(kSizes[3]); - CASE_NEXT_SIZE(kSizes[4]); - CASE_NEXT_SIZE(kSizes[5]); - CASE_NEXT_SIZE(kSizes[6]); -#undef CASE_NEXT_SIZE -#undef LAUNCH_SKIP_LAYER_NORM_KERNEL -#undef LAUNCH_SKIP_LAYER_NORM_KERNEL_SMALL - } - - return CUDA_CALL(cudaGetLastError()); -} - -#define SKIPLAYERNORM_IMPL(T, Simplified) \ - template Status LaunchSkipLayerNormKernel(cudaStream_t stream, T * output, \ - T * skip_input_bias_add_output, \ - const T* input, const T* skip, const T* gamma, \ - const T* beta, const T* bias, float epsilon, \ - const int ld, const int element_count, \ - size_t element_size); - -SKIPLAYERNORM_IMPL(float, true); -SKIPLAYERNORM_IMPL(float, false); -SKIPLAYERNORM_IMPL(half, true); -SKIPLAYERNORM_IMPL(half, false); - -} // namespace cuda -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h deleted file mode 100644 index da2894928c062..0000000000000 --- 
a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm_impl.h +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/common.h" - -namespace onnxruntime { -namespace contrib { -namespace cuda { - -template -Status LaunchSkipLayerNormKernel( - cudaStream_t stream, - T* output, // normalized output tensor - T* skip_input_bias_add_output, // sum of the input and skip (and bias if it exists) tensors output - const T* input, // input tensor - const T* skip, // skip tensor - const T* gamma, // Layer normalization gamma tensor - const T* beta, // Layer normalization beta tensor - const T* bias, // Layer normalization beta tensor - float epsilon, // Layer normalization epsilon - int hidden_size, // hidden size, it is the leading dimension (ld) - int element_count, // number of elements in input tensor - size_t element_size); - -} // namespace cuda -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm.cc b/onnxruntime/core/providers/cuda/nn/layer_norm.cc index 24a6b196c12f2..7dd10f9c2960c 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm.cc +++ b/onnxruntime/core/providers/cuda/nn/layer_norm.cc @@ -95,6 +95,7 @@ Status LayerNorm::ComputeInternal(OpKernelContext* ctx) con HostApplyLayerNorm(GetDeviceProp(), Stream(ctx), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data); + CUDA_RETURN_IF_ERROR(cudaGetLastError()); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu index 717b994001f18..7f9d41298daf9 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu +++ b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu @@ -1,18 +1,18 @@ /** -* Copyright (c) 2016-present, Facebook, Inc. -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ // // Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. 
@@ -85,7 +85,9 @@ __device__ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) { + U* buf, + const T* __restrict__ skip, + const T* __restrict__ bias) { // Assumptions: // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensor is contiguous @@ -102,15 +104,34 @@ __device__ void cuWelfordMuSigma2( const int numx = blockDim.x * blockDim.y; const int thrx = threadIdx.x + threadIdx.y * blockDim.x; const T* lvals = vals + i1 * n2; + const T* skip_vals = (skip != NULL) ? skip + i1 * n2 : NULL; int l = 4 * thrx; for (; l + 3 < n2; l += 4 * numx) { for (int k = 0; k < 4; ++k) { U curr = static_cast(lvals[l + k]); + + if (bias != NULL) { + curr += static_cast(bias[l + k]); + } + + if (skip_vals != NULL) { + curr += static_cast(skip_vals[l + k]); + } + cuWelfordOnlineSum(curr, mu, sigma2, count); } } for (; l < n2; ++l) { U curr = static_cast(lvals[l]); + + if (bias != NULL) { + curr += static_cast(bias[l]); + } + + if (skip_vals != NULL) { + curr += static_cast(skip_vals[l]); + } + cuWelfordOnlineSum(curr, mu, sigma2, count); } // intra-warp reductions @@ -314,7 +335,10 @@ __global__ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* __restrict__ beta) { + const V* __restrict__ beta, + const T* __restrict__ skip, + const T* __restrict__ bias, + T* __restrict__ skip_input_bias_add_output) { // Assumptions: // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensors are contiguous @@ -323,20 +347,35 @@ __global__ void cuApplyLayerNorm( SharedMemory shared; U* buf = shared.getPointer(); U mu, sigma2; - cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf); + cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, skip, bias); const T* lvals = vals + i1 * n2; + const T* skip_vals = (skip != NULL) ? skip + i1 * n2 : NULL; V* ovals = output_vals + i1 * n2; + T* skip_input_bias_add_ovals = (skip_input_bias_add_output != NULL) ? skip_input_bias_add_output + i1 * n2 : NULL; U c_inv_std_dev = rsqrt(sigma2 + epsilon); const int numx = blockDim.x * blockDim.y; const int thrx = threadIdx.x + threadIdx.y * blockDim.x; for (int i = thrx; i < n2; i += numx) { U curr = static_cast(lvals[i]); - V gamma_i = (gamma != NULL) ? gamma[i] : (V)1; - V beta_i = (beta != NULL) ? beta[i] : (V)0; + + if (bias != NULL) { + curr += static_cast(bias[i]); + } + + if (skip_vals != NULL) { + curr += static_cast(skip_vals[i]); + } + + U gamma_i = (gamma != NULL) ? (U)gamma[i] : (U)1; + U beta_i = (beta != NULL) ? 
(U)beta[i] : (U)0; if (simplified) { - ovals[i] = gamma_i * static_cast(c_inv_std_dev * curr); + ovals[i] = static_cast(gamma_i * c_inv_std_dev * curr); } else { - ovals[i] = gamma_i * static_cast(c_inv_std_dev * (curr - mu)) + beta_i; + ovals[i] = static_cast(gamma_i * c_inv_std_dev * (curr - mu) + beta_i); + } + + if (skip_input_bias_add_ovals != NULL) { + skip_input_bias_add_ovals[i] = static_cast(curr); } } if (threadIdx.x == 0 && threadIdx.y == 0) { @@ -346,6 +385,17 @@ __global__ void cuApplyLayerNorm( } } +int32_t round_up_power_of_2(int32_t v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + template void HostApplyLayerNorm( const cudaDeviceProp& prop, @@ -358,12 +408,20 @@ void HostApplyLayerNorm( int n2, double epsilon, const V* gamma, - const V* beta) { + const V* beta, + const T* skip, + const T* bias, + T* skip_input_bias_add_output) { const int maxGridY = prop.maxGridSize[1]; const int warp_size = prop.warpSize; ORT_ENFORCE(warp_size == GPU_WARP_SIZE_HOST); - dim3 threads(warp_size, 4, 1); + // Be careful for the logic on treads_y calc: + // * 4 is current implementation related, yet it does not using vectorized load + // * Do not using maxTreads as it will cause resource issue. + int threads_y = std::min(round_up_power_of_2((n2 + 4 * warp_size - 1) / (4 * warp_size)), + prop.maxThreadsPerBlock / (warp_size * 2)); + dim3 threads(warp_size, threads_y, 1); #ifdef __HIP_PLATFORM_HCC__ // Optimization for ROCm MI100 threads.y = 1; @@ -378,13 +436,15 @@ void HostApplyLayerNorm( input, n1, n2, U(epsilon), - gamma, beta); + gamma, beta, + skip, bias, skip_input_bias_add_output); } -#define LAYERNORM_LINEAR_IMPL(T, U, V, simplified) \ - template void HostApplyLayerNorm(const cudaDeviceProp& prop, cudaStream_t stream, V* output, \ - U* mean, U* inv_std_dev, const T* input, int n1, int n2, \ - double epsilon, const V* gamma, const V* beta); +#define LAYERNORM_LINEAR_IMPL(T, U, V, simplified) \ + template void HostApplyLayerNorm(const cudaDeviceProp& prop, cudaStream_t stream, V* output, \ + U* mean, U* inv_std_dev, const T* input, int n1, int n2, \ + double epsilon, const V* gamma, const V* beta, const T* skip, \ + const T* bias, T* skip_input_bias_add_output); LAYERNORM_LINEAR_IMPL(float, float, float, true) LAYERNORM_LINEAR_IMPL(half, float, half, true) diff --git a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h index 8c904072c2c53..e3952eefae35d 100644 --- a/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cuda/nn/layer_norm_impl.h @@ -40,7 +40,10 @@ void HostApplyLayerNorm( int n2, double epsilon, const V* gamma, - const V* beta); + const V* beta, + const T* skip = nullptr, + const T* bias = nullptr, + T* skip_input_bias_add_output = nullptr); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index 5b6ae6b9c6034..c6c82565bec45 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -111,6 +111,16 @@ TEST(CudaKernelTest, LayerNorm_LargeSizeTensor) { TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default); } +TEST(CudaKernelTest, LayerNorm_4KTensor) { + std::vector X_dims{3, 10, 4096}; + TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default); +} + +TEST(CudaKernelTest, LayerNorm_8KTensor) { + std::vector X_dims{3, 10, 8192}; + TestLayerNorm(X_dims, 
LAYER_NORM_OP, k_epsilon_default); +} + TEST(CudaKernelTest, LayerNorm_MidSizeTensor_NoBias) { std::vector X_dims{8, 80, 768}; constexpr int64_t axis = -1; diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc index c501bd72a9dd9..638b7565a3ef0 100644 --- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc @@ -280,23 +280,35 @@ TEST(SkipLayerNormTest, SkipLayerNormBatch1_Float16_vec) { 0.9f, -0.5f, 0.8f, 0.f, 0.3f, 0.3f, 0.3f, -0.6f, // 7 0.8f, -0.5f, 0.0f, 1.f, 0.5f, 0.2f, 0.3f, 0.1f}; // 8 + // Update test data result which use internal fp32 calculation for fp16 input/parameters. + // Following pytorch code snippet are used to generate the result: (Not torch uses fp32 internal calculation for this) + // + // gamma_tensor = torch.tensor(gamma_data, dtype=torch.float32).reshape(hidden_size).to('cuda:0').to(torch.float16) + // beta_tensor = torch.tensor(beta_data, dtype=torch.float32).reshape(hidden_size).to('cuda:0').to(torch.float16) + // input_tensor = torch.tensor(input_data, dtype=torch.float32).reshape( + // batch_size, sequence_length, hidden_size).to('cuda:0').to(torch.float16) + // skip_tensor = torch.tensor(skip_data, dtype=torch.float32).reshape( + // batch_size, sequence_length, hidden_size).to('cuda:0').to(torch.float16) + // added_input = torch.add(input_tensor, skip_tensor) + // out32 = torch.layer_norm(added_input, [hidden_size], gamma_tensor, beta_tensor, eps=epsilon_).to(torch.float32) + // std::vector output_data = { - 1.2490234f, -0.044372559f, 0.f, 1.7890625f, 0.61132812f, 0.1763916f, 0.28100586f, 0.014961243f, - 0.20117188f, 2.6894531f, 5.8476562f, 0.78955078f, 0.54443359f, 0.1763916f, 4.8984375f, 0.1763916f, - 0.64941406f, -0.044372559f, 0.f, 1.7890625f, -0.044372559f, 0.1763916f, 0.29882812f, 0.80175781f, - 1.2490234f, -0.044372559f, 0.f, 2.9238281f, 0.61132812f, 0.17687988f, 0.29882812f, -0.077514648f, - 0.21240234f, 11.59375f, 6.5273438f, -0.44458008f, 0.61132812f, 0.058441162f, 0.1763916f, -0.80664062f, - 1.2490234f, -1.0439453f, 0.f, 3.7890625f, 0.61132812f, 0.63134766f, 0.78515625f, -0.39331055f, - 1.5078125f, -0.044372559f, 0.3503418f, 3.8457031f, 0.29882812f, 0.29882812f, 0.29882812f, -1.2148438f, - 0.85595703f, 0.40893555f, 0.f, 1.7890625f, 0.61132812f, 0.1763916f, 0.29882812f, -0.0025119781f, - 1.0097656f, -0.12133789f, 0.f, 1.4189453f, 0.51367188f, 0.1583252f, -0.25830078f, -0.098876953f, - -1.0097656f, 4.8945312f, 1.2695312f, 4.3398438f, 0.50537109f, 0.1583252f, 4.6835938f, 0.12695312f, - 0.40966797f, 0.46655273f, 0.f, 2.203125f, -0.23901367f, 0.1583252f, 0.26123047f, 0.38110352f, - 1.0097656f, -0.23901367f, 0.f, 1.0273438f, 0.23901367f, 0.17907715f, 0.26123047f, -0.33178711f, - 0.15234375f, -0.84863281f, 5.9804688f, -0.83789062f, 0.74853516f, -0.050079346f, 0.25244141f, -1.1015625f, - 1.0097656f, -1.1210938f, 0.f, 3.4179688f, 0.51367188f, 0.67431641f, 0.37133789f, -0.098876953f, - 1.1357422f, -0.12133789f, 0.77832031f, 0.053985596f, 0.30810547f, 0.47265625f, 0.096557617f, -0.53662109f, - 0.82617188f, -0.12133789f, 0.f, 1.4189453f, 0.51367188f, 0.1583252f, 0.26123047f, 0.071289062f}; + 1.25000000f, -0.04403687f, 0.00000000f, 1.79003906f, 0.61132812f, 0.17639160f, 0.28125000f, 0.01530457f, + 0.20166016f, 2.69140625f, 5.84765625f, 0.78955078f, 0.54443359f, 0.17639160f, 4.89843750f, 0.17639160f, + 0.64990234f, -0.04403687f, 0.00000000f, 1.79003906f, -0.04403687f, 0.17639160f, 0.29882812f, 0.80175781f, + 1.25000000f, 
-0.04403687f, 0.00000000f, 2.92382812f, 0.61132812f, 0.17687988f, 0.29882812f, -0.07745361f, + 0.21240234f, 11.60156250f, 6.52734375f, -0.44482422f, 0.61132812f, 0.05844116f, 0.17639160f, -0.80712891f, + 1.25000000f, -1.04394531f, 0.00000000f, 3.78906250f, 0.61132812f, 0.63134766f, 0.78564453f, -0.39331055f, + 1.50878906f, -0.04403687f, 0.34985352f, 3.84765625f, 0.29882812f, 0.29882812f, 0.29882812f, -1.21582031f, + 0.85595703f, 0.40966797f, 0.00000000f, 1.79003906f, 0.61132812f, 0.17639160f, 0.29882812f, -0.00255013f, + 1.00976562f, -0.12152100f, 0.00000000f, 1.41894531f, 0.51367188f, 0.15832520f, -0.25805664f, -0.09875488f, + -1.00976562f, 4.89453125f, 1.26953125f, 4.33984375f, 0.50537109f, 0.15832520f, 4.68359375f, 0.12695312f, + 0.40942383f, 0.46655273f, 0.00000000f, 2.20312500f, -0.23913574f, 0.15832520f, 0.26123047f, 0.38110352f, + 1.00976562f, -0.23913574f, 0.00000000f, 1.02734375f, 0.23913574f, 0.17907715f, 0.26123047f, -0.33178711f, + 0.15234375f, -0.85058594f, 5.98046875f, -0.83789062f, 0.74853516f, -0.04998779f, 0.25244141f, -1.10156250f, + 1.00976562f, -1.12109375f, 0.00000000f, 3.41992188f, 0.51367188f, 0.67431641f, 0.37133789f, -0.09875488f, + 1.13574219f, -0.12152100f, 0.77832031f, 0.05398560f, 0.30810547f, 0.47265625f, 0.09643555f, -0.53662109f, + 0.82617188f, -0.12152100f, 0.00000000f, 1.41894531f, 0.51367188f, 0.15832520f, 0.26123047f, 0.07135010f}; RunTest(input_data, skip_data, From 7033346605e0e6bf619ac6049356d478f97e1237 Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Thu, 23 Mar 2023 11:00:09 -0700 Subject: [PATCH 05/20] Support mask_filter_value attribute in DecoderMaskedMultiheadAttention (#15158) --- docs/ContribOperators.md | 2 ++ onnxruntime/core/graph/contrib_ops/bert_defs.cc | 6 +++++- onnxruntime/python/tools/transformers/convert_generation.py | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index a62d757aa9062..656f0e86d2a29 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1126,6 +1126,8 @@ This version of the operator has been available since version 1 of the 'com.micr #### Attributes
+<dt><tt>mask_filter_value</tt> : float</dt>
+<dd>The value to be filled in the attention mask. Default value is -10000.0f</dd>
 <dt><tt>num_heads</tt> : int (required)</dt>
 <dd>Number of attention heads</dd>
 <dt><tt>past_present_share_buffer</tt> : int</dt>
diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index ae9e0c1324ca6..b205b64954559 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -473,6 +473,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "Custom scale will be used if specified. Default value is 1/sqrt(head_size)", AttributeProto::FLOAT, OPTIONAL_VALUE) + .Attr("mask_filter_value", + "The value to be filled in the attention mask. Default value is -10000.0f", + AttributeProto::FLOAT, + OPTIONAL_VALUE) .Input(0, "input", "Input tensor with shape (batch_size, 1, input_hidden_size)", @@ -571,7 +575,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( .Input(1, "key", "Key with shape (batch_size, kv_sequence_length, hidden_size), or packed KV with shape (batch_size, kv_sequence_length, num_heads, 2, head_size), " - "or past_key with shape (batch_size, num_heads, kv_sequence_length, head_size)", + "or past_key with shape (batch_size, num_heads, kv_sequence_length, head_size)", "T", OpSchema::Optional) .Input(2, diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index a7ea3b4e0568b..22690dc18efdc 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -1106,6 +1106,7 @@ def update_decoder_subgraph_use_decoder_masked_multihead_attention( "past_present_share_buffer", "num_heads", "scale", + "mask_filter_value", "domain", ] From 2ee822d4831d2043e3d588d601264ed96cd72bf3 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 23 Mar 2023 11:05:17 -0700 Subject: [PATCH 06/20] Extend memory efficient attention coverage in Attention/MHA cuda op (#15064) ### Description 1. upgrade cutlass to 3.0 that containing attn_bias support. 2. extend Attention/MHA to use memory efficient attention when rel_pos_bias with [1, num_head, s, s*] and 1d mask with [2 * batch_size + 1] are present. 
new mask format introduction: MASK_1D_KEY_SEQ_LEN_START, [3 * batch_size + 2] with [key_len[0], ..., key_len[batch_size - 1], query_start[0], ..., query_start[batch_size - 1], query_end[batch_size - 1], key_start[0], ..., key_start[batch_size - 1], key_end[batch_size - 1]] e.g 2D mask with [[1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 0]] converts to this 1D mask is [3, 5, 0, 6, 12, 0, 6, 12] ### Motivation and Context It potentially benefits tnlrv6 and t5(encoder) --------- Co-authored-by: Ubuntu Co-authored-by: Kunal Vaishnavi Co-authored-by: Kunal Vaishnavi --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 2 +- cmake/external/cutlass.cmake | 1 + cmake/patches/cutlass/cutlass.patch | 92 ++ docs/ContribOperators.md | 4 +- .../contrib_ops/cpu/bert/attention_base.cc | 10 +- .../contrib_ops/cpu/bert/attention_common.h | 17 +- .../cpu/bert/multihead_attention_helper.h | 10 +- .../contrib_ops/cuda/bert/attention.cc | 6 +- .../contrib_ops/cuda/bert/attention_impl.cu | 27 +- .../bert/cutlass_fmha/fmha_launch_template.h | 22 +- .../cuda/bert/cutlass_fmha/kernel_forward.h | 947 ------------------ .../cutlass_fmha/memory_efficient_attention.h | 20 +- .../cuda/bert/multihead_attention.cc | 7 +- .../core/graph/contrib_ops/bert_defs.cc | 4 +- .../contrib_ops/attention_op_test_helper.cc | 189 ++++ .../contrib_ops/attention_op_test_helper.h | 2 + .../multihead_attention_op_test.cc | 15 +- .../templates/download-deps.yml | 4 +- 19 files changed, 382 insertions(+), 999 deletions(-) create mode 100644 cmake/patches/cutlass/cutlass.patch mode change 100644 => 100755 docs/ContribOperators.md delete mode 100644 onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 953a1948d201e..ce8b9bf4b156e 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -392,7 +392,7 @@ "component": { "type": "git", "git": { - "commitHash": "66d9cddc832c1cdc2b30a8755274f7f74640cfe6", + "commitHash": "c4f6b8c6bc94ff69048492fb34df0dfaf1983933", "repositoryUrl": "https://github.com/NVIDIA/cutlass.git" }, "comments": "cutlass" diff --git a/cmake/deps.txt b/cmake/deps.txt index 9d91eb97d29ae..a97ce38c7e12f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -34,7 +34,7 @@ pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/5916273f79a21551890fd re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374 safeint;https://github.com/dcleblanc/SafeInt/archive/ff15c6ada150a5018c5ef2172401cb4529eac9c0.zip;913a4046e5274d329af2806cb53194f617d8c0ab tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 -cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v2.11.0.zip;be70c559f07251ba7f33c789dba98872b444c10f +cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.0.0.zip;0f95b3c1fc1bd1175c4a90b2c9e39074d1bccefd # below are deps introduced by triton client, might remove after 1.14 release openssl;https://github.com/openssl/openssl/archive/refs/tags/openssl-3.0.7.zip;dda8fc81308555410505eb4a9eab3e1da0436a1d rapidjson;https://github.com/Tencent/rapidjson/archive/refs/tags/v1.1.0.zip;0fe7b4f7b83df4b3d517f4a202f3a383af7a0818 diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake index dc02168b861b6..18ac668bb1592 100644 --- 
a/cmake/external/cutlass.cmake +++ b/cmake/external/cutlass.cmake @@ -4,6 +4,7 @@ if (onnxruntime_USE_FLASH_ATTENTION) cutlass URL ${DEP_URL_cutlass} URL_HASH SHA1=${DEP_SHA1_cutlass} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass.patch ) FetchContent_GetProperties(cutlass) diff --git a/cmake/patches/cutlass/cutlass.patch b/cmake/patches/cutlass/cutlass.patch new file mode 100644 index 0000000000000..bda1de8b46916 --- /dev/null +++ b/cmake/patches/cutlass/cutlass.patch @@ -0,0 +1,92 @@ +diff --git a/include/cute/numeric/complex.hpp b/include/cute/numeric/complex.hpp +index 3790ebd3..cf727d09 100644 +--- a/include/cute/numeric/complex.hpp ++++ b/include/cute/numeric/complex.hpp +@@ -41,10 +41,14 @@ + // With CUDA 11.4, builds show spurious "-Wconversion" warnings + // on line 656 of thrust/detail/type_traits.h. + // These pragmas suppress the warnings. ++#ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wconversion" ++#endif + #include ++#ifdef __GNUC__ + #pragma GCC diagnostic pop ++#endif + + #include + +diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h +index 59aec46a..8f2a913a 100644 +--- a/include/cutlass/functional.h ++++ b/include/cutlass/functional.h +@@ -89,7 +89,7 @@ struct multiplies { + } + }; + +-#if defined(__CUDA_ARCH__) ++#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + /// Partial specializations needed when __CUDA_NO_HALF2_OPERATORS__ is set + template<> + struct plus<__half2> { +@@ -143,12 +143,12 @@ struct multiplies<__half> { + + + // Maximum with nan propogation +-// To propgate the NANs, the "max" of a two element that contains NaNs should also return a NaN ++// To propgate the NANs, the "max" of a two element that contains NaNs should also return a NaN + template + struct maximum_with_nan_propogation { + CUTLASS_HOST_DEVICE + T operator()(T const &lhs, T const &rhs) const { +- return lhs > rhs or std::isnan(lhs) ? lhs : rhs; ++ return lhs > rhs or isnan(lhs) ? lhs : rhs; + } + }; + +@@ -160,7 +160,7 @@ struct maximum_with_nan_propogation { + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + asm volatile("max.NaN.f32 %0, %1, %2;\n" : "=f"(res) : "f"(lhs), "f"(rhs)); + #else +- res = lhs > rhs or std::isnan(lhs) ? lhs : rhs; ++ res = lhs > rhs or isnan(lhs) ? lhs : rhs; + #endif + return res; + } +@@ -233,7 +233,7 @@ struct negate { + } + }; + +-/// Greater equal ++/// Greater equal + template + struct greater_equal { + CUTLASS_HOST_DEVICE +@@ -242,7 +242,7 @@ struct greater_equal { + } + }; + +-/// Greater ++/// Greater + template + struct greater { + CUTLASS_HOST_DEVICE +@@ -251,7 +251,7 @@ struct greater { + } + }; + +-/// Less equal ++/// Less equal + template + struct less_equal { + CUTLASS_HOST_DEVICE +@@ -260,7 +260,7 @@ struct less_equal { + } + }; + +-/// Less ++/// Less + template + struct less { + CUTLASS_HOST_DEVICE diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md old mode 100644 new mode 100755 index 656f0e86d2a29..7bf1e3d0f646c --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -155,7 +155,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>bias</tt> (optional) : T</dt>
 <dd>Bias tensor with shape (hidden_size + hidden_size + v_hidden_size) for input projection</dd>
 <dt><tt>mask_index</tt> (optional) : M</dt>
-<dd>Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length), or index with shape (batch_size) or (2 * batch_size)</dd>
+<dd>Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), (batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length), or index with shape (batch_size) or (2 * batch_size) or (3 * batch_size + 2)</dd>
 <dt><tt>past</tt> (optional) : T</dt>
 <dd>past state for key and value with shape (2, batch_size, num_heads, past_sequence_length, head_size)When past_present_share_buffer is set, its shape is (2, batch_size, num_heads, max_sequence_length, head_size)</dd>
 <dt><tt>relative_position_bias</tt> (optional) : T</dt>
@@ -2404,7 +2404,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dt><tt>bias</tt> (optional) : T</dt>
 <dd>Bias tensor with shape (hidden_size + hidden_size + v_hidden_size) from input projection</dd>
 <dt><tt>key_padding_mask</tt> (optional) : M</dt>
-<dd>Key padding mask with shape (batch_size) or (batch_size, kv_sequence_length)</dd>
+<dd>Key padding mask with shape (batch_size) or (3 * batch_size + 2) or (batch_size, kv_sequence_length)</dd>
 <dt><tt>relative_position_bias</tt> (optional) : T</dt>
 <dd>relative position bias: addition to QxK' with shape (batch_size, num_heads, sequence_length, total_sequence_length) or (1, num_heads, sequence_length, total_sequence_length)</dd>
 <dt><tt>past_key</tt> (optional) : T</dt>
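For reference, a small NumPy sketch of how a right-padded 2D key padding mask maps onto the new (3 * batch_size + 2) layout described in the commit message. The helper name and the assumption that the padded query and key lengths are equal are illustrative only:

```python
import numpy as np

def to_1d_key_seq_len_start(mask_2d, query_seq_len=None):
    # mask_2d: (batch_size, kv_sequence_length), 1 for valid tokens, 0 for right padding.
    batch_size, kv_len = mask_2d.shape
    q_len = kv_len if query_seq_len is None else query_seq_len
    key_lens = mask_2d.sum(axis=1).astype(np.int32)                    # valid key length per batch
    query_starts = np.arange(batch_size + 1, dtype=np.int32) * q_len   # query start offsets + final end
    key_starts = np.arange(batch_size + 1, dtype=np.int32) * kv_len    # key start offsets + final end
    return np.concatenate([key_lens, query_starts, key_starts])

mask = np.array([[1, 1, 1, 0, 0, 0],
                 [1, 1, 1, 1, 1, 0]], dtype=np.int32)
print(to_1d_key_seq_len_start(mask))  # [ 3  5  0  6 12  0  6 12]
```

This reproduces the [3, 5, 0, 6, 12, 0, 6, 12] example given in the commit message above.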
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc index 07f3b49b4e190..00e843ffb97f1 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_base.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.cc @@ -41,7 +41,7 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape, // For mask_index, the following shapes are supported: // NULL, (B, 1), (1, 1) - // (B), (2 * B), + // (B), (2 * B), (3 * B + 2) // (B, T) // (B, S, T) // (B, 1, M, M) @@ -274,11 +274,13 @@ Status AttentionBase::CheckMask(const Tensor* mask_index, int64_t total_sequence_length) const { const auto& mask_dims = mask_index->Shape().GetDims(); if (mask_dims.size() == 1) { - if (mask_dims[0] != batch_size && mask_dims[0] != 2 * batch_size) { + if (mask_dims[0] != batch_size && mask_dims[0] != 2 * batch_size && mask_dims[0] != 3 * batch_size + 2) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Inputs 'mask_index' with 1D data shall have length of batch_size or 2 * batch_size"); + "Inputs 'mask_index' with 1D data shall have length of batch_size or 2 * batch_size or 3 * batch_size + 2"); } - mask_type = (mask_dims[0] == batch_size ? AttentionMaskType::MASK_1D_KEY_SEQ_LEN : AttentionMaskType::MASK_1D_END_START); + mask_type = (mask_dims[0] == batch_size ? + AttentionMaskType::MASK_1D_KEY_SEQ_LEN : + mask_dims[0] == 2 * batch_size ? AttentionMaskType::MASK_1D_END_START : AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START); } else if (mask_dims.size() == 2) { if (mask_dims[0] == batch_size && mask_dims[1] == total_sequence_length) { mask_type = AttentionMaskType::MASK_2D_KEY_PADDING; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_common.h b/onnxruntime/contrib_ops/cpu/bert/attention_common.h index 680f875d23f30..1b52ff2a0f540 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_common.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_common.h @@ -7,13 +7,16 @@ namespace onnxruntime { namespace contrib { enum AttentionMaskType { - MASK_NONE, // No mask - MASK_1D_KEY_SEQ_LEN, // [batch_size], key sequence length - MASK_1D_END_START, // [2 * batch_size] with end positions and start positions - MASK_2D_DUMMY, // dummy mask with shape [1, 1] or [batch_size, 1]. It has same effect as no mask. - MASK_2D_KEY_PADDING, // [batch_size, total_sequence_length] - MASK_3D_ATTENTION, // [batch_size, sequence_length, total_sequence_length] - MASK_4D_MEGATRON, // Megatron causal mask with shape [batch_size, 1, max_sequence_length, max_sequence_length] + MASK_NONE, // No mask + MASK_1D_KEY_SEQ_LEN, // [batch_size], key sequence length + MASK_1D_END_START, // [2 * batch_size] with end positions and start positions + MASK_1D_KEY_SEQ_LEN_START, // [3 * batch_size + 2] with [key_len[0], ..., key_len[batch_size - 1], query_start[0], + // ..., query_start[batch_size - 1], query_end[batch_size - 1], key_start[0], ..., + // key_start[batch_size - 1], key_end[batch_size - 1]] + MASK_2D_DUMMY, // dummy mask with shape [1, 1] or [batch_size, 1]. It has same effect as no mask. 
+ MASK_2D_KEY_PADDING, // [batch_size, total_sequence_length] + MASK_3D_ATTENTION, // [batch_size, sequence_length, total_sequence_length] + MASK_4D_MEGATRON, // Megatron causal mask with shape [batch_size, 1, max_sequence_length, max_sequence_length] MASK_UNKNOWN }; diff --git a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h index 80e506b49916a..cc7dad81b4dcb 100644 --- a/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h +++ b/onnxruntime/contrib_ops/cpu/bert/multihead_attention_helper.h @@ -25,7 +25,7 @@ Status CheckInputs(const T* query, float mask_filter_value, float scale, int max_threads_per_block) { - // key_padding_mask (K/V) : (B) or (B, L) or None + // key_padding_mask (K/V) : (B) or (2*B + 1) or (B, L) or None // relative_position_bias : (B, 1, S, L) // past_key : (B, N, S*, H) // past_value : (B, N, S*, H) @@ -188,8 +188,12 @@ Status CheckInputs(const T* query, if (key_padding_mask != nullptr) { mask_type = AttentionMaskType::MASK_UNKNOWN; const auto& mask_dims = key_padding_mask->Shape().GetDims(); - if (mask_dims.size() == 1 && mask_dims[0] == static_cast(batch_size)) { - mask_type = AttentionMaskType::MASK_1D_KEY_SEQ_LEN; + if (mask_dims.size() == 1) { + if (mask_dims[0] == static_cast(batch_size)) { + mask_type = AttentionMaskType::MASK_1D_KEY_SEQ_LEN; + } else if (mask_dims[0] == static_cast(3 * batch_size + 2)) { + mask_type = AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START; + } } else if (mask_dims.size() == 2 && mask_dims[0] == static_cast(batch_size) && mask_dims[1] == static_cast(kv_sequence_length)) { mask_type = AttentionMaskType::MASK_2D_KEY_PADDING; } diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index 11c982e53a90f..def1508ca2368 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -102,6 +102,7 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { // Check whether we can use fused kernel int sm = device_prop.major * 10 + device_prop.minor; bool is_mask_1d_seq_len = parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN; + bool is_mask_1d_key_seq_len_start = parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START; if (is_unidirectional_ && enable_fused_causal_attention_) { // GPT // GPT fused kernels requires left side padding. 
mask can be: @@ -151,12 +152,13 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { } #if USE_FLASH_ATTENTION + bool is_good_for_rpb = relative_position_bias != nullptr && parameters.sequence_length % (4 * sizeof(T)) == 0; bool use_memory_efficient_attention = fused_runner == nullptr && !disable_memory_efficient_attention_ && - nullptr == mask_index && // TODO: support 1D mask + (nullptr == mask_index || is_mask_1d_key_seq_len_start) && nullptr == past && nullptr == present && - nullptr == relative_position_bias && + (nullptr == relative_position_bias || is_good_for_rpb) && (sizeof(T) == 2 || // sequence length threshold is 0 in FP16 parameters.sequence_length >= attention::kMinSequenceLengthForMemoryEfficientAttentionFp32) && has_memory_efficient_attention(sm, sizeof(T) == 2); diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu index 28daf5d4af55e..c7127fecedbe5 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu @@ -445,6 +445,14 @@ Status PrepareQkv(contrib::AttentionParameters& parameters, DUMP_TENSOR_D("value", data.value, batch_size * kv_sequence_length, num_heads, v_head_size); DUMP_TENSOR_D("value_bias", data.bias + 2 * num_heads * qk_head_size, num_heads, v_head_size); + if (data.relative_position_bias != nullptr && parameters.broadcast_res_pos_bias) { + DUMP_TENSOR_D("relative_position_bias", data.relative_position_bias, num_heads, sequence_length, kv_sequence_length); + } + + if (data.mask_index != nullptr && parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START) { + DUMP_TENSOR_D("mask_index", data.mask_index, 3 * batch_size + 2, 1); + } + if (data.fused_cross_attention_kernel != nullptr) { assert(qk_head_size == v_head_size); @@ -735,11 +743,14 @@ Status QkvToContext( return Status::OK(); } + // For raw attention mask, the scalar 1/sqrt(H) is moved to combine with softmax computation. + const float scale = parameters.scale == 0.0f ? 1.f / sqrt(static_cast(qk_head_size)) + : parameters.scale; + #if USE_FLASH_ATTENTION if (data.use_memory_efficient_attention) { // We only enable fused cross attention when there is no key padding mask. // Otherwise, key have effective batch size 2 * batch_size, which is different from batch_size of query. - assert(data.mask_index == nullptr); assert(qkv_format == AttentionQkvFormat::Q_K_V_BSNH); const void* query = q; @@ -754,23 +765,26 @@ Status QkvToContext( MemoryEfficientAttentionParams p; p.sm = device_prop.major * 10 + device_prop.minor; p.is_half = sizeof(T) == 2; - p.batch_size = data.mask_index == nullptr ? parameters.batch_size : 2 * parameters.batch_size; + p.batch_size = parameters.batch_size; p.num_heads = parameters.num_heads; p.sequence_length = parameters.sequence_length; p.kv_sequence_length = parameters.total_sequence_length; p.qk_head_size = parameters.head_size; p.v_head_size = parameters.v_head_size; p.causal = parameters.is_unidirectional; - p.cu_seqlens_q = nullptr; - p.cu_seqlens_k = nullptr; + p.scale = scale; + p.seqlen_k_ptr = nullptr == data.mask_index ? nullptr : const_cast(reinterpret_cast(data.mask_index)); + p.seqstart_q_ptr = nullptr == data.mask_index ? nullptr : const_cast(reinterpret_cast(data.mask_index + batch_size)); + p.seqstart_k_ptr = nullptr == data.mask_index ? 
nullptr : const_cast(reinterpret_cast(data.mask_index + 2 * batch_size + 1)); p.query = query; p.key = key; p.value = value; + p.attn_bias = nullptr == data.relative_position_bias ? nullptr : data.relative_position_bias; + p.is_attn_bias_batched = !parameters.broadcast_res_pos_bias; p.output = data.output; p.workspace = MemoryEfficientAttentionParams::need_workspace(v_head_size, sizeof(T) == sizeof(float)) ? scratch1 : nullptr; p.stream = stream; run_memory_efficient_attention(p); - DUMP_TENSOR("cutlass output", data.output, batch_size * sequence_length, num_heads, v_head_size); return Status::OK(); } @@ -789,9 +803,6 @@ Status QkvToContext( float one = 1.0f; float zero = 0.f; - // For raw attention mask, the scalar 1/sqrt(H) is moved to combine with softmax computation. - const float scale = parameters.scale == 0.0f ? 1.f / sqrt(static_cast(qk_head_size)) - : parameters.scale; float alpha = use_raw_attention_mask ? one : scale; cublasSetStream(cublas, stream); diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 17f4665a80f77..ed38cabc464a2 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -10,7 +10,7 @@ #endif #include "contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h" -#include "contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h" +#include "41_fused_multi_head_attention/kernel_forward.h" namespace onnxruntime { namespace contrib { @@ -24,8 +24,10 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { p.query_ptr = const_cast(reinterpret_cast(params.query)); p.key_ptr = const_cast(reinterpret_cast(params.key)); p.value_ptr = const_cast(reinterpret_cast(params.value)); - p.cu_seqlens_q_ptr = params.cu_seqlens_q; - p.cu_seqlens_k_ptr = params.cu_seqlens_k; + p.attn_bias_ptr = const_cast(reinterpret_cast(params.attn_bias)); + p.seqstart_q_ptr = params.seqstart_q_ptr; + p.seqstart_k_ptr = params.seqstart_k_ptr; + p.seqlen_k_ptr = params.seqlen_k_ptr; p.logsumexp_ptr = nullptr; // [num_heads, num_queries] for backward or nullptr for forward p.output_ptr = reinterpret_cast(params.output); @@ -42,28 +44,32 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { p.head_dim = params.qk_head_size; p.head_dim_value = params.v_head_size; + p.scale = params.scale; + // When params.cu_seqlens_q is provided, num_queries is max_seq_q and num_keys will be set inside the kernel p.num_queries = params.sequence_length; p.num_keys = params.kv_sequence_length; - p.causal = params.causal; + if (params.causal) { + p.custom_mask_type = Attention::CausalFromTopLeft; + } // Input format is BxSxNxH, output is BxSxNxH p.q_strideH = params.qk_head_size; p.k_strideH = params.qk_head_size; p.v_strideH = params.v_head_size; - p.o_strideH = params.v_head_size; + p.bias_strideH = nullptr == params.attn_bias ? 0 : p.num_queries * p.num_keys; p.q_strideM = params.num_heads * params.qk_head_size; p.k_strideM = params.num_heads * params.qk_head_size; p.v_strideM = params.num_heads * params.v_head_size; + p.o_strideM = params.num_heads * params.v_head_size; + p.bias_strideM = nullptr == params.attn_bias ? 
0 : p.num_keys; p.q_strideB = static_cast(p.q_strideM) * params.sequence_length; p.k_strideB = static_cast(p.k_strideM) * params.kv_sequence_length; p.v_strideB = static_cast(p.v_strideM) * params.kv_sequence_length; - p.o_strideB = static_cast(params.num_heads) * params.v_head_size * params.sequence_length; - - p.causal = params.causal; + p.bias_strideB = params.is_attn_bias_batched ? static_cast(p.bias_strideH) * params.num_heads : 0; } constexpr auto kernel_fn = attention_kernel_batched_impl; diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h deleted file mode 100644 index 7885983f99ea6..0000000000000 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/kernel_forward.h +++ /dev/null @@ -1,947 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holdvr nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - -#pragma once - -#if USE_FLASH_ATTENTION - -#include -#include - -#include "cutlass/bfloat16.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/layout/matrix.h" -#include "cutlass/layout/vector.h" -#include "cutlass/numeric_types.h" - -#include "41_fused_multi_head_attention/attention_scaling_coefs_updater.h" -#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" -#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" -#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" -#include "cutlass/gemm/device/default_gemm_configuration.h" -#include "cutlass/gemm/kernel/default_gemm.h" -#include "cutlass/gemm/threadblock/default_mma.h" -#include "cutlass/gemm/threadblock/default_mma_core_simt.h" -#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" -#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" -#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" -#include "cutlass/gemm/threadblock/threadblock_swizzle.h" -#include "cutlass/matrix_shape.h" -#include "cutlass/platform/platform.h" -#include "cutlass/transform/threadblock/predicated_tile_iterator.h" -#include "41_fused_multi_head_attention/debug_utils.h" -#include "41_fused_multi_head_attention/epilogue_pipelined.h" -#include "41_fused_multi_head_attention/epilogue_rescale_output.h" -#include "41_fused_multi_head_attention/find_default_mma.h" -#include "41_fused_multi_head_attention/gemm_kernel_utils.h" -#include "41_fused_multi_head_attention/mma_from_smem.h" - -#include - -using namespace gemm_kernel_utils; - -namespace { -template -constexpr int getWarpsPerSm() { - return ( - Arch::kMinComputeCapability >= 80 && - !cutlass::platform::is_same::value - ? 
16 - : 12); -} -} // namespace - -template < - // The datatype of Q/K/V - typename scalar_t_, - // Architecture we are targeting (eg `cutlass::arch::Sm80`) - typename ArchTag, - // If Q/K/V are correctly aligned in memory and we can run a fast kernel - bool isAligned_, - int kQueriesPerBlock, - int kKeysPerBlock, - bool kSingleValueIteration // = `value.shape[-1] <= kKeysPerBlock` - > -struct AttentionKernel { - using scalar_t = scalar_t_; - using accum_t = float; - using lse_scalar_t = float; - using output_t = scalar_t; - // Accumulator between 2 iterations - // Using `accum_t` improves perf on f16 at the cost of - // numerical errors - using output_accum_t = accum_t; - static constexpr bool kIsAligned = isAligned_; - static constexpr int32_t kAlignLSE = 32; // block size of backward - static constexpr bool kPreloadV = ArchTag::kMinComputeCapability >= 80 && - cutlass::sizeof_bits::value == 16; - static constexpr bool kKeepOutputInRF = kSingleValueIteration; - static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF && - !cutlass::platform::is_same::value; - - static_assert(kQueriesPerBlock % 32 == 0, ""); - static_assert(kKeysPerBlock % 32 == 0, ""); - static constexpr int kNumWarpsPerBlock = - kQueriesPerBlock * kKeysPerBlock / (32 * 32); - static constexpr int kWarpSize = 32; - - // Launch bounds - static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock; - static constexpr int kMinBlocksPerSm = - getWarpsPerSm() / kNumWarpsPerBlock; - - struct Params { - // Input tensors - scalar_t* query_ptr; // [num_queries, num_heads, head_dim] - scalar_t* key_ptr; // [num_keys, num_heads, head_dim] - scalar_t* value_ptr; // [num_keys, num_heads, head_dim_value] - int32_t* cu_seqlens_q_ptr = nullptr; - int32_t* cu_seqlens_k_ptr = nullptr; - - // Output tensors - output_t* output_ptr; // [num_queries, num_heads, head_dim_value] - output_accum_t* - output_accum_ptr; // [num_queries, num_heads, head_dim_value] - lse_scalar_t* logsumexp_ptr; // [num_heads, num_queries] - can be null - - // Dimensions/strides - int32_t head_dim; - int32_t head_dim_value; - int32_t num_queries; - int32_t num_keys; - - bool causal; - - int32_t q_strideM; - int32_t k_strideM; - int32_t v_strideM; - - // Everything below is only used in `advance_to_block` - // and shouldn't use registers - int32_t q_strideH; - int32_t k_strideH; - int32_t v_strideH; - int32_t o_strideH; - int64_t q_strideB; - int64_t k_strideB; - int64_t v_strideB; - int64_t o_strideB; - int32_t num_batches; - int32_t num_heads; - - // https://github.com/NVIDIA/cutlass/issues/771 - CUTLASS_HOST_DEVICE int32_t o_strideM() const { - return head_dim_value * num_heads; - } - - // Moves pointers to what we should process - // Returns "false" if there is no work to do - CUTLASS_DEVICE bool advance_to_block() { - auto batch_id = blockIdx.z; - auto head_id = blockIdx.y; - auto query_start = blockIdx.x * kQueriesPerBlock; - - auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE; - - int64_t q_start, k_start; - // Advance to current batch - in case of different sequence lengths - if (cu_seqlens_q_ptr != nullptr) { - assert(cu_seqlens_k_ptr != nullptr); - cu_seqlens_q_ptr += batch_id; - cu_seqlens_k_ptr += batch_id; - q_start = cu_seqlens_q_ptr[0]; - k_start = cu_seqlens_k_ptr[0]; - int64_t q_next_start = cu_seqlens_q_ptr[1]; - int64_t k_next_start = cu_seqlens_k_ptr[1]; - num_queries = q_next_start - q_start; - num_keys = k_next_start - k_start; - - if (query_start >= num_queries) { - return false; - } - } else { - 
query_ptr += batch_id * q_strideB; - key_ptr += batch_id * k_strideB; - value_ptr += batch_id * v_strideB; - output_ptr += batch_id * o_strideB; - if (output_accum_ptr != nullptr) { - output_accum_ptr += batch_id * o_strideB; - } - q_start = 0; - k_start = 0; - } - - // Advance to the current batch / head / query_start - query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH; - key_ptr += k_start * k_strideM + head_id * k_strideH; - value_ptr += k_start * v_strideM + head_id * v_strideH; - output_ptr += int64_t(q_start + query_start) * o_strideM() + - head_id * o_strideH; - - if (output_accum_ptr != nullptr) { - output_accum_ptr += int64_t(q_start + query_start) * o_strideM() + - head_id * o_strideH; - } else { - // Accumulate directly in the destination buffer (eg for f32) - output_accum_ptr = (accum_t*)output_ptr; - } - if (logsumexp_ptr != nullptr) { - // lse[batch_id, head_id, query_start] - logsumexp_ptr += - batch_id * lse_dim * num_heads + head_id * lse_dim + query_start; - } - - num_queries -= query_start; - if (causal) { - num_keys = cutlass::fast_min( - int32_t(query_start + kQueriesPerBlock), num_keys); - } - num_batches = 0; // no longer used after - - // Make sure the compiler knows these variables are the same on all - // the threads of the warp. - query_ptr = warp_uniform(query_ptr); - key_ptr = warp_uniform(key_ptr); - value_ptr = warp_uniform(value_ptr); - output_ptr = warp_uniform(output_ptr); - output_accum_ptr = warp_uniform(output_accum_ptr); - logsumexp_ptr = warp_uniform(logsumexp_ptr); - num_queries = warp_uniform(num_queries); - num_keys = warp_uniform(num_keys); - head_dim = warp_uniform(head_dim); - head_dim_value = warp_uniform(head_dim_value); - return true; - } - - __host__ dim3 getBlocksGrid() const { - return dim3( - ceil_div(num_queries, (int32_t)kQueriesPerBlock), - num_heads, - num_batches); - } - __host__ dim3 getThreadsGrid() const { - return dim3(kWarpSize, kNumWarpsPerBlock, 1); - } - }; - - struct MM0 { - /* - In this first matmul, we compute a block of `Q @ K.T`. - While the calculation result is still hot in registers, we update - `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value - into a shared-memory ("AccumulatorSharedStorage") that is used later as - operand A for the second matmul (see MM1) - */ - using GemmType = DefaultGemmType; - - using OpClass = typename GemmType::OpClass; - using DefaultConfig = - typename cutlass::gemm::device::DefaultGemmConfiguration< - OpClass, - ArchTag, - scalar_t, - scalar_t, - scalar_t, // ElementC - accum_t // ElementAccumulator - >; - static constexpr int kAlignmentA = - kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment; - static constexpr int kAlignmentB = - kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; - using ThreadblockShape = cutlass::gemm:: - GemmShape; - using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; - using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma< - scalar_t, // ElementA, - cutlass::layout::RowMajor, // LayoutA, - kAlignmentA, - scalar_t, // ElementB, - cutlass::layout::ColumnMajor, // LayoutB, - kAlignmentB, - accum_t, - cutlass::layout::RowMajor, // LayoutC, - OpClass, - ArchTag, // ArchTag - ThreadblockShape, // ThreadblockShape - WarpShape, // WarpShape - typename GemmType::InstructionShape, // InstructionShape - DefaultConfig::kStages, // Should use `DefaultConfig::kStages`, but that - // uses too much smem - typename GemmType::Operator // Operator - >::DefaultMma; - using MmaCore = typename DefaultMma::MmaCore; - using IteratorA = typename DefaultMma::IteratorA; - using IteratorB = typename DefaultMma::IteratorB; - using Mma = typename DefaultMma::ThreadblockMma; - using ScalingCoefsUpdater = typename DefaultAttentionScalingCoefsUpdater< - typename Mma::Operator::IteratorC, - accum_t, - kWarpSize>::Updater; - static_assert( - MmaCore::WarpCount::kM * MmaCore::WarpCount::kN * - MmaCore::WarpCount::kK == - kNumWarpsPerBlock, - ""); - - // Epilogue to store to shared-memory in a format that we can use later for - // the second matmul - using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< - typename Mma::Operator::IteratorC, - typename Mma::Operator, - scalar_t, - WarpShape, - ThreadblockShape>; - using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; - }; - - struct MM1 { - /** - Second matmul: perform `attn @ V` where `attn` is the attention (not - normalized) and stored in shared memory - */ - using GemmType = DefaultGemmType; - - using OpClass = typename GemmType::OpClass; - using DefaultConfig = - typename cutlass::gemm::device::DefaultGemmConfiguration< - OpClass, - ArchTag, - scalar_t, - scalar_t, - output_accum_t, // ElementC - accum_t // ElementAccumulator - >; - static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem - static constexpr int kAlignmentB = - kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; - using ThreadblockShape = cutlass::gemm:: - GemmShape; - using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; - using InstructionShape = typename GemmType::InstructionShape; - - using LayoutB = cutlass::layout::RowMajor; - using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< - scalar_t, // ElementA, - cutlass::layout::RowMajor, // LayoutA, - kAlignmentA, - scalar_t, // ElementB, - LayoutB, // LayoutB, - kAlignmentB, - output_accum_t, - cutlass::layout::RowMajor, // LayoutC, - accum_t, - OpClass, - ArchTag, - ThreadblockShape, - WarpShape, - typename GemmType::InstructionShape, - typename DefaultConfig::EpilogueOutputOp, - void, // ThreadblockSwizzle - not used - DefaultConfig::kStages, - false, // SplitKSerial - typename GemmType::Operator>; - - using DefaultMmaFromSmem = - typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< - typename DefaultGemm::Mma, - typename MM0::AccumulatorSharedStorage>; - using Mma = typename DefaultMmaFromSmem::Mma; - using IteratorB = typename Mma::IteratorB; - using WarpCount = typename Mma::WarpCount; - static_assert( - WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock, - ""); - - using DefaultEpilogue = typename DefaultGemm::Epilogue; - using OutputTileIterator = - typename cutlass::epilogue::threadblock::PredicatedTileIterator< - typename DefaultEpilogue::OutputTileIterator::ThreadMap, - output_t>; - using OutputTileIteratorAccum = - typename cutlass::epilogue::threadblock::PredicatedTileIterator< - typename DefaultEpilogue::OutputTileIterator::ThreadMap, - output_accum_t>; - - struct SharedStorageMM1 { - typename Mma::SharedStorage mm; - }; - }; - - static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; - static constexpr int64_t kAlignmentK = MM0::kAlignmentB; - static constexpr int64_t kAlignmentV = 1; - - // Shared storage - depends on kernel params - struct ScalingCoefs { - cutlass::Array m_prime; - cutlass::Array s_prime; - cutlass::Array mi; - }; - - struct SharedStorageEpilogueAtEnd : ScalingCoefs { - struct SharedStorageAfterMM0 { - // Everything here might be overwritten during MM0 - typename MM0::AccumulatorSharedStorage si; - typename MM1::SharedStorageMM1 mm1; - }; - - union { - typename MM0::Mma::SharedStorage mm0; - SharedStorageAfterMM0 after_mm0; - typename MM1::DefaultEpilogue::SharedStorage epilogue; - }; - - CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& - epilogue_shared_storage() { - return epilogue; - } - }; - - struct SharedStorageEpilogueInLoop : ScalingCoefs { - struct SharedStorageAfterMM0 { - // Everything here might be overwritten during MM0 - typename MM0::AccumulatorSharedStorage si; - typename MM1::SharedStorageMM1 mm1; - typename MM1::DefaultEpilogue::SharedStorage epilogue; - }; - - union { - typename MM0::Mma::SharedStorage mm0; - SharedStorageAfterMM0 after_mm0; - }; - - CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& - epilogue_shared_storage() { - return after_mm0.epilogue; - } - }; - - using SharedStorage = typename cutlass::platform::conditional< - kSingleValueIteration || kKeepOutputInRF, - SharedStorageEpilogueAtEnd, - SharedStorageEpilogueInLoop>::type; - - static bool __host__ check_supported(Params const& p) { - CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ); - CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK); - CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV); - XFORMERS_CHECK( - p.q_strideM % kAlignmentQ == 0, "query is not correctly aligned"); - XFORMERS_CHECK( - p.k_strideM % kAlignmentK == 
0, "key is not correctly aligned"); - XFORMERS_CHECK( - p.v_strideM % kAlignmentV == 0, "value is not correctly aligned"); - XFORMERS_CHECK( - p.q_strideH % kAlignmentQ == 0, "query is not correctly aligned"); - XFORMERS_CHECK( - p.k_strideH % kAlignmentK == 0, "key is not correctly aligned"); - XFORMERS_CHECK( - p.v_strideH % kAlignmentV == 0, "value is not correctly aligned"); - return true; - } - - static void CUTLASS_DEVICE attention_kernel(Params& p) { - // In this block, we will only ever: - // - read query[query_start:query_end, :] - // - write to output[query_start:query_end, :] - - extern __shared__ char smem_buffer[]; - SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); - auto& m_prime = shared_storage.m_prime; - auto& s_prime = shared_storage.s_prime; - auto& mi = shared_storage.mi; - - static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); - if (thread_id() < kQueriesPerBlock) { - s_prime[thread_id()] = accum_t(0); - m_prime[thread_id()] = - -cutlass::platform::numeric_limits::infinity(); - mi[thread_id()] = -cutlass::platform::numeric_limits::infinity(); - } - typename MM1::Mma::FragmentC accum_o; - accum_o.clear(); - - auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { - using OutputTileIterator = typename MM1::OutputTileIterator; - return OutputTileIterator( - typename OutputTileIterator::Params{(int32_t)p.o_strideM()}, - p.output_ptr, - typename OutputTileIterator::TensorCoord{ - p.num_queries, p.head_dim_value}, - thread_id(), - {0, col}); - }; - - auto createOutputAccumIter = [&](int col) -> - typename MM1::OutputTileIteratorAccum { - using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum; - return OutputTileIteratorAccum( - typename OutputTileIteratorAccum::Params{(int32_t)p.o_strideM()}, - p.output_accum_ptr, - typename OutputTileIteratorAccum::TensorCoord{ - p.num_queries, p.head_dim_value}, - thread_id(), - {0, col}); - }; - - // Iterate through keys - for (int32_t iter_key_start = 0; iter_key_start < p.num_keys; - iter_key_start += kKeysPerBlock) { - int32_t problem_size_0_m = - cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries); - int32_t problem_size_0_n = cutlass::fast_min( - int32_t(kKeysPerBlock), p.num_keys - iter_key_start); - int32_t const& problem_size_0_k = p.head_dim; - int32_t const& problem_size_1_n = p.head_dim_value; - int32_t const& problem_size_1_k = problem_size_0_n; - - auto prologueV = [&](int blockN) { - typename MM1::Mma::IteratorB iterator_V( - typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, - p.value_ptr + iter_key_start * p.v_strideM, - {problem_size_1_k, problem_size_1_n}, - thread_id(), - cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); - MM1::Mma::prologue( - shared_storage.after_mm0.mm1.mm, - iterator_V, - thread_id(), - problem_size_1_k); - }; - - __syncthreads(); // Need to have shared memory initialized, and `m_prime` - // updated from end of prev iter - // - // MATMUL: Q.K_t - // - // Computes the block-matrix product of: - // (a) query[query_start:query_end, :] - // with - // (b) key[iter_key_start:iter_key_start + kKeysPerBlock] - // and stores that into `shared_storage.si` - // - - // Compute threadblock location - cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0}; - - cutlass::MatrixCoord tb_offset_A{ - tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()}; - - cutlass::MatrixCoord tb_offset_B{ - tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN}; - - // Construct iterators to A and B operands - typename 
MM0::IteratorA iterator_A( - typename MM0::IteratorA::Params( - typename MM0::MmaCore::LayoutA(p.q_strideM)), - p.query_ptr, - {problem_size_0_m, problem_size_0_k}, - thread_id(), - tb_offset_A); - - typename MM0::IteratorB iterator_B( - typename MM0::IteratorB::Params( - typename MM0::MmaCore::LayoutB(p.k_strideM)), - p.key_ptr + iter_key_start * p.k_strideM, - {problem_size_0_k, problem_size_0_n}, - thread_id(), - tb_offset_B); - - auto my_warp_id = warp_id(); - auto my_lane_id = lane_id(); - - // Construct thread-scoped matrix multiply - typename MM0::Mma mma( - shared_storage.mm0, thread_id(), my_warp_id, my_lane_id); - - typename MM0::Mma::FragmentC accum; - - accum.clear(); - - auto gemm_k_iterations = - (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK; - - // Compute threadblock-scoped matrix multiply-add - mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); - __syncthreads(); - - if (kPreloadV) { - prologueV(0); - } - - typename MM0::Mma::Operator::IteratorC::TensorCoord - iteratorC_tile_offset = { - (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) + - (my_warp_id % MM0::Mma::WarpCount::kM), - (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) + - (my_warp_id / MM0::Mma::WarpCount::kM)}; - - // Mask out last if causal - if (p.causal && p.num_keys - iter_key_start <= kKeysPerBlock) { - auto query_start = blockIdx.x * kQueriesPerBlock; - auto lane_offset = MM0::ScalingCoefsUpdater::get_lane_offset( - lane_id(), warp_id(), iteratorC_tile_offset); - int32_t last_col; - MM0::ScalingCoefsUpdater::iterateRows( - lane_offset, - [&](int accum_m) { - last_col = query_start + accum_m - iter_key_start; - }, - [&](int accum_m, int accum_n, int idx) { - if (accum_n > last_col) { - accum[idx] = - -cutlass::platform::numeric_limits::infinity(); - } - }, - [&](int accum_m) {}); - } - DISPATCH_BOOL(iter_key_start == 0, kIsFirst, ([&] { - DISPATCH_BOOL( - p.num_keys - iter_key_start >= kKeysPerBlock, - kFullColumns, - ([&] { - // Update `mi` from accum stored in registers - // Also updates `accum` with accum[i] <- - // exp(accum[i] * scale - // - mi) - MM0::ScalingCoefsUpdater::update< - kQueriesPerBlock, - kFullColumns, - kIsFirst, - kKeepOutputInRF>( - accum_o, - accum, - mi, - m_prime, - s_prime, - lane_id(), - thread_id(), - warp_id(), - p.num_keys - iter_key_start, - iteratorC_tile_offset, - 1.0f / cutlass::fast_sqrt(float(p.head_dim))); - })); - })); - - // Output results to shared-memory - int warp_idx_mn_0 = my_warp_id % - (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN); - auto output_tile_coords = cutlass::MatrixCoord{ - warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM, - warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM}; - - MM0::B2bGemm::accumToSmem( - shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords); - - __syncthreads(); - - // - // MATMUL: Attn . V - // Run the matmul `attn @ V` for a block of attn and V. - // `attn` is read from shared memory (in `shared_storage_si`) - // `V` is read from global memory (with iterator_B) - // - - const int64_t nBlockN = kSingleValueIteration - ? 
1 - : ceil_div( - (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN)); - for (int blockN = 0; blockN < nBlockN; ++blockN) { - int gemm_k_iterations = - (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK; - - // Compute threadblock-scoped matrix multiply-add and store it in accum - // (in registers) - if (!kPreloadV) { - __syncthreads(); // we share shmem between mma and epilogue - } - - typename MM1::Mma::IteratorB iterator_V( - typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, - p.value_ptr + iter_key_start * p.v_strideM, - {problem_size_1_k, problem_size_1_n}, - thread_id(), - cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); - typename MM1::Mma mma_pv( - shared_storage.after_mm0.mm1.mm, - shared_storage.after_mm0.si, - (int)thread_id(), - (int)warp_id(), - (int)lane_id(), - (int)problem_size_1_k); - mma_pv.set_prologue_done(kPreloadV); - if (!kKeepOutputInRF) { - accum_o.clear(); - } - mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o); - __syncthreads(); - - if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) { - prologueV(blockN + 1); - } - - if (!kKeepOutputInRF) { - DISPATCH_BOOL( - iter_key_start == 0, kIsFirst, ([&] { - DISPATCH_BOOL( - (iter_key_start + kKeysPerBlock) >= p.num_keys, - kIsLast, - ([&] { - using DefaultEpilogue = typename MM1::DefaultEpilogue; - using DefaultOp = - typename MM1::DefaultConfig::EpilogueOutputOp; - using ElementCompute = typename DefaultOp::ElementCompute; - using EpilogueOutputOp = typename cutlass::epilogue:: - thread::MemoryEfficientAttentionNormalize< - typename cutlass::platform::conditional< - kIsLast, - output_t, - output_accum_t>::type, - output_accum_t, - DefaultOp::kCount, - typename DefaultOp::ElementAccumulator, - ElementCompute, - kIsFirst, - kIsLast, - cutlass::Array>; - using Epilogue = typename cutlass::epilogue::threadblock:: - EpiloguePipelined< - typename DefaultEpilogue::Shape, - typename MM1::Mma::Operator, - DefaultEpilogue::kPartitionsK, - typename cutlass::platform::conditional< - kIsLast, - typename MM1::OutputTileIterator, - typename MM1::OutputTileIteratorAccum>::type, - typename DefaultEpilogue:: - AccumulatorFragmentIterator, - typename DefaultEpilogue::WarpTileIterator, - typename DefaultEpilogue::SharedLoadIterator, - EpilogueOutputOp, - typename DefaultEpilogue::Padding, - DefaultEpilogue::kFragmentsPerIteration, - true, // IterationsUnroll - typename MM1::OutputTileIteratorAccum // Read - // iterator - >; - - int col = blockN * MM1::Mma::Shape::kN; - auto source_iter = createOutputAccumIter(col); - auto dest_iter = call_conditional< - kIsLast, - decltype(createOutputIter), - decltype(createOutputAccumIter)>:: - apply(createOutputIter, createOutputAccumIter, col); - EpilogueOutputOp rescale(s_prime, m_prime); - Epilogue epilogue( - shared_storage.epilogue_shared_storage(), - thread_id(), - warp_id(), - lane_id()); - epilogue(rescale, dest_iter, accum_o, source_iter); - })); - })); - if (!kSingleValueIteration) { - __syncthreads(); - } - } - } - __syncthreads(); // we modify `m_prime` after - } - - if (kKeepOutputInRF) { - constexpr bool kIsFirst = true; - constexpr bool kIsLast = true; - using DefaultEpilogue = typename MM1::DefaultEpilogue; - using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; - using ElementCompute = typename DefaultOp::ElementCompute; - using EpilogueOutputOp = - typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize< - output_t, // output - output_accum_t, // source - DefaultOp::kCount, - typename 
DefaultOp::ElementAccumulator, // accum - output_accum_t, // compute - kIsFirst, - kIsLast, - cutlass::Array>; - using Epilogue = - typename cutlass::epilogue::threadblock::EpiloguePipelined< - typename DefaultEpilogue::Shape, - typename MM1::Mma::Operator, - DefaultEpilogue::kPartitionsK, - typename MM1::OutputTileIterator, // destination - typename DefaultEpilogue::AccumulatorFragmentIterator, - typename DefaultEpilogue::WarpTileIterator, - typename DefaultEpilogue::SharedLoadIterator, - EpilogueOutputOp, - typename DefaultEpilogue::Padding, - DefaultEpilogue::kFragmentsPerIteration, - true, // IterationsUnroll - typename MM1::OutputTileIteratorAccum // source tile - >; - auto dest_iter = createOutputIter(0); - EpilogueOutputOp rescale(s_prime, m_prime); - Epilogue epilogue( - shared_storage.epilogue_shared_storage(), - thread_id(), - warp_id(), - lane_id()); - epilogue(rescale, dest_iter, accum_o); - } - - // 7. Calculate logsumexp - // To make the backward easier, we pad logsumexp with `inf` - // this avoids a few bound checks, and is not more expensive during fwd - static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); - if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) { - auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE; - if (thread_id() < p.num_queries) { - p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()]) + - cutlass::fast_log(accum_t(s_prime[thread_id()])); - } else if (thread_id() < lse_dim) { - p.logsumexp_ptr[thread_id()] = - cutlass::platform::numeric_limits::infinity(); - } - } - } - - static CUTLASS_DEVICE int8_t lane_id() { - return threadIdx.x; - } - static CUTLASS_DEVICE int8_t warp_id() { - return threadIdx.y; - } - static CUTLASS_DEVICE int16_t thread_id() { - return threadIdx.x + threadIdx.y * blockDim.x; - } -}; - -template -__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) - attention_kernel_batched_impl(typename AK::Params p) { - if (!p.advance_to_block()) { - return; - } - AK::attention_kernel(p); -} - -template -__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) - attention_kernel_batched(typename AK::Params params); - -#define _ATTENTION_KERNEL_FORWARD_BEGIN(...) 
\ - template <> \ - __global__ void __launch_bounds__( \ - __VA_ARGS__::kNumThreads, __VA_ARGS__::kMinBlocksPerSm) \ - attention_kernel_batched<__VA_ARGS__>(typename __VA_ARGS__::Params p) { \ - using Kernel = __VA_ARGS__; -#define _ATTENTION_KERNEL_FORWARD_END() } - -#ifdef __CUDA_ARCH__ -#define __CUDA_ARCH_OR_ZERO__ __CUDA_ARCH__ -#else -#define __CUDA_ARCH_OR_ZERO__ 0 -#endif - -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD( \ - ARCH, \ - SCALAR_T, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER) \ - _ATTENTION_KERNEL_FORWARD_BEGIN(AttentionKernel< \ - SCALAR_T, \ - cutlass::arch::Sm##ARCH, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER>) \ - if (!p.advance_to_block()) { \ - return; \ - } \ - Kernel::attention_kernel(p); \ - _ATTENTION_KERNEL_FORWARD_END(); - -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED( \ - ARCH, \ - SCALAR_T, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER) \ - _ATTENTION_KERNEL_FORWARD_BEGIN(AttentionKernel< \ - SCALAR_T, \ - cutlass::arch::Sm##ARCH, \ - IS_ALIGNED, \ - QUERIES_PER_BLOCK, \ - KEYS_PER_BLOCK, \ - SINGLE_VALUE_ITER>) \ - printf( \ - "FATAL: this function is for sm%d, but was built for sm%d\n", \ - int(ARCH), \ - int(__CUDA_ARCH_OR_ZERO__)); \ - _ATTENTION_KERNEL_FORWARD_END(); - -// All kernels are disabled by default -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(50, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(70, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(75, __VA_ARGS__) -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD_DISABLED(80, __VA_ARGS__) - -// Enable the right one based on __CUDA_ARCH__ -#ifndef __CUDA_ARCH__ -#elif __CUDA_ARCH__ < 500 -//#error "Need cuda arch at least 5.0" -#elif __CUDA_ARCH__ < 700 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM50(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(50, __VA_ARGS__) -#elif __CUDA_ARCH__ < 750 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM70(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(70, __VA_ARGS__) -#elif __CUDA_ARCH__ < 800 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM75(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(75, __VA_ARGS__) -#elif __CUDA_ARCH__ >= 800 -#undef INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80 -#define INSTANTIATE_ATTENTION_KERNEL_FORWARD_SM80(...) \ - INSTANTIATE_ATTENTION_KERNEL_FORWARD(80, __VA_ARGS__) -#endif - -#endif diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h index d4484628b6f32..3cd86674f1325 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h @@ -21,15 +21,21 @@ struct MemoryEfficientAttentionParams { int32_t qk_head_size; int32_t v_head_size; bool causal; + // The default shape of attn_bias is [1, N, S, S*]. Sometimes we need to use [B, N, S, S*] in custom models. 
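The three new pointer fields declared just below all index into one packed int32 buffer of length 3 * batch_size + 2: the per-batch valid key lengths come first, followed by batch_size + 1 query start offsets and batch_size + 1 key start offsets, which matches the pointer arithmetic in the attention_impl.cu hunk above (offsets 0, batch_size and 2 * batch_size + 1) and the test mask {8, 0, 8, 0, 8} added later in this patch. A minimal sketch of assembling such a buffer follows; the helper name is made up, and treating the start offsets as multiples of the padded sequence lengths is an assumption that is merely consistent with the test data, not something the patch spells out.

```cpp
// Sketch only (not ONNX Runtime code): build the packed 1D key padding mask that
// seqlen_k_ptr, seqstart_q_ptr and seqstart_k_ptr point into at offsets 0, B and 2B + 1.
#include <cstdint>
#include <vector>

std::vector<int32_t> BuildPackedKeyPaddingMask(const std::vector<int32_t>& key_lengths,  // valid keys per batch entry
                                               int32_t sequence_length,                  // padded query length S
                                               int32_t kv_sequence_length) {             // padded key length S*
  const int32_t batch_size = static_cast<int32_t>(key_lengths.size());
  std::vector<int32_t> packed;
  packed.reserve(static_cast<size_t>(3 * batch_size + 2));

  // [0, B): valid key length per batch entry (seqlen_k_ptr).
  packed.insert(packed.end(), key_lengths.begin(), key_lengths.end());

  // [B, 2B + 1): query start offsets, B + 1 values (seqstart_q_ptr).
  for (int32_t b = 0; b <= batch_size; ++b) packed.push_back(b * sequence_length);

  // [2B + 1, 3B + 2): key start offsets, B + 1 values (seqstart_k_ptr).
  for (int32_t b = 0; b <= batch_size; ++b) packed.push_back(b * kv_sequence_length);

  // batch_size = 1, sequence_length = kv_sequence_length = 8 -> {8, 0, 8, 0, 8}
  return packed;
}
```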
+ bool is_attn_bias_batched; - int32_t* cu_seqlens_q; - int32_t* cu_seqlens_k; + float scale; - const void* query; // [B, S, N, H] - const void* key; // [B, L, N, H], where L is kv_sequence_length - const void* value; // [B, L, N, H_v] - void* output; // [B, S, N, H_v] - void* workspace; // [B, S, N, H_v] when kNeedsOutputAccumulatorBuffer, nullptr otherwise + int32_t* seqstart_q_ptr; + int32_t* seqstart_k_ptr; + int32_t* seqlen_k_ptr; + + const void* query; // [B, S, N, H] + const void* key; // [B, L, N, H], where L is kv_sequence_length + const void* value; // [B, L, N, H_v] + const void* attn_bias; // [N, S, S*] or null + void* output; // [B, S, N, H_v] + void* workspace; // [B, S, N, H_v] when kNeedsOutputAccumulatorBuffer, nullptr otherwise cudaStream_t stream; static bool need_workspace(size_t v_head_size, bool is_float) { diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index ac3c7afcb11f8..f077d56f03b78 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -116,6 +116,7 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { int sm = device_prop.major * 10 + device_prop.minor; bool is_mask_1d_seq_len = parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN; + bool is_mask_1d_key_seq_len_start = parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START; bool use_fused_cross_attention = !disable_fused_cross_attention_ && nullptr == key_padding_mask && @@ -168,12 +169,14 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { parameters.sequence_length >= attention::kMinSequenceLengthForMemoryEfficientAttentionFp32 || parameters.kv_sequence_length >= attention::kMinSequenceLengthForMemoryEfficientAttentionFp32; + bool is_good_for_rpb = relative_position_bias != nullptr && parameters.sequence_length % (4 * sizeof(T)) == 0; + bool use_memory_efficient_attention = fused_runner == nullptr && fused_cross_attention_kernel == nullptr && !disable_memory_efficient_attention_ && is_long_sequence && - nullptr == key_padding_mask && // TODO: support 1D mask - nullptr == relative_position_bias && + (relative_position_bias == nullptr || is_good_for_rpb) && + (nullptr == key_padding_mask || is_mask_1d_key_seq_len_start) && has_memory_efficient_attention(sm, sizeof(T) == 2); #else constexpr bool use_memory_efficient_attention = false; diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index b205b64954559..4cc07d876413e 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -272,7 +272,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( "mask_index", "Attention mask with shape (batch_size, 1, max_sequence_length, max_sequence_length), " "(batch_size, total_sequence_length) or (batch_size, sequence_length, total_sequence_length), " - "or index with shape (batch_size) or (2 * batch_size)", + "or index with shape (batch_size) or (2 * batch_size) or (3 * batch_size + 2)", "M", OpSchema::Optional) .Input(4, @@ -590,7 +590,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA( OpSchema::Optional) .Input(4, "key_padding_mask", - "Key padding mask with shape (batch_size) or (batch_size, kv_sequence_length)", + "Key padding mask with shape (batch_size) or (3 * batch_size + 2) or (batch_size, kv_sequence_length)", "M", OpSchema::Optional) .Input(5, diff --git 
a/onnxruntime/test/contrib_ops/attention_op_test_helper.cc b/onnxruntime/test/contrib_ops/attention_op_test_helper.cc index 5c7f9dfab06c8..c7f7c7b653044 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test_helper.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test_helper.cc @@ -3090,6 +3090,195 @@ void GetSelfAttentionDataWithPast(AttentionTestData& data) { data.is_static_kv = false; } +void GetAttentionDataCutlassRelPosBias(AttentionTestData& data) { + data.hidden_size = 8; + data.v_hidden_size = 8; + data.num_heads = 2; + data.batch_size = 1; + data.sequence_length = 8; + data.kv_sequence_length = 0; + data.mask_type = AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START; + + data.key_padding_mask_data = {8, 0, 8, 0, 8}; + + data.skip_kernel_types = { + AttentionKernelType::AttentionKernel_TrtFlashAttention, + AttentionKernelType::AttentionKernel_TrtFusedCrossAttention, + AttentionKernelType::AttentionKernel_TrtFusedAttention}; + + { + data.query_data = { + -0.029273793f, 0.079709493f, 0.064531095f, 0.24270254f, + -0.28326464f, 0.20984903f, -0.10173888f, 0.18373983f, + + 0.089472905f, -0.0063416883f, -0.049477674f, 0.36512995f, + -0.23620239f, 0.1464397f, 0.068258412f, 0.31627196f, + + 0.12436871f, -0.0075563118f, -0.11576633f, 0.41008925f, + -0.19456652f, 0.20145792f, 0.11790096f, 0.39789933f, + + 0.002485469f, 0.029660821f, -0.043821491f, 0.3892332f, + -0.26994205f, 0.14530671f, 0.12950704f, 0.36185294f, + + -0.029273793f, 0.079709493f, 0.064531095f, 0.24270254f, + -0.28326464f, 0.20984903f, -0.10173888f, 0.18373983f, + + 0.089472905f, -0.0063416883f, -0.049477674f, 0.36512995f, + -0.23620239f, 0.1464397f, 0.068258412f, 0.31627196f, + + 0.12436871f, -0.0075563118f, -0.11576633f, 0.41008925f, + -0.19456652f, 0.20145792f, 0.11790096f, 0.39789933f, + + 0.002485469f, 0.029660821f, -0.043821491f, 0.3892332f, + -0.26994205f, 0.14530671f, 0.12950704f, 0.36185294f, + }; + } + { + data.key_data = { + -0.32538497f, 0.34121913f, -0.18170178f, -0.015152611f, + 0.20429322f, 0.25979176f, 0.21269324f, 0.0025638193f, + + -0.24246037f, 0.21112341f, -0.36959589f, -0.16091451f, + 0.24183474f, 0.18856162f, 0.094487116f, -0.3053959f, + + -0.35736683f, 0.29276621f, -0.4217523f, -0.20031664f, + 0.33148992f, 0.26928401f, 0.19360018f, -0.39494509f, + + -0.28043351f, 0.24279942f, -0.29154932f, -0.13657911f, + 0.31932494f, 0.3500579f, 0.027172565f, -0.19327414f, + + -0.32538497f, 0.34121913f, -0.18170178f, -0.015152611f, + 0.20429322f, 0.25979176f, 0.21269324f, 0.0025638193f, + + -0.24246037f, 0.21112341f, -0.36959589f, -0.16091451f, + 0.24183474f, 0.18856162f, 0.094487116f, -0.3053959f, + + -0.35736683f, 0.29276621f, -0.4217523f, -0.20031664f, + 0.33148992f, 0.26928401f, 0.19360018f, -0.39494509f, + + -0.28043351f, 0.24279942f, -0.29154932f, -0.13657911f, + 0.31932494f, 0.3500579f, 0.027172565f, -0.19327414f, + }; + } + + { + data.value_data = { + 0.56916672f, -0.2443777f, 0.47111356f, -0.52134115f, + 0.010381341f, 0.0696759f, -0.071910433f, -0.35201436f, + + 0.70809275f, -0.24479815f, 0.41633749f, -0.34744334f, + -0.0044222325f, 0.25929695f, -0.087832771f, -0.281232f, + + 0.90039468f, -0.28931504f, 0.56394172f, -0.43948689f, + -0.05856207f, 0.33713666f, -0.10320446f, -0.38833332f, + + 0.76054728f, -0.29080144f, 0.50414616f, -0.42371163f, + -0.047198489f, 0.31959397f, -0.22683662f, -0.30321664f, + + 0.56916672f, -0.2443777f, 0.47111356f, -0.52134115f, + 0.010381341f, 0.0696759f, -0.071910433f, -0.35201436f, + + 0.70809275f, -0.24479815f, 0.41633749f, -0.34744334f, + -0.0044222325f, 0.25929695f, 
-0.087832771f, -0.281232f, + + 0.90039468f, -0.28931504f, 0.56394172f, -0.43948689f, + -0.05856207f, 0.33713666f, -0.10320446f, -0.38833332f, + + 0.76054728f, -0.29080144f, 0.50414616f, -0.42371163f, + -0.047198489f, 0.31959397f, -0.22683662f, -0.30321664f, + }; + } + + { + data.bias_data = { + -0.38124341f, 0.02696526f, -0.11914945f, -0.43795273f, + 0.04772711f, -0.03419551f, -0.30606642f, 0.42656231f, + -0.25891554f, 0.13431972f, 0.22861153f, 0.06360734f, + -0.10595283f, -0.42839217f, 0.28931111f, -0.13180739f, + 0.27079183f, 0.42074734f, -0.40314156f, -0.43726659f, + -0.40546918f, 0.06927037f, 0.16979086f, 0.41458064f + }; + } + + { + data.rel_pos_bias_data = { + -10.808288f, -10.887209f, 7.8799553f, -4.6565766f, + -1.6700006f, -0.033962168f, 7.4929152f, 10.944146f, + 8.640254f, -18.862164f, -3.1202927f, -6.3049207f, + 3.4508536f, 11.722519f, 3.3550568f, -5.4888172f, + + -2.0828252f, -13.241742f, 2.9868939f, 1.4455698f, + -15.262972f, -10.457437f, -8.4519463f, -4.4281874f, + 10.212368f, -0.28622282f, 12.087646f, 6.5218501f, + 8.1785011f, 13.985523f, -8.2068987f, 5.4260745f, + + -10.808288f, -10.887209f, 7.8799553f, -4.6565766f, + -1.6700006f, -0.033962168f, 7.4929152f, 10.944146f, + 8.640254f, -18.862164f, -3.1202927f, -6.3049207f, + 3.4508536f, 11.722519f, 3.3550568f, -5.4888172f, + + -2.0828252f, -13.241742f, 2.9868939f, 1.4455698f, + -15.262972f, -10.457437f, -8.4519463f, -4.4281874f, + 10.212368f, -0.28622282f, 12.087646f, 6.5218501f, + 8.1785011f, 13.985523f, -8.2068987f, 5.4260745f, + + -10.808288f, -10.887209f, 7.8799553f, -4.6565766f, + -1.6700006f, -0.033962168f, 7.4929152f, 10.944146f, + 8.640254f, -18.862164f, -3.1202927f, -6.3049207f, + 3.4508536f, 11.722519f, 3.3550568f, -5.4888172f, + + -2.0828252f, -13.241742f, 2.9868939f, 1.4455698f, + -15.262972f, -10.457437f, -8.4519463f, -4.4281874f, + 10.212368f, -0.28622282f, 12.087646f, 6.5218501f, + 8.1785011f, 13.985523f, -8.2068987f, 5.4260745f, + + -10.808288f, -10.887209f, 7.8799553f, -4.6565766f, + -1.6700006f, -0.033962168f, 7.4929152f, 10.944146f, + 8.640254f, -18.862164f, -3.1202927f, -6.3049207f, + 3.4508536f, 11.722519f, 3.3550568f, -5.4888172f, + + -2.0828252f, -13.241742f, 2.9868939f, 1.4455698f, + -15.262972f, -10.457437f, -8.4519463f, -4.4281874f, + 10.212368f, -0.28622282f, 12.087646f, 6.5218501f, + 8.1785011f, 13.985523f, -8.2068987f, 5.4260745f, + }; + } + + { + data.fp16_output_data = { + 1.0419922f, 0.13000488f, 0.10528564f, -0.86230469f, + -0.45336914f, 0.39013672f, -0.048858643f, 0.10571289f, + + 0.97265625f, 0.17590332f, 0.015625f, -0.79248047f, + -0.40917969f, 0.31933594f, 0.082763672f, 0.12976074f, + + 1.1455078f, 0.13134766f, 0.15014648f, -0.87451172f, + -0.46142578f, 0.40161133f, 0.04309082f, 0.042663574f, + + 1.0009766f, 0.17004395f, 0.033752441f, -0.80078125f, + -0.41625977f, 0.33349609f, 0.080383301f, 0.11846924f, + + 1.0419922f, 0.13000488f, 0.10528564f, -0.86230469f, + -0.45336914f, 0.39013672f, -0.048858643f, 0.10571289f, + + 0.97265625f, 0.17590332f, 0.015625f, -0.79248047f, + -0.40917969f, 0.31933594f, 0.082763672f, 0.12976074f, + + 1.1455078f, 0.13134766f, 0.15014648f, -0.87451172f, + -0.46142578f, 0.40161133f, 0.04309082f, 0.042663574f, + + 1.0009766f, 0.17004395f, 0.033752441f, -0.80078125f, + -0.41625977f, 0.33349609f, 0.080383301f, 0.11846924f, + }; + } + + { + data.fp32_output_data = {}; + } + + data.is_static_kv = false; +} + bool SkipAttentionKernel(AttentionTestData& data, AttentionKernelType kernel_type) { return std::find(data.skip_kernel_types.begin(), data.skip_kernel_types.end(), 
kernel_type) != data.skip_kernel_types.end(); } diff --git a/onnxruntime/test/contrib_ops/attention_op_test_helper.h b/onnxruntime/test/contrib_ops/attention_op_test_helper.h index 807e1e207dac4..bfd65c5794ae5 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test_helper.h +++ b/onnxruntime/test/contrib_ops/attention_op_test_helper.h @@ -62,6 +62,8 @@ void GetCrossAttentionData_HeadSize16(AttentionTestData& data); void GetCrossAttentionDataWithPast(AttentionTestData& data); void GetSelfAttentionDataWithPast(AttentionTestData& data); +void GetAttentionDataCutlassRelPosBias(AttentionTestData& data); + bool SkipAttentionKernel(AttentionTestData& data, AttentionKernelType kernel_type); } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc index 415a1c6f8f393..d01e07a46ab3f 100644 --- a/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/multihead_attention_op_test.cc @@ -55,7 +55,9 @@ static void RunMultiHeadAttentionTest( std::vector key_dims = {batch_size, is_static_kv ? kv_sequence_length : sequence_length, hidden_size}; std::vector value_dims = {batch_size, is_static_kv ? kv_sequence_length : sequence_length, v_hidden_size}; std::vector bias_dims = {hidden_size + hidden_size + v_hidden_size}; - std::vector rel_pos_bias_dims = {1, num_heads, sequence_length, sequence_length + kv_sequence_length}; + // TODO(wy): Introduce past sequence length to avoid using kv_sequence_length. + std::vector rel_pos_bias_dims = + {1, num_heads, sequence_length, past_key_data.size() ? sequence_length + kv_sequence_length : sequence_length}; std::vector past_key_dims = {batch_size, num_heads, kv_sequence_length, hidden_size / num_heads}; std::vector past_value_dims = past_key_dims; std::vector output_dims = {batch_size, sequence_length, v_hidden_size}; @@ -82,9 +84,10 @@ static void RunMultiHeadAttentionTest( std::vector mask_dims_1 = {batch_size}; std::vector mask_dims_2 = {batch_size, kv_sequence_length}; + std::vector mask_dims_3 = {3 * batch_size + 2}; std::vector& key_padding_mask_dims = (mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN) - ? mask_dims_1 - : mask_dims_2; + ? mask_dims_1 + : (mask_type == AttentionMaskType::MASK_2D_KEY_PADDING ? 
mask_dims_2 : mask_dims_3); if (use_float16) { tester.AddInput("query", query_dims, ToFloat16(query)); @@ -487,5 +490,11 @@ TEST(MultiHeadAttentionTest, SelfAttentionWithPast) { RunMultiHeadAttentionTests(data); } +TEST(MultiHeadAttentionTest, AttentionCutlassRelPosBias) { + AttentionTestData data; + GetAttentionDataCutlassRelPosBias(data); + RunMultiHeadAttentionTests(data); +} + } // namespace test } // namespace onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index f5e35a3899e89..4a55eaa33e89e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.32 + version: 1.0.36 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.32 + version: 1.0.36 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. From 44ba23e0f52d3aca3cc9b194497664a96752992b Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 23 Mar 2023 12:31:38 -0700 Subject: [PATCH 07/20] Rename DecoderMaskedMHA to DecoderMaskedSelfAttn (#15166) ### Description As synced offline, rename this op and will create another op for mha that supports both self and cross attention. ### Motivation and Context --------- Co-authored-by: Ubuntu --- docs/ContribOperators.md | 6 +- docs/OperatorKernels.md | 2 +- .../cpu/transformers/beam_search_impl_base.h | 6 +- .../cpu/transformers/beam_search_impl_gpt.h | 28 ++++----- .../transformers/generation_device_helper.cc | 6 +- .../transformers/generation_device_helper.h | 4 +- .../transformers/greedy_search_impl_base.h | 6 +- .../cpu/transformers/greedy_search_impl_gpt.h | 8 +-- .../cpu/transformers/subgraph_base.cc | 6 +- .../cpu/transformers/subgraph_base.h | 2 +- .../cpu/transformers/subgraph_gpt.cc | 12 ++-- .../cpu/transformers/subgraph_gpt.h | 2 +- .../contrib_ops/cuda/cuda_contrib_kernels.cc | 8 +-- ...on.cc => decoder_masked_self_attention.cc} | 22 +++---- ...tion.h => decoder_masked_self_attention.h} | 4 +- .../decoder_masked_multihead_attention_128.cu | 10 ++-- .../decoder_masked_multihead_attention_64.cu | 10 ++-- ...decoder_masked_multihead_attention_impl.cu | 30 +++++----- .../decoder_masked_multihead_attention_impl.h | 6 +- ...er_masked_multihead_attention_impl_utils.h | 8 +-- .../cuda/transformers/generation_cuda_impl.cu | 6 +- .../cuda/transformers/generation_cuda_impl.h | 2 +- .../transformers/generation_device_helper.cc | 18 +++--- .../transformers/generation_device_helper.h | 2 +- .../core/graph/contrib_ops/bert_defs.cc | 16 ++--- onnxruntime/core/graph/contrib_ops/ms_opset.h | 4 +- .../tools/transformers/convert_generation.py | 59 ++++++++----------- ...oder_masked_multihead_attention_op_test.cc | 8 +-- .../python/transformers/test_generation.py | 18 +++--- 29 files changed, 155 insertions(+), 164 deletions(-) rename onnxruntime/contrib_ops/cuda/decoder/{decoder_masked_multihead_attention.cc => decoder_masked_self_attention.cc} (92%) rename onnxruntime/contrib_ops/cuda/decoder/{decoder_masked_multihead_attention.h => decoder_masked_self_attention.h} (71%) diff --git 
a/docs/ContribOperators.md b/docs/ContribOperators.md index 7bf1e3d0f646c..99ad6a6d0f63e 100755 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -20,7 +20,7 @@ Do not modify directly.* * com.microsoft.ConvTransposeWithDynamicPads * com.microsoft.CropAndResize * com.microsoft.DecoderAttention - * com.microsoft.DecoderMaskedMultiheadAttention + * com.microsoft.DecoderMaskedSelfAttention * com.microsoft.DequantizeBFP * com.microsoft.DequantizeLinear * com.microsoft.DequantizeWithOrder @@ -1102,9 +1102,9 @@ This version of the operator has been available since version 1 of the 'com.micr
-### **com.microsoft.DecoderMaskedMultiheadAttention** +### **com.microsoft.DecoderMaskedSelfAttention** - Uni-directional attention that supports input sequence length of 1. + Self attention that supports input sequence length of 1. The weights for input projection of Q, K and V are merged. The data is stacked on the second dimension. Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). Here hidden_size is the hidden dimension of Q and K, diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 845defa7efdb4..03a6d10667ae6 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -798,7 +798,7 @@ Do not modify directly.* |ComplexMulConj|*in* A:**T**
<br> *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(float), tensor(float16)|
 |ConvTransposeWithDynamicPads|*in* X:**T**<br> *in* W:**T**<br> *in* Pads:**tensor(int64)**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float)|
 |DecoderAttention|*in* query:**T**<br> *in* key:**T**<br> *in* q_weight:**T**<br> *in* kv_weight:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**B**<br> *in* key_cache:**T**<br> *in* value_cache:**T**<br> *in* static_kv:**B**<br> *in* use_past:**B**<br> *in* has_layer_state:**B**<br> *in* has_key_padding_mask:**B**<br> *out* output:**T**<br> *out* new_key_cache:**T**<br> *out* new_value_cache:**T**|1+|**T** = tensor(float), tensor(float16)|
-|DecoderMaskedMultiheadAttention|*in* input:**T**<br> *in* weights:**T**<br> *in* bias:**T**<br> *in* mask_index:**M**<br> *in* past:**T**<br> *in* relative_position_bias:**T**<br> *in* past_sequence_length:**M**<br> *in* beam_width:**M**<br> *in* cache_indirection:**M**<br> *out* output:**T**<br> *out* present:**T**|1+|**T** = tensor(float), tensor(float16)|
+|DecoderMaskedSelfAttention|*in* input:**T**<br> *in* weights:**T**<br> *in* bias:**T**<br> *in* mask_index:**M**<br> *in* past:**T**<br> *in* relative_position_bias:**T**<br> *in* past_sequence_length:**M**<br> *in* beam_width:**M**<br> *in* cache_indirection:**M**<br> *out* output:**T**<br> *out* present:**T**|1+|**T** = tensor(float), tensor(float16)|
 |DequantizeLinear|*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|1+|**T1** = tensor(int8), tensor(uint8)<br> **T2** = tensor(float16)|
 |DequantizeWithOrder|*in* input:**Q**<br> *in* scale_input:**S**<br> *out* output:**F**|1+|**F** = tensor(float), tensor(float16)<br> **Q** = tensor(int8)<br> **S** = tensor(float)|
 |EmbedLayerNormalization|*in* input_ids:**T1**<br> *in* segment_ids:**T1**<br> *in* word_embedding:**T**<br> *in* position_embedding:**T**<br> *in* segment_embedding:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* mask:**T1**<br> *in* position_ids:**T1**<br> *out* output:**T**<br> *out* mask_index:**T1**<br>
*out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)| diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h index 75d161a2cd1e5..9869c95f2ec41 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_base.h @@ -22,7 +22,7 @@ struct BeamSearchState : public IBeamSearchState { int max_length, int num_heads, int head_size, - int has_decoder_masked_multihead_attention, + int has_decoder_masked_self_attention, bool output_scores, bool use_position) { size_t batch_beam_size = SafeInt(batch_size) * num_beams; @@ -53,9 +53,9 @@ struct BeamSearchState : public IBeamSearchState { this->remaining_scores = this->scores; } - if (has_decoder_masked_multihead_attention) { + if (has_decoder_masked_self_attention) { // We need a temp staging buffer to do the past 'K' state re-ordering that is needed - // when using DecoderMaskedMultiheadAttention + // when using DecoderMaskedSelfAttention TensorShape staging_for_past_state_reorder_buffer_shape = {static_cast(batch_beam_size), num_heads, max_length, head_size}; Tensor temp(DataTypeImpl::GetType(), staging_for_past_state_reorder_buffer_shape, allocator); diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h index dc6d33da9974d..b8229387196cb 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h @@ -49,7 +49,7 @@ class BeamSearchGpt : public BeamSearchBase { update_feeds_func_(update_feeds_func), cuda_device_prop_(cuda_device_prop), cuda_device_arch_(cuda_device_arch) { - if (gpt_subgraph_.has_decoder_masked_multihead_attention_) { + if (gpt_subgraph_.has_decoder_masked_self_attention_) { ORT_ENFORCE(cuda_device_arch_ >= 530, "Decoder masked multihead attention can only be used on " "GPU cards of compute capability 5.3 or higher. " @@ -69,7 +69,7 @@ class BeamSearchGpt : public BeamSearchBase { OrtValue& expanded_input_ids, std::vector& feeds, IAllocatorUniquePtr& buffer, - bool add_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + bool add_beam_search_specific_inputs_for_decoder_masked_self_attention); // Update the input for next iteration. 
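For a sense of scale, the staging buffer introduced above for re-ordering the past 'K' state is shaped [batch_beam_size, num_heads, max_length, head_size] and is reused across all layers. A back-of-the-envelope footprint helper, as a sketch only (the function name and the example dimensions are illustrative, not taken from the patch):

```cpp
// Sketch: bytes occupied by the shared past-K reorder staging buffer allocated when the
// subgraph contains DecoderMaskedSelfAttention (shape as in the BeamSearchState hunk above).
#include <cstddef>
#include <cstdint>

inline size_t PastKStagingBytes(int64_t batch_beam_size, int64_t num_heads,
                                int64_t max_length, int64_t head_size,
                                size_t element_size) {  // e.g. 2 for fp16, 4 for fp32
  return static_cast<size_t>(batch_beam_size * num_heads * max_length * head_size) * element_size;
}

// Example: batch 1 with 4 beams, 32 heads, max_length 1024, head_size 128, fp16
// -> 4 * 32 * 1024 * 128 * 2 bytes = 32 MiB for the single shared staging buffer.
```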
Status UpdateFeeds( @@ -83,7 +83,7 @@ class BeamSearchGpt : public BeamSearchBase { gsl::span beam_indices_gpu, int past_sequence_length, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention); const SessionState* init_run_decoder_session_state_ = nullptr; GptSubgraph* init_run_gpt_subgraph_ = nullptr; @@ -105,7 +105,7 @@ Status BeamSearchGpt::CreateInitialFeeds(gsl::span& sequence_lengths OrtValue& expanded_input_ids, std::vector& feeds, IAllocatorUniquePtr& buffer, - bool add_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { + bool add_beam_search_specific_inputs_for_decoder_masked_self_attention) { const OrtValue* input_ids_value = this->context_.GetInputOrtValue(0); const Tensor& input_ids = input_ids_value->Get(); const OrtValue* attn_mask_value = this->context_.GetInputOrtValue(9); @@ -124,7 +124,7 @@ Status BeamSearchGpt::CreateInitialFeeds(gsl::span& sequence_lengths buffer, this->ort_stream_, this->parameters_->max_length, - add_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + add_beam_search_specific_inputs_for_decoder_masked_self_attention); } return gpt_subgraph_.CreateInitialFeeds(input_ids, @@ -140,7 +140,7 @@ Status BeamSearchGpt::CreateInitialFeeds(gsl::span& sequence_lengths buffer, this->ort_stream_, this->parameters_->max_length, - add_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + add_beam_search_specific_inputs_for_decoder_masked_self_attention); } template @@ -155,7 +155,7 @@ Status BeamSearchGpt::UpdateFeeds( gsl::span beam_indices_gpu, int past_sequence_length, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention) { return update_feeds_func_(this->temp_space_allocator_, this->ort_stream_, last_outputs, @@ -172,7 +172,7 @@ Status BeamSearchGpt::UpdateFeeds( gpt_subgraph_.past_present_share_buffer_, past_sequence_length, input_sequence_len, - has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + has_beam_search_specific_inputs_for_decoder_masked_self_attention); } template @@ -227,7 +227,7 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager* init_run_feeds_fetch IAllocatorUniquePtr buffer; OrtValue expanded_input_ids_in_cpu; ORT_RETURN_IF_ERROR(CreateInitialFeeds(cpu_state.sequence_lengths, expanded_input_ids_in_cpu, feeds, buffer, - gpt_subgraph_.has_decoder_masked_multihead_attention_)); + gpt_subgraph_.has_decoder_masked_self_attention_)); if (gpt_subgraph_.past_present_share_buffer_) { // Reuse past and present fetches.reserve(static_cast(gpt_subgraph_.GetFirstPresentOutputIndex()) + gpt_subgraph_.num_layers); @@ -253,7 +253,7 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager* init_run_feeds_fetch parameters->max_length, parameters->num_heads, parameters->head_size, - gpt_subgraph_.has_decoder_masked_multihead_attention_, + gpt_subgraph_.has_decoder_masked_self_attention_, parameters->output_scores, use_position); @@ -350,8 +350,8 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager* init_run_feeds_fetch ++current_length; // Reorder past state after first run if the GPT subgraph (the one used after the first iteration) - // contains DecoderMaskedMultiheadAttention nodes - if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_multihead_attention_) { + // contains DecoderMaskedSelfAttention nodes + if 
(iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_self_attention_) { size_t offset = static_cast(gpt_subgraph_.GetFirstPresentOutputIndex()); // We will use the same staging buffer while transposing all the layers' past state // and this is okay because we use the same stream to do the staging copy and the transpose @@ -376,12 +376,12 @@ Status BeamSearchGpt::Execute(const FeedsFetchesManager* init_run_feeds_fetch position_ids, increase_position, ReinterpretAsSpan(beam_next_tokens), ReinterpretAsSpan(beam_indices), - gpt_subgraph_.has_decoder_masked_multihead_attention_ + gpt_subgraph_.has_decoder_masked_self_attention_ ? ReinterpretAsSpan(beam_state.chosen_indices) : place_holder, current_length - 1, parameters->sequence_length, - gpt_subgraph_.has_decoder_masked_multihead_attention_)); + gpt_subgraph_.has_decoder_masked_self_attention_)); } if (gpt_subgraph_.past_present_share_buffer_) { diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc index a9cde364a3039..e63f4b377726f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc @@ -582,13 +582,13 @@ Status UpdateGptFeeds( bool past_present_share_buffer, int past_sequence_len, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention) { // last_outputs: logits, present_0, present_1, ... // next_inputs: input_ids, position_id, attention_mask, past_0, past_1 ORT_UNUSED_PARAMETER(stream); ORT_UNUSED_PARAMETER(beam_indices_gpu); ORT_UNUSED_PARAMETER(input_sequence_len); - ORT_UNUSED_PARAMETER(has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + ORT_UNUSED_PARAMETER(has_beam_search_specific_inputs_for_decoder_masked_self_attention); // The following updates inputs for subgraph @@ -904,7 +904,7 @@ template Status UpdateGptFeeds( bool past_present_share_buffer, int past_sequence_len, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention); template Status UpdateDecoderFeeds( AllocatorPtr allocator, diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h index 6a9b2e93ec425..3ad7be76a1800 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h @@ -136,7 +136,7 @@ using UpdateGptFeedsFunc = std::function; + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention)>; // Create encoder inputs (for encoder-decoder model like T5). 
using CreateEncoderInputsFunc = std::function { int max_length, int num_heads, int head_size, - bool has_decoder_masked_multihead_attention, + bool has_decoder_masked_self_attention, bool is_cuda) { // below buffers are on cpu this->sequences_space = AllocateBuffer(cpu_allocator, @@ -112,8 +112,8 @@ struct GreedySearchState : public IGreedySearchState { this->topk_tokens_buffer); // If at all we need to, we only need to re-order past state for CUDA as - //`DecoderMaskedMultiheadAttention` is only supported on CUDA - if (has_decoder_masked_multihead_attention) { + //`DecoderMaskedSelfAttention` is only supported on CUDA + if (has_decoder_masked_self_attention) { TensorShape staging_for_past_state_reorder_buffer_shape = {batch_size, num_heads, max_length, head_size}; Tensor temp(DataTypeImpl::GetType(), staging_for_past_state_reorder_buffer_shape, allocator); diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h index d1afcbc6c6865..4f5271ff4672f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h @@ -65,7 +65,7 @@ class GreedySearchGpt : public GreedySearchBase { update_feeds_func_(update_feeds_func), cuda_device_prop_(cuda_device_prop), cuda_device_arch_(cuda_device_arch) { - if (gpt_subgraph_.has_decoder_masked_multihead_attention_) { + if (gpt_subgraph_.has_decoder_masked_self_attention_) { ORT_ENFORCE(cuda_device_arch_ >= 530, "Decoder masked multihead attention can only be used on " "GPU cards of compute capability 5.3 or higher. " @@ -203,7 +203,7 @@ Status GreedySearchGpt::Execute(const FeedsFetchesManager* init_ static_cast(parameters->max_length), static_cast(parameters->num_heads), static_cast(parameters->head_size), - gpt_subgraph_.has_decoder_masked_multihead_attention_, + gpt_subgraph_.has_decoder_masked_self_attention_, this->IsCuda()); SamplingState sampling_state; @@ -330,8 +330,8 @@ Status GreedySearchGpt::Execute(const FeedsFetchesManager* init_ ++current_length; // Reorder past state after first run if the GPT subgraph (the one used after the first iteration) - // contains DecoderMaskedMultiheadAttention nodes - if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_multihead_attention_) { + // contains DecoderMaskedSelfAttention nodes + if (iteration_counter == 1 && gpt_subgraph_.has_decoder_masked_self_attention_) { size_t offset = static_cast(gpt_subgraph_.GetFirstPresentOutputIndex()); // We will use the same staging buffer while transposing all the layers' past state // and this is okay because we use the same stream to do the staging copy and the transpose diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc index 012102b45c5ec..0477e82e577a6 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc @@ -27,7 +27,7 @@ Subgraph::Subgraph( vocab_size(0), num_layers(0), past_present_share_buffer_(false), - has_decoder_masked_multihead_attention_(false), + has_decoder_masked_self_attention_(false), allocator_(nullptr), is_output_float16_(false) { num_implicit_inputs = static_cast(node.ImplicitInputDefs().size()); @@ -52,8 +52,8 @@ Subgraph::Subgraph( } for (const auto& n : subgraph.Nodes()) { - if (n.OpType() == "DecoderMaskedMultiheadAttention") { - has_decoder_masked_multihead_attention_ = true; + if (n.OpType() == 
"DecoderMaskedSelfAttention") { + has_decoder_masked_self_attention_ = true; break; } } diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h index d8bcd77f49b42..36c2821cd1e87 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h @@ -44,7 +44,7 @@ class Subgraph { int vocab_size; int num_layers; bool past_present_share_buffer_; - bool has_decoder_masked_multihead_attention_; + bool has_decoder_masked_self_attention_; // Setup execution Status Setup(const SessionState& session_state, diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc index c12301803a04e..ead7a1e0d91f6 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc @@ -28,7 +28,7 @@ Status GptSubgraph::CreateInitialFeeds( IAllocatorUniquePtr& buffer, Stream* ort_stream, int past_present_share_buffer_max_seq_len, - bool add_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { + bool add_beam_search_specific_inputs_for_decoder_masked_self_attention) { ORT_ENFORCE(session_state_ != nullptr, "Setup must be called before CreateInitialFeeds"); const IExecutionProvider* provider = GetProvider(); @@ -89,11 +89,11 @@ Status GptSubgraph::CreateInitialFeeds( TensorShape past_shape(&past_state_dims[0], 5); // The remaining inputs are past state except the last one or three (see below for details) - // If `add_beam_search_specific_inputs_for_decoder_masked_multihead_attention` is false, then the last input is `past_sequence_length` + // If `add_beam_search_specific_inputs_for_decoder_masked_self_attention` is false, then the last input is `past_sequence_length` - // If `add_beam_search_specific_inputs_for_decoder_masked_multihead_attention` is true, then the last inputs are `past_sequence_length`, + // If `add_beam_search_specific_inputs_for_decoder_masked_self_attention` is true, then the last inputs are `past_sequence_length`, // `beam_width`, and `cache_indirection` - auto past_end_iter = add_beam_search_specific_inputs_for_decoder_masked_multihead_attention ? num_subgraph_inputs - 3 : num_subgraph_inputs - 1; + auto past_end_iter = add_beam_search_specific_inputs_for_decoder_masked_self_attention ? 
num_subgraph_inputs - 3 : num_subgraph_inputs - 1; for (int i = first_past_input_index_; i < past_end_iter; ++i) { OrtValue past_tensor; Tensor::InitOrtValue(past_type, past_shape, default_allocator, past_tensor); @@ -109,7 +109,7 @@ Status GptSubgraph::CreateInitialFeeds( *past_seq_len_tensor_value.GetMutable()->MutableData() = 0; // Add beam search specific inputs - if (add_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { + if (add_beam_search_specific_inputs_for_decoder_masked_self_attention) { // Beam width feed int64_t num_beams_dims[] = {1}; TensorShape num_beams_shape(&num_beams_dims[0], 1); @@ -146,7 +146,7 @@ Status GptSubgraph::Validate(const std::vector& subgraph_inputs, (num_subgraph_inputs == num_subgraph_outputs + 5)), "Invalid GPT-2 subgraph: number of inputs shall be number of outputs plus 2 or " "3 (if past_present_share_buffer) or " - "5 (if past_present_share_buffer and use_decoder_masked_multihead_attention for BeamSearch)"); + "5 (if past_present_share_buffer and use_decoder_masked_self_attention for BeamSearch)"); ORT_RETURN_IF(subgraph_inputs[0]->Name() != "input_ids", "subgraph input 0 shall be named as input_ids, got: ", subgraph_inputs[0]->Name()); diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.h index c24ac3a5d1e50..4867083de70e7 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.h @@ -35,7 +35,7 @@ class GptSubgraph : public Subgraph { IAllocatorUniquePtr& buffer, Stream* ort_stream, int past_present_share_buffer_max_seq_len = -1, - bool add_beam_search_specific_inputs_for_decoder_masked_multihead_attention = false); + bool add_beam_search_specific_inputs_for_decoder_masked_self_attention = false); Status Validate(const std::vector& subgraph_inputs, const std::vector& subgraph_outputs) override; diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index 8ec0d7889037d..7a14267176ebe 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -131,8 +131,8 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Quan class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, DequantizeWithOrder); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, QOrderedAttention); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, QOrderedLongformerAttention); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DecoderMaskedMultiheadAttention); -class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedMultiheadAttention); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DecoderMaskedSelfAttention); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DecoderMaskedSelfAttention); #ifdef ENABLE_ATEN class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kPytorchAtenDomain, 1, ATen); @@ -277,8 +277,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN BuildKernelCreateInfo, diff --git 
a/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc similarity index 92% rename from onnxruntime/contrib_ops/cuda/decoder/decoder_masked_multihead_attention.cc rename to onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc index 49000855f242b..e56dfeeec377d 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc @@ -4,7 +4,7 @@ #include "core/providers/cuda/cuda_common.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" #include "core/platform/env_var_utils.h" -#include "contrib_ops/cuda/decoder/decoder_masked_multihead_attention.h" +#include "contrib_ops/cuda/decoder/decoder_masked_self_attention.h" #include "contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h" using namespace onnxruntime::cuda; @@ -23,7 +23,7 @@ static constexpr int kPresentOutputIndex = 1; #define REGISTER_KERNEL_TYPED(T1, T2) \ ONNX_OPERATOR_TYPED_KERNEL_EX( \ - DecoderMaskedMultiheadAttention, \ + DecoderMaskedSelfAttention, \ kMSDomain, \ 1, \ T1, \ @@ -33,13 +33,13 @@ static constexpr int kPresentOutputIndex = 1; .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .InputMemoryType(OrtMemTypeCPUInput, kPastSequenceLengthInputIndex) \ .InputMemoryType(OrtMemTypeCPUInput, kBeamWidthInputIndex), \ - DecoderMaskedMultiheadAttention); + DecoderMaskedSelfAttention); REGISTER_KERNEL_TYPED(float, float) REGISTER_KERNEL_TYPED(MLFloat16, uint16_t) template -Status DecoderMaskedMultiheadAttention::ComputeInternal(OpKernelContext* context) const { +Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* context) const { const Tensor* input = context->Input(0); const Tensor* weights = context->Input(1); const Tensor* bias = context->Input(2); @@ -51,7 +51,7 @@ Status DecoderMaskedMultiheadAttention::ComputeInternal(OpKernelContext* const Tensor* cache_indir = context->Input(kCacheIndirectionInputIndex); auto& device_prop = GetDeviceProp(); - DecoderMaskedMultiheadAttentionParams parameters; + DecoderMaskedSelfAttentionParams parameters; ORT_RETURN_IF_ERROR(CheckInputs(input->Shape(), weights->Shape(), bias->Shape(), @@ -70,29 +70,29 @@ Status DecoderMaskedMultiheadAttention::ComputeInternal(OpKernelContext* // This kernel is for decoding only (i.e.) sequence length has to be 1 if (sequence_length != 1) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input sequence length should be 1 to use DecoderMaskedMultiheadAttention"); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input sequence length should be 1 to use DecoderMaskedSelfAttention"); } // TODO(hasesh): In future, we may support CrossAttention. Currently, this kernel only supports SelfAttention. 
if (parameters.sequence_length != parameters.kv_sequence_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "DecoderMaskedMultiheadAttention only supports self attention currently"); + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "DecoderMaskedSelfAttention only supports self attention currently"); } // TODO(hasesh): If there is a need, we will support this later if (parameters.head_size != parameters.v_head_size) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "QK head size should be same as V head size to use DecoderMaskedMultiheadAttention"); + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "QK head size should be same as V head size to use DecoderMaskedSelfAttention"); } // TODO(hasesh): If there is a need, we will support this later if (relative_position_bias != nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "DecoderMaskedMultiheadAttention does not support relative position bias currently"); + return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "DecoderMaskedSelfAttention does not support relative position bias currently"); } // TODO(hasesh): Support more mask types. Currently, it only supports the HuggingFace GreedySearch/BeamSearch pattern. if (parameters.mask_type != AttentionMaskType::MASK_2D_KEY_PADDING && parameters.mask_type != AttentionMaskType::MASK_NONE) { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "DecoderMaskedMultiheadAttention only supports no mask or 2D key " + "DecoderMaskedSelfAttention only supports no mask or 2D key " "padding mask of shape [batch, total_seq_length] currently"); } @@ -199,7 +199,7 @@ Status DecoderMaskedMultiheadAttention::ComputeInternal(OpKernelContext* default: return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "Unsupported head size in DecoderMaskedMultiheadAttention. " + "Unsupported head size in DecoderMaskedSelfAttention. 
" "Got head size: ", parameters.head_size); } diff --git a/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_multihead_attention.h b/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.h similarity index 71% rename from onnxruntime/contrib_ops/cuda/decoder/decoder_masked_multihead_attention.h rename to onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.h index 2f00728a59ce2..1a009473e4287 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_multihead_attention.h +++ b/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.h @@ -13,9 +13,9 @@ namespace cuda { using namespace onnxruntime::cuda; template -class DecoderMaskedMultiheadAttention final : public CudaKernel, public AttentionBase { +class DecoderMaskedSelfAttention final : public CudaKernel, public AttentionBase { public: - DecoderMaskedMultiheadAttention(const OpKernelInfo& info) : CudaKernel(info), AttentionBase(info, true) {} + DecoderMaskedSelfAttention(const OpKernelInfo& info) : CudaKernel(info), AttentionBase(info, true) {} Status ComputeInternal(OpKernelContext* context) const override; }; diff --git a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_128.cu b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_128.cu index 505f7de7a915f..4cf00e222b0ed 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_128.cu +++ b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_128.cu @@ -1,6 +1,6 @@ /* * The implementation of this file is based on code provided by https://github.com/NVIDIA/FasterTransformer - * + * * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,7 +26,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -using namespace decoder_masked_multihead_attention_details; +using namespace decoder_masked_self_attention_details; #define MMHA_LAUNCH_KERNEL( \ T, head_size, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK) \ @@ -40,7 +40,7 @@ using namespace decoder_masked_multihead_attention_details; <<>>(params) template -void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream) { +void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream) { constexpr int THREADS_PER_VALUE = ThreadsPerValue::value; int total_sequence_length = params.total_sequence_length; @@ -54,9 +54,9 @@ void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cud } // Instantiate templates -template void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream); +template void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream); -template void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream); +template void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_64.cu b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_64.cu index 7d1703f7a332a..325681b0e1deb 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_64.cu +++ b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_64.cu @@ -1,6 +1,6 @@ /* * The implementation of this file is based on code provided by https://github.com/NVIDIA/FasterTransformer - * + * * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
* * Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,7 +26,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -using namespace decoder_masked_multihead_attention_details; +using namespace decoder_masked_self_attention_details; #define MMHA_LAUNCH_KERNEL( \ T, head_size, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK) \ @@ -40,7 +40,7 @@ using namespace decoder_masked_multihead_attention_details; <<>>(params) template -void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream) { +void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream) { constexpr int THREADS_PER_VALUE = ThreadsPerValue::value; int total_sequence_length = params.total_sequence_length; @@ -54,9 +54,9 @@ void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cud } // Instantiate templates -template void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream); +template void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream); -template void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream); +template void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu index 27a2b6de83d95..26bc6f53b4c22 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu +++ b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu @@ -1,6 +1,6 @@ /* * The implementation of this file is based on code provided by https://github.com/NVIDIA/FasterTransformer - * + * * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,7 +33,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -using namespace decoder_masked_multihead_attention_details; +using namespace decoder_masked_self_attention_details; template < // The type of the inputs. Supported types: float and half. @@ -46,7 +46,7 @@ template < int THREADS_PER_VALUE, // The number of threads in a threadblock. 
int THREADS_PER_BLOCK> -__global__ void masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params) { +__global__ void masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params) { // This kernel contains some code that cannot be compiled on CUDA ARCH 5.3 or lower #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 (void)(params); @@ -498,32 +498,32 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentio // Template instantiation(s) // fp32 + head size = 64 -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); // fp16 + head size = 64 -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); // fp32 + head size = 128 -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); // fp16 + head size = 128 -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); -template void __global__ masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +template void __global__ masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); } // namespace cuda } // namespace contrib diff --git a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h index be99918832aa7..fe1e0cb702525 100644 --- 
a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h +++ b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h @@ -10,7 +10,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -struct DecoderMaskedMultiheadAttentionParams : AttentionParameters { +struct DecoderMaskedSelfAttentionParams : AttentionParameters { int beam_width = 1; void* q = nullptr; @@ -43,10 +43,10 @@ template< int THREADS_PER_VALUE, // The number of threads in a threadblock. int THREADS_PER_BLOCK> -__global__ void masked_multihead_attention_kernel(DecoderMaskedMultiheadAttentionParams params); +__global__ void masked_multihead_attention_kernel(DecoderMaskedSelfAttentionParams params); template -void mmha_launch_kernel(const DecoderMaskedMultiheadAttentionParams& params, cudaStream_t stream); +void mmha_launch_kernel(const DecoderMaskedSelfAttentionParams& params, cudaStream_t stream); diff --git a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl_utils.h b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl_utils.h index 2b358da6a2eb7..5c8da6da7dd28 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl_utils.h +++ b/onnxruntime/contrib_ops/cuda/decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl_utils.h @@ -1,6 +1,6 @@ /* * The implementation of this file is based on code provided by https://github.com/NVIDIA/FasterTransformer - * + * * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,7 +30,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { -namespace decoder_masked_multihead_attention_details { +namespace decoder_masked_self_attention_details { //------------------------------------------------------------ // Qk_vec @@ -741,7 +741,7 @@ inline __device__ void ConvertFromFloat(uint4& dst, Float8_ src) { //------------------------------------------------------------ template -inline size_t CalcDynamicBlockMemory(const DecoderMaskedMultiheadAttentionParams& params, +inline size_t CalcDynamicBlockMemory(const DecoderMaskedSelfAttentionParams& params, int threads_per_value, int threads_per_block) { // The amount of shared memory needed to store the Q*K^T values in float. 
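As background for the `CalcDynamicBlockMemory` hunks in this file: the helper returns the larger of two per-block buffers (the float buffer holding the Q*K^T values for the softmax, and the buffer used for the final reduction), and that byte count is what the launchers pass as the dynamic shared-memory size of the masked-attention kernel. The following is a minimal, self-contained CUDA sketch of that launch pattern only; `ToyAttentionParams`, `toy_calc_dynamic_block_memory`, the size formulas and the kernel body are simplified placeholders invented for illustration, not the actual FasterTransformer-derived implementation.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// Placeholder parameter struct; the real kernels take DecoderMaskedSelfAttentionParams.
struct ToyAttentionParams {
  int total_sequence_length;
  int threads_per_block;
};

// Toy kernel: consumes the dynamically sized shared memory supplied at launch time.
__global__ void toy_masked_attention_kernel(ToyAttentionParams params) {
  extern __shared__ float smem[];  // sized by the 3rd <<<...>>> launch argument
  if (threadIdx.x < params.total_sequence_length) {
    smem[threadIdx.x] = static_cast<float>(threadIdx.x);  // stand-in for staging Q*K^T values
  }
  __syncthreads();
}

// Simplified stand-in for CalcDynamicBlockMemory: return the larger of the softmax
// staging buffer and the reduction buffer. These formulas are illustrative only and
// do not match the real implementation.
size_t toy_calc_dynamic_block_memory(const ToyAttentionParams& params, int threads_per_value) {
  size_t softmax_sz = sizeof(float) * static_cast<size_t>(params.total_sequence_length);
  size_t red_sz = sizeof(float) * static_cast<size_t>(params.threads_per_block / threads_per_value);
  return std::max(softmax_sz, red_sz);
}

int main() {
  ToyAttentionParams params{/*total_sequence_length=*/128, /*threads_per_block=*/256};
  constexpr int threads_per_value = 4;  // plays the role of THDS_PER_VALUE
  size_t smem_sz = toy_calc_dynamic_block_memory(params, threads_per_value);
  // The dynamic shared-memory size is the third launch-configuration argument.
  toy_masked_attention_kernel<<<1, params.threads_per_block, smem_sz>>>(params);
  cudaError_t err = cudaDeviceSynchronize();
  printf("launch status: %s, dynamic smem: %zu bytes\n", cudaGetErrorString(err), smem_sz);
  return 0;
}

Building the sketch with `nvcc` and running it prints the computed size; the real launchers derive the equivalent sizes from `DecoderMaskedSelfAttentionParams` and the `THDS_PER_*` template parameters before launching `masked_multihead_attention_kernel`.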
@@ -768,7 +768,7 @@ inline size_t CalcDynamicBlockMemory(const DecoderMaskedMultiheadAttentionParams return std::max(softmax_sz, red_sz); } -} // namespace decoder_masked_multihead_attention_details +} // namespace decoder_masked_self_attention_details } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu index b354b1c9a61a8..d5e3330d01127 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.cu @@ -746,7 +746,7 @@ void TorchMultinomialKernelLauncher(float* d_input, d_presence_mask); } -__global__ void UpdateDecoderMaskedMultiheadAttentionCacheIndirectionKernel(int32_t* tgt_indir_cache, +__global__ void UpdateDecoderMaskedSelfAttentionCacheIndirectionKernel(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, @@ -785,7 +785,7 @@ __global__ void UpdateDecoderMaskedMultiheadAttentionCacheIndirectionKernel(int3 } } -void UpdateDecoderMaskedMultiheadAttentionCacheIndirection(int32_t* tgt_indir_cache, +void UpdateDecoderMaskedSelfAttentionCacheIndirection(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, @@ -796,7 +796,7 @@ void UpdateDecoderMaskedMultiheadAttentionCacheIndirection(int32_t* tgt_indir_ca cudaStream_t stream) { const dim3 block(32); const dim3 grid((current_length + block.x - 1) / block.x, batch_size * beam_width); - UpdateDecoderMaskedMultiheadAttentionCacheIndirectionKernel<<>>(tgt_indir_cache, + UpdateDecoderMaskedSelfAttentionCacheIndirectionKernel<<>>(tgt_indir_cache, src_indir_cache, beam_ids, batch_size, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h index 85b97df7a0fe9..2e97bacc32d78 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_cuda_impl.h @@ -109,7 +109,7 @@ void TorchMultinomialKernelLauncher(float* d_input, int* d_presence_mask, cudaStream_t stream); -void UpdateDecoderMaskedMultiheadAttentionCacheIndirection(int32_t* tgt_indir_cache, +void UpdateDecoderMaskedSelfAttentionCacheIndirection(int32_t* tgt_indir_cache, const int32_t* src_indir_cache, const int32_t* beam_ids, int batch_size, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index eed24b4a1f03d..67f370665f90d 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -570,7 +570,7 @@ Status ProcessLogits(const OrtValue& logits, // if (!beam_state_chosen_indices.empty()) { // If we have allocated `chosen_indices` in beam_state, it means that we // will be needing the chosen indices from BeamScorer as we are using - // DecoderMaskedMultiheadAttention, so copy it over. + // DecoderMaskedSelfAttention, so copy it over. 
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(beam_state_chosen_indices.data(), chosen_indices.data(), chosen_indices.size_bytes(), @@ -899,7 +899,7 @@ Status UpdateGptFeeds( bool past_present_share_buffer, int past_sequence_len, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention) { #ifdef ENABLE_NVTX_PROFILE profile::NvtxNestedRangeCreator updateFeedsRange("UpdateGptFeeds", profile::Color::Yellow); updateFeedsRange.Begin(); @@ -945,12 +945,12 @@ Status UpdateGptFeeds( const int past_sequence_length_idx = (static_cast(last_outputs.size()) - gpt_subgraph_first_present_output_idx) + gpt_subgraph_first_past_input_idx; *(next_inputs[past_sequence_length_idx].GetMutable()->MutableData()) = past_sequence_len; - // Update beam search specific input for DecoderMaskedMultiheadAttention (cache indirection) if present + // Update beam search specific input for DecoderMaskedSelfAttention (cache indirection) if present // If the last input is not `past_sequence_length`, then the beam search specific inputs - // for `DecoderMaskedMultiheadAttention` is present - if (has_beam_search_specific_inputs_for_decoder_masked_multihead_attention) { - ORT_ENFORCE(!beam_indices_gpu.empty(), "Beam indices must be present on CUDA while using DecoderMaskedMultiheadAttention with BeamSearch"); + // for `DecoderMaskedSelfAttention` is present + if (has_beam_search_specific_inputs_for_decoder_masked_self_attention) { + ORT_ENFORCE(!beam_indices_gpu.empty(), "Beam indices must be present on CUDA while using DecoderMaskedSelfAttention with BeamSearch"); // The cache indirection feed comes 2 feeds after the `past_sequence_length` feed const OrtValue& old_cache_indirection = next_inputs[past_sequence_length_idx + 2]; @@ -964,7 +964,7 @@ Status UpdateGptFeeds( int max_sequence_length = static_cast(last_outputs[gpt_subgraph_first_present_output_idx].Get().Shape()[3]); // Launch kernel to update the cache indirection buffer - cuda::UpdateDecoderMaskedMultiheadAttentionCacheIndirection(cache_indirection.GetMutable()->MutableData(), + cuda::UpdateDecoderMaskedSelfAttentionCacheIndirection(cache_indirection.GetMutable()->MutableData(), old_cache_indirection.Get().Data(), reinterpret_cast(beam_indices_gpu.data()), batch_beam_size / num_beams, @@ -1182,7 +1182,7 @@ template Status UpdateGptFeeds( bool past_present_share_buffer, int past_sequence_len, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention); // Float16 template void InitBeamState( @@ -1242,7 +1242,7 @@ template Status UpdateGptFeeds( bool past_present_share_buffer, int past_sequence_len, int input_sequence_len, - bool has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention); template Status UpdateDecoderFeeds( AllocatorPtr allocator, diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h index 42a7a32959528..2f833ece3c243 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h @@ -105,7 +105,7 @@ Status UpdateGptFeeds( bool past_present_share_buffer, int past_sequence_len, int input_sequence_len, - bool 
has_beam_search_specific_inputs_for_decoder_masked_multihead_attention); + bool has_beam_search_specific_inputs_for_decoder_masked_self_attention); // --------------------------------------------------------------- // Functions for encoder-decoder model like T5 diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc index 4cc07d876413e..e4e0f53886aa0 100644 --- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc @@ -218,11 +218,11 @@ For self attention, kv_sequence_length equals to sequence_length (sequence lengt For cross attention, query and key might have different lengths. )DOC"; -// Currently, the `convert_generation.py` script renames the `Attention` nodes to `DecoderMaskedMultiheadAttention` -// if the user requests it. Hence, the schemas of `DecoderMaskedMultiheadAttention` and `Attention` schemas -// are tightly coupled. A change in Attention also needs corresponding schema updates in `DecoderMaskedMultiheadAttention` +// Currently, the `convert_generation.py` script renames the `Attention` nodes to `DecoderMaskedSelfAttention` +// if the user requests it. Hence, the schemas of `DecoderMaskedSelfAttention` and `Attention` schemas +// are tightly coupled. A change in Attention also needs corresponding schema updates in `DecoderMaskedSelfAttention` // and its kernel. -// TODO(hasesh): Decouple the schema of `DecoderMaskedMultiheadAttention` from the schema of the `Attention` operator +// TODO(hasesh): Decouple the schema of `DecoderMaskedSelfAttention` from the schema of the `Attention` operator // by making appropriate tool changes. ONNX_MS_OPERATOR_SET_SCHEMA( @@ -442,8 +442,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA( PackedAttentionTypeAndShapeInference(ctx); })); -constexpr const char* DecoderMaskedMultiheadAttention_ver1_doc = R"DOC( -Uni-directional attention that supports input sequence length of 1. +constexpr const char* DecoderMaskedSelfAttention_ver1_doc = R"DOC( +Self attention that supports input sequence length of 1. The weights for input projection of Q, K and V are merged. The data is stacked on the second dimension. Its shape is (input_hidden_size, hidden_size + hidden_size + v_hidden_size). 
Here hidden_size is the hidden dimension of Q and K, @@ -460,9 +460,9 @@ Currently, only self attention is supported which means that kv_sequence_length )DOC"; ONNX_MS_OPERATOR_SET_SCHEMA( - DecoderMaskedMultiheadAttention, 1, + DecoderMaskedSelfAttention, 1, OpSchema() - .SetDoc(DecoderMaskedMultiheadAttention_ver1_doc) + .SetDoc(DecoderMaskedSelfAttention_ver1_doc) .Attr("num_heads", "Number of attention heads", AttributeProto::INT) .Attr("past_present_share_buffer", "Corresponding past and present are same tensor, its size is " diff --git a/onnxruntime/core/graph/contrib_ops/ms_opset.h b/onnxruntime/core/graph/contrib_ops/ms_opset.h index 9b1dd82a00985..3066804577f9a 100644 --- a/onnxruntime/core/graph/contrib_ops/ms_opset.h +++ b/onnxruntime/core/graph/contrib_ops/ms_opset.h @@ -99,7 +99,7 @@ class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Trilu); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, Unique); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, WordConvEmbedding); class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, GemmFastGelu); -class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DecoderMaskedMultiheadAttention); +class ONNX_OPERATOR_SET_SCHEMA_CLASS_NAME(Microsoft, 1, DecoderMaskedSelfAttention); class OpSet_Microsoft_ver1 { public: @@ -193,7 +193,7 @@ class OpSet_Microsoft_ver1 { fn(GetOpSchema()); fn(GetOpSchema()); fn(GetOpSchema()); - fn(GetOpSchema()); + fn(GetOpSchema()); } }; } // namespace contrib diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index 22690dc18efdc..acbee9c429154 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -250,13 +250,13 @@ def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace: model_group.set_defaults(past_present_share_buffer=False) model_group.add_argument( - "--use_decoder_masked_multihead_attention", + "--use_decoder_masked_self_attention", required=False, action="store_true", - help="Uses `DecoderMaskedMultiheadAttention` to optimize the unidirectional decoding Attention computation. " + help="Uses `DecoderMaskedSelfAttention` to optimize the unidirectional decoding Attention computation. " "Must be used with `past_present_share_buffer`. Currently, only Attention head sizes of 64 and 128 are supported.", ) - model_group.set_defaults(use_decoder_masked_multihead_attention=False) + model_group.set_defaults(use_decoder_masked_self_attention=False) model_group.add_argument( "--prefix_vocab_mask", @@ -1074,15 +1074,15 @@ def update_decoder_subgraph_past_present_share_buffer(subg: GraphProto): return subg -def update_decoder_subgraph_use_decoder_masked_multihead_attention( +def update_decoder_subgraph_use_decoder_masked_self_attention( subg: GraphProto, is_beam_search: bool, switch_attention: bool ) -> bool: - """Update the Attention nodes to DecoderMaskedMultiheadAttention. + """Update the Attention nodes to DecoderMaskedSelfAttention. 
Args: subg (GraphProto): GraphProto of the decoder subgraph is_beam_search (bool): Boolean specifying if the sampling algo is BeamSearch - switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedMultiheadAttention` + switch_attention (bool): Boolean specifying if `Attention` is to be switched with `DecoderMaskedSelfAttention` """ if is_beam_search: new_inputs = [] @@ -1127,7 +1127,7 @@ def update_decoder_subgraph_use_decoder_masked_multihead_attention( # decoding attention kernels are unidirectional by definition. if k != "unidirectional": logger.warning( - f"Removing attribute: {k} from Attention node while switching to DecoderMaskedMultiheadAttention" + f"Removing attribute: {k} from Attention node while switching to DecoderMaskedSelfAttention" ) del kwargs[k] @@ -1144,9 +1144,7 @@ def update_decoder_subgraph_use_decoder_masked_multihead_attention( if len(nis) < 9: nis.extend(["cache_indirection"]) - node = onnx.helper.make_node( - "DecoderMaskedMultiheadAttention", nis, node.output, name=node.name, **kwargs - ) + node = onnx.helper.make_node("DecoderMaskedSelfAttention", nis, node.output, name=node.name, **kwargs) new_nodes.extend([node]) subg.ClearField("node") subg.node.extend(new_nodes) @@ -1511,27 +1509,25 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati raise NotImplementedError("output_token_scores currently is not supported in greedy search/sampling") # For BeamSearch, sharing buffers for past and present states is only supported - # when using `use_decoder_masked_multihead_attention` - if past_present_share_buffer and is_beamsearch and not args.use_decoder_masked_multihead_attention: + # when using `use_decoder_masked_self_attention` + if past_present_share_buffer and is_beamsearch and not args.use_decoder_masked_self_attention: raise ValueError( - "`use_decoder_masked_multihead_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearch" + "`use_decoder_masked_self_attention` MUST be turned on to use `past_present_share_buffer` in case of BeamSearch" ) # For any kind of sampling, using decoder masked multihead attention is only supported # when using `past_present_share_buffer` - if args.use_decoder_masked_multihead_attention and not past_present_share_buffer: - raise ValueError( - "`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_multihead_attention`" - ) + if args.use_decoder_masked_self_attention and not past_present_share_buffer: + raise ValueError("`past_present_share_buffer` MUST be turned on to use `use_decoder_masked_self_attention`") # For any kind of sampling, using decoder masked multihead attention is only supported # on GPUs - if args.use_decoder_masked_multihead_attention and not args.use_gpu: - raise ValueError("`use_decoder_masked_multihead_attention` option is only supported on GPUs") + if args.use_decoder_masked_self_attention and not args.use_gpu: + raise ValueError("`use_decoder_masked_self_attention` option is only supported on GPUs") # Using decoder masked multihead attention is only supported for GPT2 - if args.use_decoder_masked_multihead_attention and args.model_type in ["t5", "mt5"]: - raise ValueError("`use_decoder_masked_multihead_attention` option is only supported for GPT2") + if args.use_decoder_masked_self_attention and args.model_type in ["t5", "mt5"]: + raise ValueError("`use_decoder_masked_self_attention` option is only supported for GPT2") if is_gpt2: if args.decoder_onnx and os.path.exists(args.decoder_onnx): @@ 
-1821,17 +1817,17 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati logger.info("*****update init decoder subgraph to make past and present share buffer******************") update_decoder_subgraph_past_present_share_buffer(gpt2_init_decoder_model.graph) - # Update init decoder subgraph in preparation to use DecoderMaskedMultiheadAttention - # NOTE: Even if we will not use DecoderMaskedMultiheadAttention in the init decoder subgraph + # Update init decoder subgraph in preparation to use DecoderMaskedSelfAttention + # NOTE: Even if we will not use DecoderMaskedSelfAttention in the init decoder subgraph # it makes the runtime changes cleaner if we keep both the init decoder and decoder subgraphs # same in terms of the subgraph inputs. if ( - args.use_decoder_masked_multihead_attention - and not update_decoder_subgraph_use_decoder_masked_multihead_attention( + args.use_decoder_masked_self_attention + and not update_decoder_subgraph_use_decoder_masked_self_attention( gpt2_init_decoder_model.graph, is_beamsearch, False ) ): - raise ValueError("Could not update the init decoder subgraph to use DecoderMaskedMultiheadAttention") + raise ValueError("Could not update the init decoder subgraph to use DecoderMaskedSelfAttention") node.attribute.append(onnx.helper.make_attribute("init_decoder", gpt2_init_decoder_model.graph)) else: @@ -1844,14 +1840,11 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati logger.info("*****update decoder subgraph to make past and present share buffer******************") update_decoder_subgraph_past_present_share_buffer(decoder_model.graph) - # Update decoder subgraph in preparation to use DecoderMaskedMultiheadAttention - if ( - args.use_decoder_masked_multihead_attention - and not update_decoder_subgraph_use_decoder_masked_multihead_attention( - decoder_model.graph, is_beamsearch, True - ) + # Update decoder subgraph in preparation to use DecoderMaskedSelfAttention + if args.use_decoder_masked_self_attention and not update_decoder_subgraph_use_decoder_masked_self_attention( + decoder_model.graph, is_beamsearch, True ): - raise ValueError("Could not update the decoder subgraph to use DecoderMaskedMultiheadAttention") + raise ValueError("Could not update the decoder subgraph to use DecoderMaskedSelfAttention") node.attribute.append(onnx.helper.make_attribute("decoder", decoder_model.graph)) diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index 11972e556e57c..52b41d52c1667 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -634,7 +634,7 @@ std::vector Softmax_QK_Transpose_V(MLFloat16* softmax_qk_transpose_ma return output; } -TEST(DecoderMaskedMultiheadAttentionTest, Test_fp32) { +TEST(DecoderMaskedSelfAttentionTest, Test_fp32) { // The kernel is only supported on CC 5.3 or higher GPUs if (NeedSkipIfCudaArchLowerThan(530)) { return; @@ -652,7 +652,7 @@ TEST(DecoderMaskedMultiheadAttentionTest, Test_fp32) { int total_sequence_length = sequence_length + past_sequence_length; int max_sequence_length = past_sequence_length + 1; // Always keep > past_sequence_length - OpTester tester("DecoderMaskedMultiheadAttention", 1, onnxruntime::kMSDomain); + OpTester tester("DecoderMaskedSelfAttention", 1, onnxruntime::kMSDomain); tester.AddAttribute("num_heads", 
static_cast(number_of_heads)); tester.AddAttribute("past_present_share_buffer", static_cast(1)); @@ -746,7 +746,7 @@ TEST(DecoderMaskedMultiheadAttentionTest, Test_fp32) { } } -TEST(DecoderMaskedMultiheadAttentionTest, Test_fp16) { +TEST(DecoderMaskedSelfAttentionTest, Test_fp16) { // The kernel is only supported on CC 5.3 or higher GPUs if (NeedSkipIfCudaArchLowerThan(530)) { return; @@ -765,7 +765,7 @@ TEST(DecoderMaskedMultiheadAttentionTest, Test_fp16) { int total_sequence_length = sequence_length + past_sequence_length; int max_sequence_length = past_sequence_length + 1; // Always keep > past_sequence_length - OpTester tester("DecoderMaskedMultiheadAttention", 1, onnxruntime::kMSDomain); + OpTester tester("DecoderMaskedSelfAttention", 1, onnxruntime::kMSDomain); tester.AddAttribute("num_heads", static_cast(number_of_heads)); tester.AddAttribute("past_present_share_buffer", static_cast(1)); diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py index 33122f08b7906..f044f98e1da05 100644 --- a/onnxruntime/test/python/transformers/test_generation.py +++ b/onnxruntime/test/python/transformers/test_generation.py @@ -137,17 +137,17 @@ def test_greedy_search_past_present_share_buffer_fp16(self): self.run_beam_search("--past_present_share_buffer --use_gpu -p fp16", is_greedy=True) @pytest.mark.slow - def test_greedy_search_use_decoder_masked_multihead_attention(self): + def test_greedy_search_use_decoder_masked_self_attention(self): if self.enable_cuda: self.run_beam_search( - "--past_present_share_buffer --use_decoder_masked_multihead_attention --use_gpu", is_greedy=True + "--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu", is_greedy=True ) @pytest.mark.slow - def test_greedy_search_use_decoder_masked_multihead_attention_fp16(self): + def test_greedy_search_use_decoder_masked_self_attention_fp16(self): if self.enable_cuda: self.run_beam_search( - "--past_present_share_buffer --use_decoder_masked_multihead_attention --use_gpu -p fp16", is_greedy=True + "--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu -p fp16", is_greedy=True ) @pytest.mark.slow @@ -157,16 +157,14 @@ def test_greedy_search_float16(self): self.run_beam_search("--repetition_penalty 1.0 --use_gpu -p fp16", is_greedy=True) @pytest.mark.slow - def test_beam_search_use_decoder_masked_multihead_attention(self): + def test_beam_search_use_decoder_masked_self_attention(self): if self.enable_cuda: - self.run_beam_search(f"--past_present_share_buffer --use_decoder_masked_multihead_attention --use_gpu") + self.run_beam_search(f"--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu") @pytest.mark.slow - def test_beam_search_use_decoder_masked_multihead_attention_fp16(self): + def test_beam_search_use_decoder_masked_self_attention_fp16(self): if self.enable_cuda: - self.run_beam_search( - f"--past_present_share_buffer --use_decoder_masked_multihead_attention --use_gpu -p fp16" - ) + self.run_beam_search(f"--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu -p fp16") @pytest.mark.slow def test_external_data(self): From 28f64066de828decef47709f9030f36976ec71e4 Mon Sep 17 00:00:00 2001 From: "Nat Kershaw (MSFT)" Date: Thu, 23 Mar 2023 15:08:49 -0700 Subject: [PATCH 08/20] Auto deploy API docs (#15088) --- .github/workflows/publish-c-apidocs.yml | 42 +++---- .github/workflows/publish-csharp-apidocs.yml | 65 +++++------ .github/workflows/publish-gh-pages.yml | 101 +++++++++++++++++ 
.github/workflows/publish-java-apidocs.yml | 29 +++-- .github/workflows/publish-python-apidocs.yml | 44 ++++---- .../plot_convert_pipeline_vectorizer.py | 104 ------------------ docs/python/inference/examples_md.rst | 9 +- 7 files changed, 204 insertions(+), 190 deletions(-) create mode 100644 .github/workflows/publish-gh-pages.yml delete mode 100644 docs/python/inference/examples/plot_convert_pipeline_vectorizer.py diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 487336428ad3b..4e478204f1f66 100644 --- a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -1,15 +1,25 @@ name: Update C/C++ API Docs + +# Run when the C API changes or every month so that the artifact does not expire on: push: branches: - main paths: - include/onnxruntime/core/session - + schedule: + - cron: '0 0 1 * *' workflow_dispatch: +concurrency: + group: "apidocs-c" + cancel-in-progress: false + +permissions: + contents: write + jobs: - publish: + build: name: Generate C/C++ API docs runs-on: ubuntu-latest steps: @@ -21,27 +31,21 @@ jobs: sudo apt-get install libclang-cpp14 wget https://www.doxygen.nl/files/doxygen-1.9.6.linux.bin.tar.gz tar xvzf doxygen-1.9.6.linux.bin.tar.gz - - name: Set commit ID - id: vars - run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - name: Run doxygen run: | mkdir -p build/doxygen cd docs/c_cxx ../../doxygen-1.9.6/bin/doxygen - - uses: actions/checkout@v2 - with: - ref: gh-pages - clean: false - - name: Move API docs into target area + - name: Log source commit + run: git rev-parse --short HEAD > build/doxygen/html/source-version.txt + - name: Move C/C++ docs into site run: | - rm -rf docs/api/c - mv build/doxygen/html docs/api/c - - name: Create Pull Request - uses: peter-evans/create-pull-request@v3 + mkdir -p _site/docs/api + rm -rf site/docs/api/c + mv build/doxygen/html _site/docs/api/c + - name: Upload new site + uses: actions/upload-artifact@v3 with: - branch: gh-pages-pr-c-docs - base: gh-pages - title: '[Automated]: Update C/C++ API docs' - commit-message: 'Update C/C++ API docs to commit ${{ steps.vars.outputs.sha_short }}' - add-paths: docs/api/c + name: onnxruntime-c-apidocs + path: _site + retention-days: 60 diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 095b5297c1000..7d33a782fb488 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -1,11 +1,28 @@ name: Update C# API Docs + +# Run when the C# API changes or every month so that the artifact does not expire on: push: branches: - main + paths: + - csharp + schedule: + - cron: '0 0 1 * *' + workflow_dispatch: + +concurrency: + group: "apidocs-csharp" + cancel-in-progress: false + +permissions: + contents: write + jobs: - publish: - runs-on: windows-latest + build: + runs-on: ubuntu-latest + env: + DOCFXVERSION: 2.62.2 steps: - uses: actions/checkout@v3 - name: Setup .NET @@ -17,39 +34,25 @@ jobs: - name: Download DocFX run: | mkdir -p build/docfx - Invoke-WebRequest -Uri "https://github.com/dotnet/docfx/releases/download/v${env:DOCFXVERSION}/docfx.zip" -OutFile "build/docfx/docfx.zip" - [System.IO.Compression.ZipFile]::ExtractToDirectory("build/docfx/docfx.zip", "build/docfx" ) - cd build/docfx - ls - env: - DOCFXVERSION: 2.59.3 + wget https://github.com/dotnet/docfx/releases/download/v${DOCFXVERSION}/docfx-linux-x64-v${DOCFXVERSION}.zip -O build/docfx/docfx.zip + unzip 
build/docfx/docfx.zip -d build/docfx - name: Install NuGet uses: nuget/setup-nuget@v1 - name: Build Documentation run: | - ls - build/docfx/docfx.exe metadata csharp/ApiDocs/docfx.json + build/docfx/docfx metadata csharp/ApiDocs/docfx.json dotnet build csharp/ApiDocs/ApiDocs.csproj --no-restore - build/docfx/docfx.exe build csharp/ApiDocs/docfx.json - - name: Set commit ID - id: vars - run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - - uses: actions/checkout@v2 - with: - ref: gh-pages - clean: false - - name: Move API docs into target area + build/docfx/docfx build csharp/ApiDocs/docfx.json + - name: Log source commit + run: git rev-parse --short HEAD > csharp/ApiDocs/csharp/source-version.txt + - name: Move C# docs into site run: | - if (Test-Path -Path docs/api/csharp) {rm -r -fo docs/api/csharp} - MOVE csharp/ApiDocs/csharp docs/api - rm -r -fo csharp/ApiDocs - rm -r -fo csharp/src - - name: Git Checkin - run: git add . - - name: Create Pull Request - uses: peter-evans/create-pull-request@v3 + mkdir -p _site/docs/api + rm -rf _site/docs/api/csharp + mv csharp/ApiDocs/csharp _site/docs/api/csharp + - name: Upload docs artifact + uses: actions/upload-artifact@v3 with: - branch: gh-pages-pr-csharp-docs - base: gh-pages - title: '[Automated]: Update C# API docs' - commit-message: 'Update C# API docs to commit ${{ steps.vars.outputs.sha_short }}' + name: onnxruntime-csharp-apidocs + path: _site + retention-days: 60 diff --git a/.github/workflows/publish-gh-pages.yml b/.github/workflows/publish-gh-pages.yml new file mode 100644 index 0000000000000..5ddb1e3bb03d1 --- /dev/null +++ b/.github/workflows/publish-gh-pages.yml @@ -0,0 +1,101 @@ +# Sample workflow for building and deploying a Jekyll site to GitHub Pages +name: Publish site + +on: + # Runs on pushes targeting the branch where the website sources live + push: + branches: ["gh-pages"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
+concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Build job + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + ref: gh-pages + + - name: Setup Pages + uses: actions/configure-pages@v3 + + - name: Build with Jekyll + uses: actions/jekyll-build-pages@v1 + with: + source: ./ + destination: ./_site + + - name: Download C apidocs artifact + uses: dawidd6/action-download-artifact@v2 + with: + name: onnxruntime-c-apidocs + workflow: publish-c-apidocs.yml + branch: main + path: apidocs + + - name: Download C# apidocs artifact + uses: dawidd6/action-download-artifact@v2 + with: + name: onnxruntime-csharp-apidocs + workflow: publish-csharp-apidocs.yml + branch: main + path: apidocs + + - name: Download Java apidocs artifact + uses: dawidd6/action-download-artifact@v2 + with: + name: onnxruntime-java-apidocs + workflow: publish-java-apidocs.yml + branch: main + path: apidocs + + - name: Download Python apidocs artifact + uses: dawidd6/action-download-artifact@v2 + with: + name: onnxruntime-python-apidocs + workflow: publish-python-apidocs.yml + branch: main + path: apidocs + + - name: Move apidocs folder into place + run: | + sudo rm -rf _site/docs/api/c + sudo mv apidocs/docs/api/c _site/docs/api + sudo rm -rf _site/docs/api/csharp + sudo mv apidocs/docs/api/csharp _site/docs/api + sudo rm -rf _site/docs/api/java + sudo mv apidocs/docs/api/java _site/docs/api + sudo rm -rf _site/docs/api/python + sudo mv apidocs/docs/api/python _site/docs/api + + - name: Upload site + uses: actions/upload-pages-artifact@v1 + with: + retention-days: 21 + + # Deployment job + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + needs: build + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v1 diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml index c4d462b581caa..b81ea47c7fc37 100644 --- a/.github/workflows/publish-java-apidocs.yml +++ b/.github/workflows/publish-java-apidocs.yml @@ -1,21 +1,25 @@ name: Update Java API Docs + +# Run when the Java API changes or every month so that the artifact does not expire on: push: branches: - main paths: - java + schedule: + - cron: '0 0 1 * *' workflow_dispatch: concurrency: group: "apidocs-java" - cancel-in-progress: true + cancel-in-progress: false permissions: contents: write jobs: - publish: + build: name: Generate Java docs runs-on: ubuntu-latest steps: @@ -31,15 +35,16 @@ jobs: build-root-directory: java gradle-executable: java/gradlew arguments: javadoc - - name: Get source commit - id: vars - run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - name: Log source commit - run: echo $(git rev-parse --short HEAD) > java/build/docs/javadoc/version.txt - - name: Deploy - uses: JamesIves/github-pages-deploy-action@v4 + run: git rev-parse --short HEAD > java/build/docs/javadoc/source-version.txt + - name: Move Java docs into site + run: | + rm -rf _site/docs/api/java + mkdir -p _site/docs/api + mv java/build/docs/javadoc _site/docs/api/java + - name: Upload new site + uses: actions/upload-artifact@v3 with: - branch: gh-pages - folder: java/build/docs/javadoc - target-folder: docs/api/java - clean: true + name: onnxruntime-java-apidocs + path: _site + retention-days: 60 diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index 0ac99c08831c9..263dbe92299a5 100644 --- 
a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -1,12 +1,25 @@ name: Update Python API Docs + +# Run when the Python API changes or every month so that the artifact does not expire on: push: branches: - main + paths: + - onnxruntime/python + schedule: + - cron: '0 0 1 * *' workflow_dispatch: +concurrency: + group: "apidocs-python" + cancel-in-progress: true + +permissions: + contents: write + jobs: - publish: + build: name: Generate Python API docs runs-on: ubuntu-latest steps: @@ -27,23 +40,16 @@ jobs: run: | cd tools/doc ./builddoc.sh /usr/bin ../.. ../../build - - name: Set vars - id: vars - run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)" - - uses: actions/checkout@v2 - with: - ref: gh-pages - clean: false - - name: Move API docs into target area + - name: Log source commit + run: git rev-parse --short HEAD > build/docs/inference/html/source-version.txt + - name: Move Python docs into site run: | - ls docs/api - rm -rf docs/api/python - mv build/docs/inference/html docs/api/python - - name: Create Pull Request - uses: peter-evans/create-pull-request@v3 + rm -rf _site/docs/api/python + mkdir -p _site/docs/api + mv build/docs/inference/html _site/docs/api/python + - name: Upload docs artifact + uses: actions/upload-artifact@v3 with: - branch: gh-pages-pr-python-docs - base: gh-pages - title: '[Automated]: Update Python API docs' - commit-message: 'Update Python API docs to commit ${{ steps.vars.outputs.sha_short }}' - add-paths: docs/api/python + name: onnxruntime-python-apidocs + path: _site + retention-days: 60 diff --git a/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py b/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py deleted file mode 100644 index 3df6d6dfea9bf..0000000000000 --- a/docs/python/inference/examples/plot_convert_pipeline_vectorizer.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -""" -Train, convert and predict with ONNX Runtime -============================================ - -This example demonstrates an end to end scenario -starting with the training of a scikit-learn pipeline -which takes as inputs not a regular vector but a -dictionary ``{ int: float }`` as its first step is a -`DictVectorizer `_. - -.. contents:: - :local: - -Train a pipeline -++++++++++++++++ - -The first step consists in retrieving the boston datset. -""" -import pandas -from sklearn.datasets import load_boston - -boston = load_boston() -X, y = boston.data, boston.target - -from sklearn.model_selection import train_test_split - -X_train, X_test, y_train, y_test = train_test_split(X, y) -X_train_dict = pandas.DataFrame(X_train[:, 1:]).T.to_dict().values() -X_test_dict = pandas.DataFrame(X_test[:, 1:]).T.to_dict().values() - -#################################### -# We create a pipeline. - -from sklearn.ensemble import GradientBoostingRegressor -from sklearn.feature_extraction import DictVectorizer -from sklearn.pipeline import make_pipeline - -pipe = make_pipeline(DictVectorizer(sparse=False), GradientBoostingRegressor()) - -pipe.fit(X_train_dict, y_train) - -#################################### -# We compute the prediction on the test set -# and we show the confusion matrix. 
-from sklearn.metrics import r2_score - -pred = pipe.predict(X_test_dict) -print(r2_score(y_test, pred)) - -#################################### -# Conversion to ONNX format -# +++++++++++++++++++++++++ -# -# We use module -# `sklearn-onnx `_ -# to convert the model into ONNX format. - -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import DictionaryType, FloatTensorType, Int64TensorType, SequenceType - -# initial_type = [('float_input', DictionaryType(Int64TensorType([1]), FloatTensorType([])))] -initial_type = [("float_input", DictionaryType(Int64TensorType([1]), FloatTensorType([])))] -onx = convert_sklearn(pipe, initial_types=initial_type) -with open("pipeline_vectorize.onnx", "wb") as f: - f.write(onx.SerializeToString()) - -################################## -# We load the model with ONNX Runtime and look at -# its input and output. -import onnxruntime as rt -from onnxruntime.capi.onnxruntime_pybind11_state import InvalidArgument - -sess = rt.InferenceSession("pipeline_vectorize.onnx", providers=rt.get_available_providers()) - -import numpy - -inp, out = sess.get_inputs()[0], sess.get_outputs()[0] -print("input name='{}' and shape={} and type={}".format(inp.name, inp.shape, inp.type)) -print("output name='{}' and shape={} and type={}".format(out.name, out.shape, out.type)) - -################################## -# We compute the predictions. -# We could do that in one call: - -try: - pred_onx = sess.run([out.name], {inp.name: X_test_dict})[0] -except (RuntimeError, InvalidArgument) as e: - print(e) - -############################# -# But it fails because, in case of a DictVectorizer, -# ONNX Runtime expects one observation at a time. -pred_onx = [sess.run([out.name], {inp.name: row})[0][0, 0] for row in X_test_dict] - -############################### -# We compare them to the model's ones. -print(r2_score(pred, pred_onx)) - -######################### -# Very similar. *ONNX Runtime* uses floats instead of doubles, -# that explains the small discrepencies. diff --git a/docs/python/inference/examples_md.rst b/docs/python/inference/examples_md.rst index b3426e824efd5..454bd423edf8d 100644 --- a/docs/python/inference/examples_md.rst +++ b/docs/python/inference/examples_md.rst @@ -4,21 +4,20 @@ Gallery of examples =================== - + This series of examples briefly goes into the main - feature *ONNX Runtime* implements. Each of them run in a + feature *ONNX Runtime* implements. Each of them run in a few seconds and relies on machine learned models trained with `scikit-learn `_. - + .. toctree:: :maxdepth: 1 :caption: Contents: - + auto_examples/plot_load_and_predict auto_examples/plot_common_errors auto_examples/plot_train_convert_predict auto_examples/plot_pipeline auto_examples/plot_backend - auto_examples/plot_convert_pipeline_vectorizer auto_examples/plot_metadata auto_examples/plot_profiling From 0200995058a16b9dcb5def5a9c047d9cd250ca69 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 23 Mar 2023 15:17:52 -0700 Subject: [PATCH 09/20] Bump webpack from 5.75.0 to 5.76.0 in /js (#15159) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [webpack](https://github.com/webpack/webpack) from 5.75.0 to 5.76.0.
Release notes, sourced from webpack's releases:

v5.76.0 covers bugfixes, features, security fixes, repo changes, and new contributors.
Full changelog: https://github.com/webpack/webpack/compare/v5.75.0...v5.76.0

Commits:
- 97b1718 Merge pull request #16781 from askoufis/loader-context-target-type
- b84efe6 Merge pull request #16759 from ryanwilsonperkin/real-content-hash-regex-perf
- c98e9e0 Merge pull request #16493 from piwysocki/patch-1
- 5f34acf feat: Add target to LoaderContext type
- b7fc4d8 Merge pull request #16703 from ryanwilsonperkin/ryanwilsonperkin/fix-16160
- 63ea82d Merge branch 'webpack:main' into patch-1
- 4ba2252 Merge pull request #16446 from akhilgkrishnan/patch-1
- 1acd635 Merge pull request #16613 from jakebailey/ts-logo
- 302eb37 Merge pull request #16614 from jakebailey/html5-logo
- cfdb1df Improve performance of hashRegExp lookup
- Additional commits are viewable in the compare view.

Maintainer changes: this version was pushed to npm by evilebottnawi, a new releaser for webpack since your current version.

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=webpack&package-manager=npm_and_yarn&previous-version=5.75.0&new-version=5.76.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options:

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/package-lock.json | 14 +++++++------- js/package.json | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/js/package-lock.json b/js/package-lock.json index 4116727644eec..deb97d1a076c7 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -26,7 +26,7 @@ "terser": "^5.16.5", "ts-loader": "^9.4.2", "typescript": "^4.9.5", - "webpack": "^5.75.0", + "webpack": "^5.76.0", "webpack-bundle-analyzer": "^4.8.0", "webpack-cli": "^5.0.1", "worker-loader": "^3.0.8" @@ -5381,9 +5381,9 @@ } }, "node_modules/webpack": { - "version": "5.75.0", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz", - "integrity": "sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==", + "version": "5.76.0", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.76.0.tgz", + "integrity": "sha512-l5sOdYBDunyf72HW8dF23rFtWq/7Zgvt/9ftMof71E/yUb1YLOBmTgA2K4vQthB3kotMrSj609txVE0dnr2fjA==", "dev": true, "dependencies": { "@types/eslint-scope": "^3.7.3", @@ -9741,9 +9741,9 @@ } }, "webpack": { - "version": "5.75.0", - "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.75.0.tgz", - "integrity": "sha512-piaIaoVJlqMsPtX/+3KTTO6jfvrSYgauFVdt8cr9LTHKmcq/AMd4mhzsiP7ZF/PGRNPGA8336jldh9l2Kt2ogQ==", + "version": "5.76.0", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.76.0.tgz", + "integrity": "sha512-l5sOdYBDunyf72HW8dF23rFtWq/7Zgvt/9ftMof71E/yUb1YLOBmTgA2K4vQthB3kotMrSj609txVE0dnr2fjA==", "dev": true, "requires": { "@types/eslint-scope": "^3.7.3", diff --git a/js/package.json b/js/package.json index eab0feda990be..0d57a1d86bf1f 100644 --- a/js/package.json +++ b/js/package.json @@ -20,7 +20,7 @@ "terser": "^5.16.5", "ts-loader": "^9.4.2", "typescript": "^4.9.5", - "webpack": "^5.75.0", + "webpack": "^5.76.0", "webpack-bundle-analyzer": "^4.8.0", "webpack-cli": "^5.0.1", "worker-loader": "^3.0.8" From ea245c94e78ef7fe6aa2ab02f2bba7e961a12388 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 24 Mar 2023 08:46:07 +1000 Subject: [PATCH 10/20] Add constant folding for simple QDQ Node Units (#15138) ### Description Currently we bail on constant folding if QDQ is enabled and we hit a DQ node. However, if we have a simple DQ -> X -> Q node unit where the DQ and X do not produce graph outputs, their output only has one consumer, and X is deterministic, we can constant fold all three nodes. Add support for this simple scenario primarily to constant fold a QDQ model that has had initializers updated by layout transformation, which results in patterns like `initializer -> DQ -> Transpose -> Q` or `initializer- > DQ -> Unsqueeze -> Q -> DQ -> Transpose -> Q` if the initializer is broadcast. ### Motivation and Context Improve end result of layout transformation on a QDQ model. 
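The actual change lives in the C++ optimizer (see the constant_folding.cc diff below). Purely as an illustration of the DQ -> X -> Q rule described above, the same pattern check can be sketched against the ONNX Python API; the helper names and the small whitelist of deterministic single-input ops here are assumptions made for the sketch, not part of the onnxruntime code.

```python
from onnx import GraphProto, NodeProto

# Ops treated as deterministic for this sketch only; the real pass consults
# optimizer_utils::IsOperationDeterministic and the compatible execution providers.
_FOLDABLE_MIDDLE_OPS = {"Transpose", "Unsqueeze"}

def _consumers(graph: GraphProto, tensor_name: str):
    """All nodes in the graph that read tensor_name (subgraphs ignored for brevity)."""
    return [n for n in graph.node if tensor_name in n.input]

def _is_graph_output(graph: GraphProto, tensor_name: str) -> bool:
    return any(o.name == tensor_name for o in graph.output)

def is_simple_foldable_qdq_unit(graph: GraphProto, dq: NodeProto) -> bool:
    """True if dq heads a DQ -> X -> Q unit matching the rules described above."""
    if dq.op_type != "DequantizeLinear":
        return False
    dq_out = dq.output[0]
    dq_users = _consumers(graph, dq_out)
    # DQ must not produce a graph output and must feed exactly one node X.
    if _is_graph_output(graph, dq_out) or len(dq_users) != 1:
        return False
    x = dq_users[0]
    # X must be deterministic with a single input and a single output.
    if x.op_type not in _FOLDABLE_MIDDLE_OPS or len(x.input) != 1 or len(x.output) != 1:
        return False
    x_out = x.output[0]
    x_users = _consumers(graph, x_out)
    # X's output must not be a graph output and its only consumer must be a Q node.
    if _is_graph_output(graph, x_out) or len(x_users) != 1:
        return False
    return x_users[0].op_type == "QuantizeLinear"
```

If the check passes, all three nodes can be evaluated at graph-transform time and replaced with the resulting initializer, which is exactly what the layout-transformation scenario above needs.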
--- .../core/optimizer/constant_folding.cc | 55 +++++++++++--- .../test/optimizer/graph_transform_test.cc | 71 +++++++++++++++--- ...nt_folding_qdq_node_unit.graph_output.onnx | Bin 0 -> 1335 bytes .../constant_folding_qdq_node_unit.onnx | Bin 0 -> 1933 bytes 4 files changed, 106 insertions(+), 20 deletions(-) create mode 100644 onnxruntime/test/testdata/transform/fusion/constant_folding_qdq_node_unit.graph_output.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/constant_folding_qdq_node_unit.onnx diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index d32cb87fed1c3..80e2bbedef974 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -7,6 +7,7 @@ #include "core/optimizer/utils.h" #include "core/graph/graph_utils.h" #include "core/optimizer/optimizer_execution_frame.h" +#include "core/optimizer/utils.h" #include "core/framework/op_kernel.h" #include "core/framework/tensorprotoutils.h" @@ -106,11 +107,6 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, continue; } - // avoid to constant fold DequantizeLinear for QDQ format - if (skip_dequantize_linear_ && node->OpType().compare("DequantizeLinear") == 0) { - continue; - } - ORT_RETURN_IF_ERROR(Recurse(*node, modified, graph_level, logger)); // Updating a node may allow shape inferencing to infer output shapes of following nodes, @@ -139,15 +135,52 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, } // Check if constant folding can be applied on this node. - if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) || - !optimizer_utils::IsOperationDeterministic(node->Domain(), node->OpType()) || - // constant folding does not support executing a node that includes subgraphs (control flow operators, - // such as If/Loop/Scan, fall into this category). individual nodes in the subgraph will be processed - // by the Recurse call above - node->ContainsSubgraph() || !graph_utils::AllNodeInputsAreConstant(graph, *node, constant_inputs, excluded_initializers_)) { + const auto can_constant_fold_node = [&](const Node& n, bool skip_inputs_constant_check = false) { + return graph_utils::IsSupportedProvider(n, GetCompatibleExecutionProviders()) && + optimizer_utils::IsOperationDeterministic(n.Domain(), n.OpType()) && + // constant folding does not support executing a node that includes subgraphs (control flow operators, + // such as If/Loop/Scan, fall into this category). 
individual nodes in the subgraph will be processed + // by the Recurse call above + !n.ContainsSubgraph() && + (skip_inputs_constant_check || + graph_utils::AllNodeInputsAreConstant(graph, n, constant_inputs, excluded_initializers_)); + }; + + if (!can_constant_fold_node(*node)) { continue; } + // if skip_dequantize_linear is true we want to maintain QDQ node units so avoid constant folding + // DequantizeLinear unless we can fold the whole QDQ node unit + if (skip_dequantize_linear_ && node->OpType() == "DequantizeLinear") { + bool can_constant_fold_qdq_node_unit = false; + + // Simplest scenario where the whole QDQ node unit of (DQ -> X -> Q) can be constant folded is if: + // - the DQ node does not produce a graph output, and its output is only consumed by X + // - X is a deterministic node with a single input and single output + // - the output from X is not a graph output and is only consumed by a Q node + if (optimizer_utils::CheckOutputEdges(graph, *node, 1)) { // DQ does not produce graph output, single consumer + const Node& node_x = *node->OutputNodesBegin(); + if (node_x.InputDefs().size() == 1 && + node_x.OutputDefs().size() == 1 && + optimizer_utils::CheckOutputEdges(graph, node_x, 1)) { + const Node& probably_q = *node_x.OutputNodesBegin(); + + if (probably_q.OpType() == "QuantizeLinear") { + // the inputs to these nodes are not const yet, but will be if we constant fold, + // so set skip_const_check to simulate that having happened + constexpr bool skip_const_check = true; + can_constant_fold_qdq_node_unit = can_constant_fold_node(node_x, skip_const_check) && + can_constant_fold_node(probably_q, skip_const_check); + } + } + } + + if (!can_constant_fold_qdq_node_unit) { + continue; + } + } + #if !defined(DISABLE_SPARSE_TENSORS) // Create execution frame for executing constant nodes. OptimizerExecutionFrame::Info info({node}, constant_inputs, graph.ModelPath(), execution_provider_, diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 813614493b51b..ca5011e28b7b5 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -725,9 +725,7 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithScalarShapeToInitializer) { ASSERT_TRUE(op_to_count["Add"] == 1); } -static void VerifyConstantFoldingWithDequantizeLinear(int quantize_linear_count, - int dequantize_linear_count, - int conv_count, +static void VerifyConstantFoldingWithDequantizeLinear(const std::unordered_map& expected_op_count, Graph& graph, SessionOptions& session_options, const Logger& logger) { @@ -748,9 +746,15 @@ static void VerifyConstantFoldingWithDequantizeLinear(int quantize_linear_count, ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, logger)); std::map op_to_count = CountOpsInGraph(graph); - ASSERT_TRUE(op_to_count["QuantizeLinear"] == quantize_linear_count); - ASSERT_TRUE(op_to_count["DequantizeLinear"] == dequantize_linear_count); - ASSERT_TRUE(op_to_count["Conv"] == conv_count); + for (const auto& entry : expected_op_count) { + if (entry.second == 0) { + ASSERT_TRUE(op_to_count.find(entry.first) == op_to_count.end()) + << entry.first << " should not exist in the graph"; + } else { + ASSERT_TRUE(op_to_count[entry.first] == entry.second) + << entry.first << " mismatch. 
Expected:" << entry.second << " Got:" << op_to_count[entry.first]; + } + } } TEST_F(GraphTransformationTests, ConstantFoldingWithDequantizeLinear) { @@ -763,17 +767,66 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithDequantizeLinear) { ASSERT_TRUE(op_to_count["DequantizeLinear"] == 3); ASSERT_TRUE(op_to_count["Conv"] == 1); + std::unordered_map expected_op_counts = {{"QuantizeLinear", 1}, + {"DequantizeLinear", 3}, + {"Conv", 1}}; + SessionOptions session_options; // Check DequantizeLinear aren't constant folded for default setting. - VerifyConstantFoldingWithDequantizeLinear(1, 3, 1, graph, session_options, *logger_); + VerifyConstantFoldingWithDequantizeLinear(expected_op_counts, graph, session_options, *logger_); // set kOrtSessionOptionsDisableQuantQDQ to enable it explicitly ASSERT_STATUS_OK(session_options.config_options.AddConfigEntry(kOrtSessionOptionsDisableQuantQDQ, "0")); - VerifyConstantFoldingWithDequantizeLinear(1, 3, 1, graph, session_options, *logger_); + VerifyConstantFoldingWithDequantizeLinear(expected_op_counts, graph, session_options, *logger_); // set SessionOptionsEnableQuantQDQ to disable it + expected_op_counts["DequantizeLinear"] = 1; ASSERT_STATUS_OK(session_options.config_options.AddConfigEntry(kOrtSessionOptionsDisableQuantQDQ, "1")); - VerifyConstantFoldingWithDequantizeLinear(1, 1, 1, graph, session_options, *logger_); + VerifyConstantFoldingWithDequantizeLinear(expected_op_counts, graph, session_options, *logger_); +} + +// model with 2 QDQ node units that can be constant folded as they are simple DQ -> Node -> Q where DQ and Node have +// single consumer and do not produce graph outputs. Node is deterministic. +// there are also other DQ nodes that should be ignored. +TEST_F(GraphTransformationTests, ConstantFoldingQDQNodeUnit) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/constant_folding_qdq_node_unit.onnx"; + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_)); + Graph& graph = model->MainGraph(); + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["QuantizeLinear"] == 3); + ASSERT_TRUE(op_to_count["DequantizeLinear"] == 4); + ASSERT_TRUE(op_to_count["Unsqueeze"] == 1); + ASSERT_TRUE(op_to_count["Transpose"] == 1); + + SessionOptions session_options; + + // 2 QDQ node units should be constant folded and go away + std::unordered_map expected_op_counts = {{"QuantizeLinear", 1}, + {"DequantizeLinear", 2}, + {"Transpose", 0}, + {"Unsqueeze", 0}}; + + VerifyConstantFoldingWithDequantizeLinear(expected_op_counts, graph, session_options, *logger_); +} + +// Simple QDQ Node Unit but shouldn't be constant folded as the node in the middle produces a graph output +TEST_F(GraphTransformationTests, ConstantFoldingQDQNodeUnitGraphOutput) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/constant_folding_qdq_node_unit.graph_output.onnx"; + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_)); + Graph& graph = model->MainGraph(); + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["QuantizeLinear"] == 2); + ASSERT_TRUE(op_to_count["DequantizeLinear"] == 3); + ASSERT_TRUE(op_to_count["Unsqueeze"] == 1); + + std::unordered_map expected_op_counts = {{"QuantizeLinear", 2}, + {"DequantizeLinear", 3}, + {"Unsqueeze", 1}}; + + SessionOptions session_options; + VerifyConstantFoldingWithDequantizeLinear(expected_op_counts, graph, session_options, *logger_); } TEST_F(GraphTransformationTests, 
ConstantFolding_RemoveDanglingInputNodesToConstantFoldedNode) { diff --git a/onnxruntime/test/testdata/transform/fusion/constant_folding_qdq_node_unit.graph_output.onnx b/onnxruntime/test/testdata/transform/fusion/constant_folding_qdq_node_unit.graph_output.onnx new file mode 100644 index 0000000000000000000000000000000000000000..76b64c0af17ab7e828d3ea1a020dbb5a42d46bde GIT binary patch literal 1335 zcmbVM(TdYR6zy~)jaO{Lu)EM8Y&W1tz?x=}UB!o5_QfKs1%2^lNS#p!+nFRYfptH? zPtaH2{0Kk6Kk`KPzLY()rA`zNjF#lTAAM~R~p8&oi z=*nEHcS+7DiO9e_%Q6ak_L6#?%ozA~`hw+>q>>AbkL|Tnw#0kJ1xs>rR_L_Q_cNN2 zLi3p-v`9(hN1Ha1R2D^^(q&3z)8-T z8a9S#*!+T8@vSG6GTP&!r}IQ8gT{I?)qHLQ&*3WqtHCzndRIA;% zZrRSZtY4d5JwKd5Z6$1y8N0>~3qkyraVUugN5PJH!JW#c_vVtaSs1v;YmV0624mC+ z4?HxgzZAv}=lo%KXdwP^J&e)Muxl{v*^He#KR!MV_sniWab6wV|@aUUWvR*MviDybz{SRy#75CyRaQ8z?LB@%4dj2t|L5y^}_9!sSM zV9gO&aRfFTfva!=HZV@f(E6vTQbi-hZ@zio`}yXLY1+3qV<-JM4Va{-#JS=32YvtP zuNM47(BVQR9|A!_?Bjv@tzaSSYum_OlKdDrj_!KVE7?i-fGDIuo>Oa75CJQ!l0N*_mp;MlQ? z+wPXOd2KL)6OxQ-BQ&aiMa|*e=V3^~9%Vf#0+vKPAw3>Rx={5O@EyVBe4CWsQX~?m zI*r)r=m2v$#cpf*PYh>W8Qy!r5+bAudG@I{J)FK0JPO_il5+MsOW>G`g$n!!f&$GZ zv39E>TjmhPj#C8)<6`5l8vPdp>xk{$Wx83vRyEtdpo8MJyQOX3Z7?Dt7D@+=t1Cet zFJs)Kx@RqtE8!{mcB(k8uY_|no5b3!icBk^3UD1I%%Dk`8bjEJ_W!~2NKHQd0h*Q%O8)4K`DLW?I{`wncI$hM$vBDET78QGe-p!2N6c*PO7#MZUH%;$2=R#;sm99Ln- J2Gp#Ze*y6Tiv0ip literal 0 HcmV?d00001 From 338e6672dd5f9a42f94bde96ec2fe970f142432f Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 24 Mar 2023 10:01:22 +0800 Subject: [PATCH 11/20] use build.sourceversion in cache image key (#15019) ### Description Use build.sourceversion in docker image cache key. ### Motivation and Context We used filpath as the cache key in #14496. In most cases, the docker base image tag is latest. So, the hash of the files couldn't be aware of the change of base image. As the result, the docker image restored, but the image will still be rebuilt . The maintenance cost would be huge if we pin image hash in docker file. For example, https://quay.io/repository/pypa/manylinux2014_x86_64?tab=tags&tag=latest, it's updated almost every week. So far, the build.sourceversion is the right way to keep cache is updated and valid. --- tools/ci_build/get_docker_image.py | 2 +- .../github/azure-pipelines/linux-cpu-aten-pipeline.yml | 3 --- .../azure-pipelines/templates/get-docker-image-steps.yml | 8 +++----- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index 3040b873c08ba..3639e58a1dccd 100755 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -36,7 +36,7 @@ def parse_args(): "--container-registry", help="The Azure container registry name. 
" "If not provided, no container registry will be used.", ) - parser.add_argument("--repository", help="The image repository name.") + parser.add_argument("--repository", required=True, help="The image repository name.") parser.add_argument("--use_imagecache", action="store_true", help="use cached image in pipeline cache") diff --git a/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml index 1a345a8bf0517..b4a92280ddc5a 100644 --- a/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-cpu-aten-pipeline.yml @@ -36,9 +36,6 @@ jobs: Repository: 'onnxruntimecpubuildaten' UseImageCacheContainerRegistry: false UsePipelineCache: true - DockerCacheKeys: 'tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_aten_cpu, tools/ci_build/github/linux/docker/manylinux*, - tools/ci_build/github/linux/docker/scripts/**/*, tools/ci_build/github/linux/docker/build_scripts/**/*, - !tools/ci_build/github/linux/docker/scripts/deps.txt' - task: Cache@2 inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml b/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml index f312af599dc18..e4f467da45d59 100644 --- a/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/get-docker-image-steps.yml @@ -25,9 +25,6 @@ parameters: - name: IMAGE_CACHE_DIR type: string default: $(Agent.TempDirectory)/docker -- name: DockerCacheKeys - type: string - default: 'tools/ci_build/github/linux/docker/**/*, !tools/ci_build/github/linux/docker/scripts/deps.txt' steps: @@ -74,10 +71,11 @@ steps: - task: Cache@2 displayName: Cache Docker Image Task inputs: - key: ' "${{ parameters.Repository }}" | ${{ parameters.DockerCacheKeys }} ' + key: ' "${{ parameters.Repository }}" | "$(Build.SourceVersion)" ' path: ${{ parameters.IMAGE_CACHE_DIR }} restoreKeys: | - "${{ parameters.Repository }}" | ${{ parameters.DockerCacheKeys }} + "${{ parameters.Repository }}" | "$(Build.SourceVersion)" + "${{ parameters.Repository }}" cacheHitVar: CACHE_RESTORED condition: eq('${{ parameters.UsePipelineCache }}', 'true') From 5c5c345abc9d4afe3f87b0ce4db1272ed519f2df Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 24 Mar 2023 12:30:05 +0800 Subject: [PATCH 12/20] Add smoking tests for all CPU Packages. (#15153) ### Description So far, 2 packages are not supported. 1. Mac silicon, because there isn't Mac silicon agent in Azure. 2. Linux ARM64, because there isn't microsoft-hosted Linux ARM64 agent in ADO and UsePythonVersion isn't supported in self-hosted Linux ARM pool. 
Test Run: https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=291132&view=logs&j=3a60a0ba-1640-5a1c-2d51-19af647b2d6b --- .../py-package-test-pipeline.yml | 82 ++++++------------- .../templates/py-package-smoking-test.yml | 74 +++++++++++++++++ 2 files changed, 100 insertions(+), 56 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 58f5dd0fa1c5b..989c263c42691 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -25,61 +25,31 @@ stages: - stage: Packages_Somking_Test dependsOn: [] jobs: - - job: MacOS_py_Wheels - timeoutInMinutes: 30 - workspace: - clean: all - pool: - vmImage: 'macOS-12' - variables: - MACOSX_DEPLOYMENT_TARGET: '10.15' - strategy: - matrix: - Python38: - PythonVersion: '3.8' - Python39: - PythonVersion: '3.9' - Python310: - PythonVersion: '3.10' - steps: - - task: UsePythonVersion@0 - displayName: 'Use Python' - inputs: - versionSpec: $(PythonVersion) - - - task: DownloadPipelineArtifact@2 - displayName: 'Download Pipeline Artifact' - inputs: - artifactName: 'onnxruntime' - targetPath: '$(Build.BinariesDirectory)/whl' - itemPattern: '*/*mac*x86_64.whl' - # The public ADO project - ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: - buildType: current - # The private ADO project - ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: - project: '530acbc4-21bc-487d-8cd8-348ff451d2ff' - definition: 841 - preferTriggeringPipeline: true - runVersion: 'latest' - buildType: specific - - - script: | - set -ex - cd $(Build.BinariesDirectory) - files=(whl/*.whl) - FILE_NAME="${files[0]}" - FILE_NAME=$(basename $FILE_NAME) - PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') - python3 -m pip install --find-links $(Build.BinariesDirectory)/whl $PYTHON_PACKAGE_NAME - pip show $PYTHON_PACKAGE_NAME - python -c "import onnxruntime as ort; print(ort.__version__)" - displayName: Test Package Installation - - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - + - template: templates/py-package-smoking-test.yml + parameters: + job_name: Test_MAC_Wheels + machine_pool: + vmImage: 'macOS-12' + itemPattern: '*/*mac*x86_64.whl' + - template: templates/py-package-smoking-test.yml + parameters: + job_name: Test_WIN_64_Wheels + itemPattern: '*/*win_amd64.whl' + machine_pool: + vmImage: 'windows-2022' + - template: templates/py-package-smoking-test.yml + parameters: + job_name: Test_WIN_32_Wheels + itemPattern: '*/*win32.whl' + python_arch: 'x86' + machine_pool: + vmImage: 'windows-2022' + - template: templates/py-package-smoking-test.yml + parameters: + job_name: Test_LINUX_x86_64_Wheels + itemPattern: '*/*manylinux*x86_64.whl' + machine_pool: + vmImage: 'ubuntu-22.04' # ****The following Stage depend on all previous tags. *** @@ -106,7 +76,7 @@ stages: - Linux_Test_GPU_x86_64_stage jobs: - job: Final - #Run this step only if all previous steps are succeeded and (this build was triggered by a resource trigger or it was triggered by another build). + # Run this step only if all previous steps are succeeded and (this build was triggered by a resource trigger or it was triggered by another build). 
condition: and(succeeded(), eq(variables['Build.Reason'], 'ResourceTrigger')) timeoutInMinutes: 60 variables: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml new file mode 100644 index 0000000000000..b9f9898551454 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/py-package-smoking-test.yml @@ -0,0 +1,74 @@ +parameters: +- name: job_name + type: string + +- name: itemPattern + type: string + default: '**' + +- name: machine_pool + type: object + +- name: python_arch + type: string + default: 'x64' + +jobs: +- job: ${{ parameters.job_name }} + timeoutInMinutes: 30 + workspace: + clean: all + pool: + ${{ parameters.machine_pool }} + strategy: + matrix: + Python38: + PythonVersion: '3.8' + Python39: + PythonVersion: '3.9' + Python310: + PythonVersion: '3.10' + steps: + - checkout: none + + - task: UsePythonVersion@0 + displayName: 'Use Python' + inputs: + versionSpec: $(PythonVersion) + architecture: ${{ parameters.python_arch }} + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact' + inputs: + artifactName: 'onnxruntime' + targetPath: '$(Build.BinariesDirectory)/whl' + itemPattern: ${{parameters.itemPattern}} + # The public ADO project + ${{ if eq(variables['System.CollectionId'], 'f3ad12f2-e480-4533-baf2-635c95467d29') }}: + buildType: current + # The private ADO project + ${{ if eq(variables['System.CollectionId'], 'bc038106-a83b-4dab-9dd3-5a41bc58f34c') }}: + project: '530acbc4-21bc-487d-8cd8-348ff451d2ff' + definition: 841 + preferTriggeringPipeline: true + runVersion: 'latest' + buildType: specific + + - task: Bash@3 + inputs: + targetType: 'inline' + script: | + set -ex + files=(whl/*.whl) + FILE_NAME="${files[0]}" + FILE_NAME=$(basename $FILE_NAME) + PYTHON_PACKAGE_NAME=$(echo "$FILE_NAME" | cut -f 1 -d '-') + python3 -m pip install --find-links "$(Build.BinariesDirectory)/whl" $PYTHON_PACKAGE_NAME + pip show $PYTHON_PACKAGE_NAME + python -c "import onnxruntime as ort; print(ort.__version__)" + workingDirectory: $(Build.BinariesDirectory) + displayName: Test Package Installation + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() From 0402f930f22e8c7a7e1b4f8f807aba157e2a84b5 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Thu, 23 Mar 2023 22:40:06 -0700 Subject: [PATCH 13/20] exclude decoder files in hipify.cmake (#15188) --- cmake/onnxruntime_rocm_hipify.cmake | 4 ++-- .../contrib_ops/cpu/transformers/beam_search_impl_gpt.h | 2 +- .../contrib_ops/cpu/transformers/greedy_search_impl_gpt.h | 2 +- .../cuda/decoder/decoder_masked_self_attention.cc | 5 +---- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 8a9c54359e28f..ebd6229204bb1 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -90,8 +90,8 @@ set(contrib_ops_excluded_files "cuda_contrib_kernels.h" "inverse.cc" "fused_conv.cc" - "decoder/decoder_masked_multihead_attention.h" - "decoder/decoder_masked_multihead_attention.cc" + "decoder/decoder_masked_self_attention.h" + "decoder/decoder_masked_self_attention.cc" "decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.h" "decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention.h" 
"decoder/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu" diff --git a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h index b8229387196cb..e90738a2639d0 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/beam_search_impl_gpt.h @@ -51,7 +51,7 @@ class BeamSearchGpt : public BeamSearchBase { cuda_device_arch_(cuda_device_arch) { if (gpt_subgraph_.has_decoder_masked_self_attention_) { ORT_ENFORCE(cuda_device_arch_ >= 530, - "Decoder masked multihead attention can only be used on " + "Decoder masked self attention can only be used on " "GPU cards of compute capability 5.3 or higher. " "This card has compute capability ", cuda_device_arch_); diff --git a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h index 4f5271ff4672f..ddbf86dd82613 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h +++ b/onnxruntime/contrib_ops/cpu/transformers/greedy_search_impl_gpt.h @@ -67,7 +67,7 @@ class GreedySearchGpt : public GreedySearchBase { cuda_device_arch_(cuda_device_arch) { if (gpt_subgraph_.has_decoder_masked_self_attention_) { ORT_ENFORCE(cuda_device_arch_ >= 530, - "Decoder masked multihead attention can only be used on " + "Decoder masked self attention can only be used on " "GPU cards of compute capability 5.3 or higher. " "This card has compute capability ", cuda_device_arch_); diff --git a/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc b/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc index e56dfeeec377d..bba7649703220 100644 --- a/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc +++ b/onnxruntime/contrib_ops/cuda/decoder/decoder_masked_self_attention.cc @@ -73,10 +73,7 @@ Status DecoderMaskedSelfAttention::ComputeInternal(OpKernelContext* cont return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input sequence length should be 1 to use DecoderMaskedSelfAttention"); } - // TODO(hasesh): In future, we may support CrossAttention. Currently, this kernel only supports SelfAttention. 
- if (parameters.sequence_length != parameters.kv_sequence_length) { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "DecoderMaskedSelfAttention only supports self attention currently"); - } + ORT_ENFORCE(parameters.sequence_length == parameters.kv_sequence_length); // TODO(hasesh): If there is a need, we will support this later if (parameters.head_size != parameters.v_head_size) { From 3a4c895765c7a917279f9ace442ae9f5119258ea Mon Sep 17 00:00:00 2001 From: Patrice Vignola Date: Thu, 23 Mar 2023 23:47:21 -0700 Subject: [PATCH 14/20] [DML EP] Add support for SkipLayerNorm's fourth output (#15160) --- .../DmlOperatorSkipLayerNormalization.cpp | 42 ++++++++++++++++++- .../src/Operators/OperatorRegistration.cpp | 3 +- .../OperatorAuthorHelper/OperatorHelper.cpp | 17 ++++++++ .../dml/OperatorAuthorHelper/OperatorHelper.h | 10 ++++- .../test/contrib_ops/skiplayernorm_op_test.cc | 7 +--- 5 files changed, 70 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp index 5f5a93282ee3a..5d527508606e2 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorSkipLayerNormalization.cpp @@ -13,11 +13,12 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator : DmlOperator(kernelCreationContext) { std::vector> kernelInputIndices = {0, 1, 2, 3, 4}; + std::vector> kernelOutputIndices = {0, 1, 2, 3}; DmlOperator::Initialize( kernelCreationContext, kernelInputIndices, - std::nullopt, + kernelOutputIndices, kernelCreationContext.GetTensorShapeDescription().GetInputTensorShape(0), std::nullopt, kernelCreationContext.GetTensorShapeDescription().GetInputTensorDimensionCount(0)); @@ -31,7 +32,7 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator std::iota(onnxAxes.begin(), onnxAxes.end(), onnxAxis); assert(m_inputTensorDescs.size() == 5); - assert(m_outputTensorDescs.size() == 1); + assert(m_outputTensorDescs.size() == 4); auto inputDesc = m_inputTensorDescs[0].GetDmlDesc(); auto skipDesc = m_inputTensorDescs[1].GetDmlDesc(); @@ -39,6 +40,7 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator auto betaDesc = m_inputTensorDescs[3].GetDmlDesc(); auto biasDesc = m_inputTensorDescs[4].GetDmlDesc(); auto outputDesc = m_outputTensorDescs[0].GetDmlDesc(); + auto inputSkipBiasSum = m_outputTensorDescs[3].GetDmlDesc(); TensorDesc inputSkipBiasTensorDesc(m_inputTensorDescs[0].GetDmlDataType(), m_inputTensorDescs[0].GetSizes()); DML_TENSOR_DESC inputSkipBiasDmlTensorDesc = inputSkipBiasTensorDesc.GetDmlDesc(); @@ -112,6 +114,23 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator biasInputEdge.ToNodeIndex = 1; biasInputEdge.ToNodeInputIndex = 1; inputEdges.push_back(std::move(biasInputEdge)); + + if (inputSkipBiasSum.Desc) + { + DML_OUTPUT_GRAPH_EDGE_DESC inputSkipBiasSumEdge = {}; + inputSkipBiasSumEdge.FromNodeIndex = 1; + inputSkipBiasSumEdge.FromNodeOutputIndex = 0; + inputSkipBiasSumEdge.GraphOutputIndex = 3; + outputEdges.push_back(std::move(inputSkipBiasSumEdge)); + } + } + else if (inputSkipBiasSum.Desc) + { + DML_OUTPUT_GRAPH_EDGE_DESC inputSkipBiasSumEdge = {}; + inputSkipBiasSumEdge.FromNodeIndex = 0; + inputSkipBiasSumEdge.FromNodeOutputIndex = 0; + inputSkipBiasSumEdge.GraphOutputIndex = 3; + 
outputEdges.push_back(std::move(inputSkipBiasSumEdge)); } // Insert the MVN operation into the graph @@ -159,6 +178,25 @@ class DmlOperatorSkipLayerNormalization : public DmlOperator } }; +void CALLBACK QuerySkipLayerNormalization(IMLOperatorSupportQueryContextPrivate* context, /*out*/ bool* isSupported) +{ + *isSupported = false; + + // `mean` output tensor is not supported yet + if (context->IsOutputValid(1)) + { + return; + } + + // `inv_std_var` output tensor is not supported yet + if (context->IsOutputValid(2)) + { + return; + } + + *isSupported = true; +} + DML_OP_DEFINE_CREATION_FUNCTION(SkipLayerNormalization, DmlOperatorSkipLayerNormalization); } // namespace Dml diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 67a6ff79911e6..1a09c39ebda3e 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -390,6 +390,7 @@ DML_OP_EXTERN_QUERY_FUNCTION(RecurrentNeuralNetwork); DML_OP_EXTERN_QUERY_FUNCTION(BatchNormalization); DML_OP_EXTERN_QUERY_FUNCTION(Pad); DML_OP_EXTERN_QUERY_FUNCTION(LayerNormalization); +DML_OP_EXTERN_QUERY_FUNCTION(SkipLayerNormalization); DML_OP_EXTERN_QUERY_FUNCTION(QLinearSigmoid); DML_OP_EXTERN_QUERY_FUNCTION(Attention); @@ -872,7 +873,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 10, ConvInteger, typeNameListThree, supportedTypeListInteger, DmlGraphSupport::Supported)}, {REG_INFO( 11, DynamicQuantizeLinear, typeNameListTwo, supportedTypeListDynamicQuantizeLinear, DmlGraphSupport::Supported)}, {REG_INFO( 7, LayerNormalization, typeNameListLayerNormContrib, supportedTypeListLayerNormalizationContrib, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryLayerNormalization)}, - {REG_INFO_MS( 1, SkipLayerNormalization, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)}, + {REG_INFO_MS( 1, SkipLayerNormalization, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QuerySkipLayerNormalization)}, {REG_INFO_MS( 1, EmbedLayerNormalization, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)}, }; diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index f8887f8b85df2..00e3e1fcd502b 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -2524,4 +2524,21 @@ namespace OperatorHelper return outputShapes; } + std::vector SkipLayerNormHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const + { + ML_CHECK_VALID_ARGUMENT(shapeInfo.GetInputCount() >= 3); + + auto inputShape = shapeInfo.GetInputTensorShape(0); + + std::vector outputShapes(4); + outputShapes[0] = EdgeShapes(inputShape); + + if (shapeInfo.IsOutputValid(3)) + { + outputShapes[3] = EdgeShapes(inputShape); + } + + return outputShapes; + } + } // namespace OperatorHelper diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h index f95b3af3d3326..630994db59200 100644 --- 
a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h @@ -1405,6 +1405,14 @@ class EmbedLayerNormalizationHelper std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; }; +class SkipLayerNormHelper +{ +public: + template + SkipLayerNormHelper(const Info_t& info, const Shape_t& shapeInfo) {} + std::vector GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const; +}; + using ShapeInferenceHelper_Conv = ConvHelper; using ShapeInferenceHelper_ConvTranspose = ConvTransposeHelper; using ShapeInferenceHelper_ConvTransposeWithDynamicPads = ConvTransposeWithDynamicPadsHelper; @@ -1427,7 +1435,7 @@ using ShapeInferenceHelper_LRN = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_MeanVarianceNormalization = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_LayerNormalization = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_LayerNormalization17 = GetOutputShapeAsInputShapeHelper; -using ShapeInferenceHelper_SkipLayerNormalization = GetOutputShapeAsInputShapeHelper; +using ShapeInferenceHelper_SkipLayerNormalization = SkipLayerNormHelper; using ShapeInferenceHelper_EmbedLayerNormalization = EmbedLayerNormalizationHelper; using ShapeInferenceHelper_LpNormalization = GetOutputShapeAsInputShapeHelper; using ShapeInferenceHelper_RNN = RecurrentHelper; diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc index 638b7565a3ef0..a6620eb5282cc 100644 --- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc +++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc @@ -448,8 +448,6 @@ TEST(SkipLayerNormTest, SkipLayerNormBatch2_Bias) { hidden_size); } -// Don't enable this test for DML builds as these EP doesn't produce the new optional output yet -#if !defined(USE_DML) TEST(SkipLayerNormTest, SkipLayerNormBatch2_Bias_ProducingOptionalOutput) { int batch_size = 2; int sequence_length = 2; @@ -506,8 +504,8 @@ TEST(SkipLayerNormTest, SkipLayerNormBatch2_Bias_ProducingOptionalOutput) { hidden_size); } -// SkipSimplifiedLayerNorm has not been enabled for ROCm yet -#if !defined(USE_ROCM) +// SkipSimplifiedLayerNorm has not been enabled for ROCm and DML yet +#if !defined(USE_ROCM) && !defined(USE_DML) TEST(SkipLayerNormTest, SkipSimplifiedLayerNormBatch1_Float16) { int batch_size = 1; int sequence_length = 2; @@ -544,7 +542,6 @@ TEST(SkipLayerNormTest, SkipSimplifiedLayerNormBatch1_Float16) { true); } #endif -#endif } // namespace test } // namespace onnxruntime From 7eb6dbe7d862117b865cbf51a0329db0dae37779 Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Fri, 24 Mar 2023 19:31:14 +0800 Subject: [PATCH 15/20] [ROCm] Add compute type for Skiplayernorm to fix ROCm CI (#15192) - Add compute type for Skiplayernorm to fix ROCm CI and get more accurate results. SkipLayerNorm: type T: input, skip, bias type U: epsilon, compute result type V: output, beta, gamma - refactor the usage of aligned_vector, reduce the usage of `reinterpret_cast`. 
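The T/U/V split above can be pictured with a small numpy reference: the inputs (type T) and gamma/beta (type V) may be half precision, while the mean/variance math runs in float32 (type U). This is only a sketch for intuition; the epsilon value and the final cast are placeholders, and the exact kernel behaviour is in the diff below.

```python
import numpy as np

def skip_layer_norm_ref(x, skip, gamma, beta, bias=None, eps=1e-12):
    # x, skip, bias: type T (e.g. float16); gamma, beta: type V;
    # eps and all intermediate statistics: type U (float32 here).
    acc = x.astype(np.float32) + skip.astype(np.float32)
    if bias is not None:
        acc = acc + bias.astype(np.float32)
    mean = acc.mean(axis=-1, keepdims=True)
    var = acc.var(axis=-1, keepdims=True)
    norm = (acc - mean) / np.sqrt(var + eps)
    out = norm * gamma.astype(np.float32) + beta.astype(np.float32)
    return out.astype(x.dtype)  # cast back to the output type (V, same as T here)

# Accumulating in float32 keeps fp16 inputs close to the fp32 reference result.
h = np.random.rand(2, 4, 8).astype(np.float16)
s = np.random.rand(2, 4, 8).astype(np.float16)
g = np.ones(8, dtype=np.float16)
b = np.zeros(8, dtype=np.float16)
print(skip_layer_norm_ref(h, s, g, b).dtype)  # float16 outputs, float32 math
```
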
--- .../contrib_ops/rocm/bert/layer_norm.cuh | 89 ++++++-------- .../contrib_ops/rocm/bert/skip_layer_norm.cc | 2 +- .../rocm/bert/skip_layer_norm_impl.cu | 17 +-- .../rocm/bert/skip_layer_norm_impl.h | 8 +- .../rocm/bert/skip_layer_norm_impl_kernel.h | 112 ++++++++---------- .../rocm/bert/skip_layer_norm_tunable_op.h | 81 +++++++------ .../kernels/rocm/skip_layer_norm.cu | 22 ++-- .../kernels/skip_layer_norm_test.py | 2 +- 8 files changed, 157 insertions(+), 176 deletions(-) diff --git a/onnxruntime/contrib_ops/rocm/bert/layer_norm.cuh b/onnxruntime/contrib_ops/rocm/bert/layer_norm.cuh index 169d0ed3b0147..9b7dbd5291a8b 100644 --- a/onnxruntime/contrib_ops/rocm/bert/layer_norm.cuh +++ b/onnxruntime/contrib_ops/rocm/bert/layer_norm.cuh @@ -80,16 +80,16 @@ struct KeyValuePairSum { } }; -template +template __device__ inline void LayerNorm( - const hipcub::KeyValuePair& thread_data, const int ld, const int offset, const T* beta, - const T* gamma, const T epsilon, T* output) { + const hipcub::KeyValuePair& thread_data, const int ld, const int offset, const V* beta, + const V* gamma, const U epsilon, V* output) { // Assuming thread_data is already divided by ld - using BlockReduce = hipcub::BlockReduce, TPB>; + using BlockReduce = hipcub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T mu; // mean - __shared__ T rsigma; // 1 / std.dev. + __shared__ U mu; // mean + __shared__ U rsigma; // 1 / std.dev. KeyValuePairSum pair_sum; const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); @@ -102,23 +102,23 @@ __device__ inline void LayerNorm( for (int i = threadIdx.x; i < ld; i += TPB) { const int idx = offset + i; - const T val = output[idx]; - const T g(gamma[i]); - const T b = (nullptr == beta) ? (T)0 : beta[i]; - output[idx] = g * (val - mu) * rsigma + b; + const U val = static_cast(output[idx]); + const U g = static_cast(gamma[i]); + const U b = (nullptr == beta) ? U(0.f) : static_cast(beta[i]); + output[idx] = static_cast(g * (val - mu) * rsigma + b); } } -template +template __device__ inline void LayerNormVec( - const hipcub::KeyValuePair& thread_data, const int ld, const int offset, const T* beta, - const T* gamma, const T epsilon, T* output) { + const hipcub::KeyValuePair& thread_data, const int ld, const int offset, const V* beta, + const V* gamma, const U epsilon, V* output) { // Assuming thread_data is already divided by ld - using VecT = aligned_vector; - using BlockReduce = hipcub::BlockReduce, TPB>; + using VecV = aligned_vector; + using BlockReduce = hipcub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T mu; // mean - __shared__ T rsigma; // 1 / std.dev. + __shared__ U mu; // mean + __shared__ U rsigma; // 1 / std.dev. KeyValuePairSum pair_sum; const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); @@ -130,44 +130,37 @@ __device__ inline void LayerNormVec( __syncthreads(); if (ILP * threadIdx.x < ld) { - T beta_v[ILP], gamma_v[ILP], output_v[ILP]; - VecT* gamma_val = reinterpret_cast(&gamma_v); - VecT* output_val = reinterpret_cast(&output_v); - for (int i = threadIdx.x * ILP; i < ld; i += TPB * ILP) { int idx = offset + i; - if (beta != nullptr) { - VecT* beta_val = reinterpret_cast(&beta_v); - *beta_val = *reinterpret_cast(&beta[i]); - } - *gamma_val = *reinterpret_cast(&gamma[i]); - *output_val = *reinterpret_cast(&output[idx]); + const VecV beta_v = (beta != nullptr) ? 
*reinterpret_cast(beta + i) : VecV(); + const VecV gamma_v = *reinterpret_cast(gamma + i); + VecV output_v = *reinterpret_cast(output + idx); + #pragma unroll for (int k = 0; k < ILP; k++) { - output_v[k] = (beta != nullptr) ? gamma_v[k] * (output_v[k] - mu) * rsigma + beta_v[k] : - gamma_v[k] * (output_v[k] - mu) * rsigma; + output_v.val[k] = (beta != nullptr) ? U(gamma_v.val[k]) * (U(output_v.val[k]) - mu) * rsigma + U(beta_v.val[k]) : + U(gamma_v.val[k]) * (U(output_v.val[k]) - mu) * rsigma; } - *(reinterpret_cast(&output[idx])) = *reinterpret_cast(&output_v[0]); + *(reinterpret_cast(output + idx)) = output_v; } } } -template -__device__ inline void LayerNormSmall(const T* input_v, const hipcub::KeyValuePair& thread_data, - const int ld, const int idx, const T* beta, const T* gamma, - const T epsilon, T* output) { +template +__device__ inline void LayerNormSmall(const T* input_v, const hipcub::KeyValuePair& thread_data, + const int ld, const int idx, const V* beta, const V* gamma, + const U epsilon, V* output) { // Assuming thread_data is already divided by ld // Small settings: the block covers the leading dimension TPB >= ld. The input // value is available in a register - using VecT = aligned_vector; - using BlockReduce = hipcub::BlockReduce, TPB>; + using VecV = aligned_vector; + using BlockReduce = hipcub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; - __shared__ T mu; // mean - __shared__ T rsigma; // 1 / std.dev. - T beta_v[ILP], gamma_v[ILP], output_v[ILP]; + __shared__ U mu; // mean + __shared__ U rsigma; // 1 / std.dev. KeyValuePairSum pair_sum; - const hipcub::KeyValuePair sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); + const hipcub::KeyValuePair sum_kv = BlockReduce(temp_storage).Reduce(thread_data, pair_sum); if (threadIdx.x == 0) { mu = sum_kv.key; @@ -176,20 +169,16 @@ __device__ inline void LayerNormSmall(const T* input_v, const hipcub::KeyValuePa __syncthreads(); if (ILP * threadIdx.x < ld) { - if (beta != nullptr) { - VecT* beta_val = reinterpret_cast(&beta_v); - *beta_val = *reinterpret_cast(&beta[threadIdx.x * ILP]); - } - - VecT* gamma_val = reinterpret_cast(&gamma_v); - *gamma_val = *reinterpret_cast(&gamma[threadIdx.x * ILP]); + const VecV beta_v = (beta != nullptr) ? *reinterpret_cast(beta + threadIdx.x * ILP) : VecV(); + const VecV gamma_v = *reinterpret_cast(gamma + threadIdx.x * ILP); + VecV output_v; #pragma unroll for (int i = 0; i < ILP; i++) { - output_v[i] = (beta != nullptr) ? gamma_v[i] * (input_v[i] - mu) * rsigma + beta_v[i] : - gamma_v[i] * (input_v[i] - mu) * rsigma; + output_v.val[i] = (beta != nullptr) ? 
U(gamma_v.val[i]) * (U(input_v[i]) - mu) * rsigma + U(beta_v.val[i]) : + U(gamma_v.val[i]) * (U(input_v[i]) - mu) * rsigma; } - *(reinterpret_cast(&output[idx])) = *reinterpret_cast(&output_v[0]); + *(reinterpret_cast(output + idx)) = output_v; } } diff --git a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm.cc index a254a8c04fbcd..24dbb87b50ef2 100644 --- a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm.cc @@ -101,7 +101,7 @@ Status SkipLayerNorm::ComputeInternal(OpKernelContext* ctx) const { int64_t element_count = input_dims[0] * sequence_length * hidden_size; typedef typename ToHipType::MappedType HipT; - return LaunchSkipLayerNormKernel( + return LaunchSkipLayerNormKernel( GetTuningContext(), Stream(ctx), reinterpret_cast(output->MutableData()), diff --git a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.cu b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.cu index bf33f940b3936..c6ac1196ba7b1 100644 --- a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.cu @@ -39,30 +39,31 @@ namespace onnxruntime { namespace contrib { namespace rocm { -template +template Status LaunchSkipLayerNormKernel( - RocmTuningContext* tuning_ctx, hipStream_t stream, T* output, T* skip_input_bias_add_output, const T* input, - const T* skip, const T* gamma, const T* beta, const T* bias, float epsilon, int ld, int element_count) { + RocmTuningContext* tuning_ctx, hipStream_t stream, V* output, T* skip_input_bias_add_output, const T* input, + const T* skip, const V* gamma, const V* beta, const T* bias, float epsilon, int ld, int element_count) { // this must be true because element_count is the total size of the tensor assert(element_count % ld == 0); - SkipLayerNormParams params(tuning_ctx, stream, output, skip_input_bias_add_output, input, skip, gamma, beta, bias, epsilon, ld, element_count); + SkipLayerNormParams params(tuning_ctx, stream, output, skip_input_bias_add_output, input, skip, + gamma, beta, bias, epsilon, ld, element_count); if (tuning_ctx->IsTunableOpEnabled()) { - static SkipLayerNormTunableOp op; + static SkipLayerNormTunableOp op; return op(¶ms); } - return SkipLayerNormStaticSelection(¶ms); + return SkipLayerNormStaticSelection(¶ms); } -template Status LaunchSkipLayerNormKernel( +template Status LaunchSkipLayerNormKernel( RocmTuningContext* tuning_ctx, hipStream_t stream, float* output, float* skip_input_bias_add_output, const float* input, const float* skip, const float* gamma, const float* beta, const float* bias, float epsilon, int ld, int element_count); -template Status LaunchSkipLayerNormKernel( +template Status LaunchSkipLayerNormKernel( RocmTuningContext* tuning_ctx, hipStream_t stream, half* output, half* skip_input_bias_add_output, const half* input, const half* skip, const half* gamma, const half* beta, const half* bias, float epsilon, int ld, diff --git a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.h b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.h index 911164af92292..a1c09142fef2e 100644 --- a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.h +++ b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl.h @@ -10,16 +10,16 @@ namespace onnxruntime { namespace contrib { namespace rocm { -template +template Status LaunchSkipLayerNormKernel( RocmTuningContext* tuning, hipStream_t stream, - T* output, // output tensor + V* output, // output tensor T* 
skip_input_bias_add_output, // optional output tensor const T* input, // input tensor const T* skip, // skip tensor - const T* gamma, // Layer normalization gamma tensor - const T* beta, // Layer normalization beta tensor + const V* gamma, // Layer normalization gamma tensor + const V* beta, // Layer normalization beta tensor const T* bias, // Layer normalization beta tensor float epsilon, // Layer normalization epsilon int hidden_size, // hidden size, it is the leading dimension (ld) diff --git a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl_kernel.h b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl_kernel.h index aeb954de09f48..ee8959458b8e4 100644 --- a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl_kernel.h +++ b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_impl_kernel.h @@ -23,133 +23,125 @@ half maybe2half(float x) { return __float2half_rn(x); } -template +template __global__ void SkipLayerNormKernel( - const int ld, const T* input, const T* skip, const T* beta, const T* gamma, const T* bias, - const T epsilon, T* output, T* skip_input_bias_add_output) { - const T reverse_ld = T(1.f / ld); + const int ld, const T* input, const T* skip, const V* beta, const V* gamma, const T* bias, + const U epsilon, V* output, T* skip_input_bias_add_output) { + const U reverse_ld = U(1.f / ld); const int offset = blockIdx.x * ld; KeyValuePairSum pair_sum; // reduce x and x^2 - hipcub::KeyValuePair thread_data(0, 0); + hipcub::KeyValuePair thread_data(U(0.f), U(0.f)); for (int i = threadIdx.x; i < ld; i += TPB) { const int idx = offset + i; - const T val = (bias == nullptr) ? input[idx] + skip[idx] : input[idx] + skip[idx] + bias[i]; - const T rldval = reverse_ld * val; - thread_data = pair_sum(thread_data, hipcub::KeyValuePair(rldval, rldval * val)); + const U val = (bias == nullptr) ? 
static_cast(input[idx]) + static_cast(skip[idx]) : + static_cast(input[idx]) + static_cast(skip[idx]) + static_cast(bias[i]); + const U rldval = reverse_ld * val; + thread_data = pair_sum(thread_data, hipcub::KeyValuePair(rldval, rldval * val)); if (skip_input_bias_add_output != nullptr) { - skip_input_bias_add_output[idx] = val; + skip_input_bias_add_output[idx] = static_cast(val); } - output[idx] = val; + output[idx] = static_cast(val); } - LayerNorm(thread_data, ld, offset, beta, gamma, epsilon, output); + LayerNorm(thread_data, ld, offset, beta, gamma, epsilon, output); } // Vectorized kernel -template +template __global__ void SkipLayerNormKernelVec( - const int ld, const T* input, const T* skip, const T* beta, const T* gamma, - const T* bias, const T epsilon, T* output, T* skip_input_bias_add_output, + const int ld, const T* input, const T* skip, const V* beta, const V* gamma, + const T* bias, const U epsilon, V* output, T* skip_input_bias_add_output, bool hasBias, bool hasSkipInputBiasAdditionOutput) { - const T reverse_ld = T(1.f / ld); + const U reverse_ld = U(1.f / ld); const int offset = blockIdx.x * ld; KeyValuePairSum pair_sum; // reduce x and x^2 - hipcub::KeyValuePair thread_data(0, 0); + hipcub::KeyValuePair thread_data(U(0.f), U(0.f)); using VecT = aligned_vector; - T input_v[ILP], skip_v[ILP], bias_v[ILP], skip_input_bias_add_output_v[ILP]; + using VecV = aligned_vector; if (threadIdx.x * ILP < ld) { - VecT* input_val = reinterpret_cast(&input_v); - VecT* skip_val = reinterpret_cast(&skip_v); - for (int i = threadIdx.x * ILP; i < ld; i += TPB * ILP) { int idx = offset + i; - *input_val = *reinterpret_cast(&input[idx]); - *skip_val = *reinterpret_cast(&skip[idx]); - if (hasBias) { - VecT* bias_val = reinterpret_cast(&bias_v); - *bias_val = *reinterpret_cast(&bias[i]); - } + const VecT input_v = *reinterpret_cast(input + idx); + const VecT skip_v = *reinterpret_cast(skip + idx); + const VecT bias_v = hasBias ? *reinterpret_cast(bias + i) : VecT(); + VecT skip_input_bias_add_output_v, output_v; #pragma unroll for (int k = 0; k < ILP; k++) { - input_v[k] += hasBias ? skip_v[k] + bias_v[k] : skip_v[k]; + const U val = hasBias ? 
static_cast(input_v.val[k]) + static_cast(skip_v.val[k]) + static_cast(bias_v.val[k]) : + static_cast(input_v.val[k]) + static_cast(skip_v.val[k]); + const U rldval = reverse_ld * val; if (hasSkipInputBiasAdditionOutput) { - skip_input_bias_add_output_v[k] = input_v[k]; + skip_input_bias_add_output_v.val[k] = static_cast(val); } - - const T rldval = reverse_ld * input_v[k]; - thread_data = pair_sum(thread_data, hipcub::KeyValuePair(rldval, rldval * input_v[k])); + thread_data = pair_sum(thread_data, hipcub::KeyValuePair(rldval, rldval * val)); + output_v.val[k] = static_cast(val); } if (hasSkipInputBiasAdditionOutput) { - *(reinterpret_cast(&skip_input_bias_add_output[idx])) = *reinterpret_cast(&skip_input_bias_add_output_v); + *(reinterpret_cast(skip_input_bias_add_output + idx)) = skip_input_bias_add_output_v; } - *(reinterpret_cast(&output[idx])) = *reinterpret_cast(&input_v[0]); + *(reinterpret_cast(output + idx)) = output_v; } } - LayerNormVec(thread_data, ld, offset, beta, gamma, epsilon, output); + LayerNormVec(thread_data, ld, offset, beta, gamma, epsilon, output); } // Vectorized kernel -template +template __global__ void SkipLayerNormKernelSmall( - const int ld, const T* input, const T* skip, const T* beta, const T* gamma, - const T* bias, const T epsilon, T* output, T* skip_input_bias_add_output, + const int ld, const T* input, const T* skip, const V* beta, const V* gamma, + const T* bias, const U epsilon, V* output, T* skip_input_bias_add_output, bool hasBias, bool hasSkipInputBiasAdditionOutput) { - const T rld = T(1.f / ld); + const U rld = U(1.f / ld); const int idx = blockIdx.x * ld + threadIdx.x * ILP; // grid_size = n / ld using VecT = aligned_vector; - T input_v[ILP], skip_v[ILP], bias_v[ILP], skip_input_bias_add_output_v[ILP]; - - hipcub::KeyValuePair thread_data(T(0.f), T(0.f)); + hipcub::KeyValuePair thread_data(U(0.f), U(0.f)); + VecT input_v; if (ILP * threadIdx.x < ld) { - VecT* input_val = reinterpret_cast(&input_v); - *input_val = *reinterpret_cast(&input[idx]); - - VecT* skip_val = reinterpret_cast(&skip_v); - *skip_val = *reinterpret_cast(&skip[idx]); - - if (hasBias) { - VecT* bias_val = reinterpret_cast(&bias_v); - *bias_val = *reinterpret_cast(&bias[threadIdx.x * ILP]); - } + input_v = *reinterpret_cast(input + idx); + const VecT skip_v = *reinterpret_cast(skip + idx); + const VecT bias_v = hasBias ? *reinterpret_cast(bias + threadIdx.x * ILP) : VecT(); + VecT skip_input_bias_add_output_v; - T rldval_sum = T(0.f); - T rldvalsq_sum = T(0.f); + U rldval_sum = U(0.f); + U rldvalsq_sum = U(0.f); #pragma unroll for (int i = 0; i < ILP; i++) { - input_v[i] += hasBias ? skip_v[i] + bias_v[i] : skip_v[i]; + const U val = hasBias ? 
static_cast(input_v.val[i]) + static_cast(skip_v.val[i]) + static_cast(bias_v.val[i]) : + static_cast(input_v.val[i]) + static_cast(skip_v.val[i]); if (hasSkipInputBiasAdditionOutput) { - skip_input_bias_add_output_v[i] = input_v[i]; + skip_input_bias_add_output_v.val[i] = static_cast(val); } - const T rldval = rld * input_v[i]; + const U rldval = rld * val; rldval_sum += rldval; - rldvalsq_sum += rldval * input_v[i]; + rldvalsq_sum += rldval * val; + input_v.val[i] = static_cast(val); } if (hasSkipInputBiasAdditionOutput) { - *(reinterpret_cast(&skip_input_bias_add_output[idx])) = *reinterpret_cast(&skip_input_bias_add_output_v); + *(reinterpret_cast(skip_input_bias_add_output + idx)) = skip_input_bias_add_output_v; } - thread_data = hipcub::KeyValuePair(rldval_sum, rldvalsq_sum); + thread_data = hipcub::KeyValuePair(rldval_sum, rldvalsq_sum); } - LayerNormSmall(input_v, thread_data, ld, idx, beta, gamma, epsilon, output); + LayerNormSmall(input_v.val, thread_data, ld, idx, beta, gamma, epsilon, output); } } // namespace rocm diff --git a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_tunable_op.h b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_tunable_op.h index 4354b794144d0..a0b2507220ae1 100644 --- a/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_tunable_op.h +++ b/onnxruntime/contrib_ops/rocm/bert/skip_layer_norm_tunable_op.h @@ -18,76 +18,75 @@ namespace onnxruntime { namespace contrib { namespace rocm { -template +template struct SkipLayerNormParams : OpParams { - SkipLayerNormParams(RocmTuningContext* tuning_ctx, hipStream_t stream, T* output, T* skip_input_bias_add_output, const T* input, - const T* skip, const T* gamma, const T* beta, + SkipLayerNormParams(RocmTuningContext* tuning_ctx, hipStream_t stream, V* output, T* skip_input_bias_add_output, const T* input, + const T* skip, const V* gamma, const V* beta, const T* bias, float epsilon, int ld, int element_count) - : OpParams(tuning_ctx, stream), output(output), skip_input_bias_add_output(skip_input_bias_add_output), input(input), skip(skip), - gamma(gamma), beta(beta), bias(bias), epsilon(epsilon), ld(ld), element_count(element_count) {} + : OpParams(tuning_ctx, stream), output(output), skip_input_bias_add_output(skip_input_bias_add_output), input(input), skip(skip), gamma(gamma), beta(beta), bias(bias), epsilon(epsilon), ld(ld), element_count(element_count) {} std::string Signature() const override { std::string sig = std::to_string(ld) + "_" + std::to_string(element_count); return sig; } - T* output; + V* output; T* skip_input_bias_add_output; const T* input; const T* skip; - const T* gamma; - const T* beta; + const V* gamma; + const V* beta; const T* bias; float epsilon; int ld; int element_count; }; -template -Status SkipLayerNormSmallOp(const SkipLayerNormParams* params) { +template +Status SkipLayerNormSmallOp(const SkipLayerNormParams* params) { // Loosen the hard constraint for ld (hidden_size) to include more possible *Small kernels, // which could offer better performance in some combinations of ThreadsPerBlock and VecSize. 
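// Concretely, the guard below accepts only shapes where ld is a multiple of VecSize, fits within a single
// block of vectorized loads (ld <= ThreadsPerBlock * VecSize, capped at 8192), and is large enough that
// fewer than GPU_WARP_SIZE threads in the block are left without elements to process.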
TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( !((params->ld <= 8192 && params->ld % VecSize == 0 && params->ld <= ThreadsPerBlock * VecSize && params->ld > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize))); - SkipLayerNormKernelSmall<<element_count, params->ld)), - dim3(ThreadsPerBlock), - 0, params->stream>>>( + SkipLayerNormKernelSmall<<element_count, params->ld)), + dim3(ThreadsPerBlock), + 0, params->stream>>>( params->ld, params->input, params->skip, - params->beta, params->gamma, params->bias, maybe2half(params->epsilon), params->output, params->skip_input_bias_add_output, + params->beta, params->gamma, params->bias, static_cast(params->epsilon), params->output, params->skip_input_bias_add_output, (params->bias == nullptr) ? false : true, (params->skip_input_bias_add_output == nullptr) ? false : true); return HIP_CALL(hipGetLastError()); } -template -Status SkipLayerNormRegularOp(const SkipLayerNormParams* params) { +template +Status SkipLayerNormRegularOp(const SkipLayerNormParams* params) { TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF( !((params->ld > 0 && params->ld % VecSize == 0 && (params->ld >= ThreadsPerBlock * VecSize || (params->ld < GPU_WARP_SIZE && params->ld > (ThreadsPerBlock - GPU_WARP_SIZE) * VecSize))))); - SkipLayerNormKernelVec<<element_count, params->ld)), - dim3(ThreadsPerBlock), - 0, params->stream>>>( + SkipLayerNormKernelVec<<element_count, params->ld)), + dim3(ThreadsPerBlock), + 0, params->stream>>>( params->ld, params->input, params->skip, - params->beta, params->gamma, params->bias, maybe2half(params->epsilon), params->output, params->skip_input_bias_add_output, + params->beta, params->gamma, params->bias, static_cast(params->epsilon), params->output, params->skip_input_bias_add_output, (params->bias == nullptr) ? false : true, (params->skip_input_bias_add_output == nullptr) ? false : true); return HIP_CALL(hipGetLastError()); } -template -Status SkipLayerNormStaticSelection(const SkipLayerNormParams* params) { +template +Status SkipLayerNormStaticSelection(const SkipLayerNormParams* params) { bool hasBias = (params->bias == nullptr) ? false : true; bool hasSkipInputBiasAdditionOutput = (params->skip_input_bias_add_output == nullptr) ? 
false : true; const int grid_size = params->element_count / params->ld; const int block_size = 256; -#define LAUNCH_SKIPLAYERNORM_SMALL_FORWARD(ELEMENTS, TPB, ILP) \ - if (params->ld <= ELEMENTS) { \ - SkipLayerNormKernelSmall<<stream>>>( \ - params->ld, params->input, params->skip, params->beta, params->gamma, params->bias, \ - maybe2half(params->epsilon), params->output, params->skip_input_bias_add_output, \ - hasBias, hasSkipInputBiasAdditionOutput); \ - break; \ +#define LAUNCH_SKIPLAYERNORM_SMALL_FORWARD(ELEMENTS, TPB, ILP) \ + if (params->ld <= ELEMENTS) { \ + SkipLayerNormKernelSmall<<stream>>>( \ + params->ld, params->input, params->skip, params->beta, params->gamma, params->bias, \ + static_cast(params->epsilon), params->output, params->skip_input_bias_add_output, \ + hasBias, hasSkipInputBiasAdditionOutput); \ + break; \ } if (0 == (params->ld % 4)) { do { @@ -98,9 +97,9 @@ Status SkipLayerNormStaticSelection(const SkipLayerNormParams* params) { LAUNCH_SKIPLAYERNORM_SMALL_FORWARD(768, 192, 4) LAUNCH_SKIPLAYERNORM_SMALL_FORWARD(1024, 256, 4) - SkipLayerNormKernel<<stream>>>( + SkipLayerNormKernel<<stream>>>( params->ld, params->input, params->skip, params->beta, params->gamma, params->bias, - maybe2half(params->epsilon), params->output, params->skip_input_bias_add_output); + static_cast(params->epsilon), params->output, params->skip_input_bias_add_output); } while (0); } else { do { @@ -109,20 +108,20 @@ Status SkipLayerNormStaticSelection(const SkipLayerNormParams* params) { LAUNCH_SKIPLAYERNORM_SMALL_FORWARD(128, 128, 1) LAUNCH_SKIPLAYERNORM_SMALL_FORWARD(384, 384, 1) - SkipLayerNormKernel<<stream>>>( + SkipLayerNormKernel<<stream>>>( params->ld, params->input, params->skip, params->beta, params->gamma, params->bias, - maybe2half(params->epsilon), params->output, params->skip_input_bias_add_output); + static_cast(params->epsilon), params->output, params->skip_input_bias_add_output); } while (0); } return HIP_CALL(hipPeekAtLastError()); } // namespace rocm #define ADD_OP_FOR_ALL_VEC_SIZE(name, threads_per_block) \ - this->RegisterOp(name); \ - this->RegisterOp(name); \ - this->RegisterOp(name); \ - this->RegisterOp(name); \ - this->RegisterOp(name); + this->RegisterOp(name); \ + this->RegisterOp(name); \ + this->RegisterOp(name); \ + this->RegisterOp(name); \ + this->RegisterOp(name); #define ADD_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(name) \ ADD_OP_FOR_ALL_VEC_SIZE(name, 64) \ @@ -141,11 +140,11 @@ Status SkipLayerNormStaticSelection(const SkipLayerNormParams* params) { ADD_OP_FOR_ALL_VEC_SIZE(name, 896) \ ADD_OP_FOR_ALL_VEC_SIZE(name, 1024) -template -class SkipLayerNormTunableOp : public TunableOp> { +template +class SkipLayerNormTunableOp : public TunableOp> { public: SkipLayerNormTunableOp() { - this->RegisterOp(SkipLayerNormStaticSelection); + this->RegisterOp(SkipLayerNormStaticSelection); ADD_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(SkipLayerNormSmallOp) ADD_OP_FOR_ALL_THREADS_PER_BLOCK_ALL_VEC_SIZE(SkipLayerNormRegularOp) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/skip_layer_norm.cu b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/skip_layer_norm.cu index ac5ec602f8dda..37a9f14769d5a 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/rocm/skip_layer_norm.cu +++ b/onnxruntime/python/tools/kernel_explorer/kernels/rocm/skip_layer_norm.cu @@ -23,16 +23,16 @@ class SkipLayerNormSmall : public IKernelExplorer { static_cast(beta.ptr()), static_cast(bias.ptr()), epsilon, hidden_size, element_count) {} void Run() override { - 
ORT_THROW_IF_ERROR((contrib::rocm::SkipLayerNormSmallOp(&params_))); + ORT_THROW_IF_ERROR((contrib::rocm::SkipLayerNormSmallOp(&params_))); } bool IsSupported() { - Status status = contrib::rocm::SkipLayerNormSmallOp(&params_); + Status status = contrib::rocm::SkipLayerNormSmallOp(&params_); return status.IsOK(); } private: - using ParamsT = contrib::rocm::SkipLayerNormParams; + using ParamsT = contrib::rocm::SkipLayerNormParams; ParamsT params_{}; }; @@ -47,16 +47,16 @@ class SkipLayerNormRegular : public IKernelExplorer { static_cast(beta.ptr()), static_cast(bias.ptr()), epsilon, hidden_size, element_count) {} void Run() override { - ORT_THROW_IF_ERROR((contrib::rocm::SkipLayerNormRegularOp(&params_))); + ORT_THROW_IF_ERROR((contrib::rocm::SkipLayerNormRegularOp(&params_))); } bool IsSupported() { - Status status = contrib::rocm::SkipLayerNormRegularOp(&params_); + Status status = contrib::rocm::SkipLayerNormRegularOp(&params_); return status.IsOK(); } private: - using ParamsT = contrib::rocm::SkipLayerNormParams; + using ParamsT = contrib::rocm::SkipLayerNormParams; ParamsT params_{}; }; @@ -71,16 +71,16 @@ class SkipLayerNormStaticSelection : public IKernelExplorer { static_cast(beta.ptr()), static_cast(bias.ptr()), epsilon, hidden_size, element_count) {} void Run() override { - ORT_THROW_IF_ERROR((contrib::rocm::SkipLayerNormStaticSelection(&params_))); + ORT_THROW_IF_ERROR((contrib::rocm::SkipLayerNormStaticSelection(&params_))); } bool IsSupported() { - Status status = contrib::rocm::SkipLayerNormStaticSelection(&params_); + Status status = contrib::rocm::SkipLayerNormStaticSelection(&params_); return status.IsOK(); } private: - using ParamsT = contrib::rocm::SkipLayerNormParams; + using ParamsT = contrib::rocm::SkipLayerNormParams; ParamsT params_{}; }; @@ -105,9 +105,9 @@ class SkipLayerNormTunable : public IKernelExplorer { } private: - using ParamsT = contrib::rocm::SkipLayerNormParams; + using ParamsT = contrib::rocm::SkipLayerNormParams; ParamsT params_{}; - contrib::rocm::SkipLayerNormTunableOp op_{}; + contrib::rocm::SkipLayerNormTunableOp op_{}; }; #define REGISTER_OP(name, type, threads_per_block, vec_size) \ diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py index 2a653f92a43f9..006e563ed8995 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py @@ -92,7 +92,7 @@ def run_skip_layer_norm(batch_size: int, seq_len: int, hidden_size: int, dtype: y_ref, y_optional = skip_layer_norm(input_x, skip, bias, gamma, beta, epsilon) np.testing.assert_almost_equal(y_ref, output_y, decimal=1) if has_optional_output: - np.testing.assert_almost_equal(y_optional, output_optional, decimal=1) + np.testing.assert_almost_equal(y_optional, output_optional, decimal=3) dtypes = ["float32", "float16"]

From 56bccac35d10042fc3c6a37e10335d134602206c Mon Sep 17 00:00:00 2001
From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com>
Date: Fri, 24 Mar 2023 21:43:44 +0800
Subject: [PATCH 16/20] [ROCm] update bert-L convergence reference file to fix CI (#15200)

The layernorm change led to a change in the bert-L convergence results.
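The baseline is a small CSV of `step,total_loss,mlm_loss,nsp_loss` rows, so an update like this can be sanity-checked by comparing a fresh run against the committed baseline within a tolerance. A minimal sketch of that check (this is not the repository's CI comparison script; the second file name and the tolerance are illustrative assumptions):

```python
import csv


def load_losses(path):
    # Read the convergence CSV into {step: (total_loss, mlm_loss, nsp_loss)}.
    with open(path, newline="") as f:
        reader = csv.DictReader(f)
        return {
            int(row["step"]): tuple(float(row[k]) for k in ("total_loss", "mlm_loss", "nsp_loss"))
            for row in reader
        }


def compare(baseline_path, new_path, rel_tol=1e-2):
    baseline = load_losses(baseline_path)
    new = load_losses(new_path)
    assert baseline.keys() == new.keys(), "step mismatch between runs"
    for step, ref_vals in baseline.items():
        for ref, val in zip(ref_vals, new[step]):
            # Small numerical drift (e.g. from a kernel change) is tolerated;
            # larger deviations indicate a real convergence regression.
            assert abs(val - ref) <= rel_tol * abs(ref), f"step {step}: {val} vs {ref}"


if __name__ == "__main__":
    compare("bert_base.convergence.baseline.mi100.csv", "bert_base.convergence.new_run.csv")
```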
--- .../bert_base.convergence.baseline.mi100.csv | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv index 10b5b22c070d5..21363646655ae 100644 --- a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv +++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv @@ -1,11 +1,11 @@ step,total_loss,mlm_loss,nsp_loss -0,11.2032,10.501,0.702181 -5,9.53939,7.52411,2.01528 -10,8.2614,7.564,0.697406 -15,8.28412,7.55601,0.728112 -20,8.17273,7.45947,0.71326 -25,8.228,7.53251,0.695496 -30,8.07991,7.38456,0.695344 -35,7.96173,7.25046,0.711262 -40,7.9463,7.25667,0.689625 -45,7.92987,7.26442,0.665449 +0,11.2031,10.501,0.702177 +5,9.54193,7.52403,2.01789 +10,8.26154,7.56404,0.6975 +15,8.28391,7.55602,0.727896 +20,8.17255,7.4595,0.713049 +25,8.2279,7.53246,0.69544 +30,8.07992,7.38461,0.695307 +35,7.96175,7.25041,0.711333 +40,7.94631,7.25669,0.689622 +45,7.92991,7.26444,0.665472 From dc87691000ceca1b42e3c46b1ce9066a739505ed Mon Sep 17 00:00:00 2001 From: Justin Stoecker Date: Fri, 24 Mar 2023 13:50:17 -0700 Subject: [PATCH 17/20] Enable DML graph fusion independently of graph optimization level (#15172) ### Description Apply the DML graph fusion transformer optimization independently of ORT graph optimization level. ### Motivation and Context The DML graph fusion transformer is not a graph optimizer in the normal sense: it isn't optimizing the ONNX graph structure, but rather fusing nodes into what will later become a single IDMLCompiledOperator (using IDMLDevice1::CompileGraph). This transformer can't be done ahead of time (hence why it's disabled if saving an optimized model), but it's also gated by the ORT graph optimization level; this makes it impossible to preoptimize ONNX models ("offline mode") and then later disable graph optimizations for better startup performance ("online mode") while benefiting from DML graph fusion. --- .../dml/dml_session_options_config_keys.h | 23 +++++++++++++++++++ onnxruntime/core/session/inference_session.cc | 10 ++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/core/providers/dml/dml_session_options_config_keys.h diff --git a/onnxruntime/core/providers/dml/dml_session_options_config_keys.h b/onnxruntime/core/providers/dml/dml_session_options_config_keys.h new file mode 100644 index 0000000000000..d11fa7516e713 --- /dev/null +++ b/onnxruntime/core/providers/dml/dml_session_options_config_keys.h @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +/* + * This file defines SessionOptions Config Keys and format of the Config Values. + * + * The Naming Convention for a SessionOptions Config Key, + * "[Area][.[SubArea1].[SubArea2]...].[Keyname]" + * Such as "ep.cuda.use_arena" + * The Config Key cannot be empty + * The maximum length of the Config Key is 128 + * + * The string format of a SessionOptions Config Value is defined individually for each Config. + * The maximum length of the Config Value is 1024 + */ + +// Influences whether or not the DirectML graph fusion transformer is allowed. +// "0": not disabled (allowed). Graph fusion will be used if the session is not configured to save an optimized model. +// "1": disabled (disallowed). Graph fusion will never be used. 
+// The default value is "0" +static const char* const kOrtSessionOptionsConfigDisableDmlGraphFusion = "ep.dml.disable_graph_fusion"; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e07e6a2d6ba0f..2c5d9f091986d 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -51,6 +51,7 @@ #ifdef USE_DML // TODO: This is necessary for the workaround in TransformGraph #include "core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h" #include "core/providers/dml/DmlExecutionProvider/src/GraphTransformer.h" +#include "core/providers/dml/dml_session_options_config_keys.h" #endif #include "core/session/environment.h" #include "core/session/IOBinding.h" @@ -1023,7 +1024,7 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort "The ORT format model version [", fbs_ort_model_version->string_view(), "] is not supported in this build ", ORT_VERSION, ". ", kOrtFormatVersion5BreakingChangeNote); -#else // ^^ defined(ORT_MINIMAL_BUILD) ^^ / vv !defined(ORT_MINIMAL_BUILD) vv +#else // ^^ defined(ORT_MINIMAL_BUILD) ^^ / vv !defined(ORT_MINIMAL_BUILD) vv const auto has_saved_runtime_optimizations = [](const fbs::InferenceSession& fbs_session) -> bool { if (const auto* fbs_model = fbs_session.model()) { if (const auto* fbs_graph = fbs_model->graph()) { @@ -1384,8 +1385,13 @@ common::Status InferenceSession::Initialize() { #ifdef USE_DML if (execution_providers_.Get(kDmlExecutionProvider)) { + // DML graph fusion is an important runtime optimization that cannot be done ahead of time; it must be disabled + // when running in "offline mode" and saving an optimized model to disk. To support users that want to optimize + // models offline, and then disable graph optimizations when running "online", this transformer ignores the ORT + // graph optimization level and is generally always applied. bool dml_graph_fusion_enabled = session_options_.optimized_model_filepath.empty() && - session_options_.graph_optimization_level >= TransformerLevel::Level3; + session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigDisableDmlGraphFusion, "0") == "0"; + if (dml_graph_fusion_enabled) { std::unique_ptr dmlGraphFusionTransformer = std::make_unique("DmlGraphFusionTransformer", execution_providers_.Get(kDmlExecutionProvider)); From 2de15c5d503cf6e4d679a2b8fca7135adc9f5ae8 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Fri, 24 Mar 2023 13:52:17 -0700 Subject: [PATCH 18/20] Re-work OrtApi struct to satisfy C++20 compilers (#15183) ### Description Remove `deletion` of copy functions from `OrtApi` as its initialization no longer compiles in C++20. Introduce a non-copyable member to implicitly delete copy ctor. ### Motivation and Context Inspired by https://github.com/microsoft/onnxruntime/pull/14901 Solution credits: @RyanUnderhill Cc: @georgthegreat --- include/onnxruntime/core/session/onnxruntime_c_api.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 2d94b8481ef32..6832cc93af26a 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -4086,10 +4086,6 @@ struct OrtApi { * \since Version 1.15. 
*/ ORT_API2_STATUS(KernelInfoGetConstantInput_tensor, _In_ const OrtKernelInfo* info, size_t index, _Out_ int* is_constant, _Outptr_ const OrtValue** out); - -#ifdef __cplusplus - OrtApi(const OrtApi&) = delete; // Prevent users from accidentally copying the API structure, it should always be passed as a pointer -#endif }; /*

From d834ec895af8c2dd6e718c21b575697c01b8ec4f Mon Sep 17 00:00:00 2001
From: Justin Chu
Date: Fri, 24 Mar 2023 15:29:03 -0700
Subject: [PATCH 19/20] Adopt lintrunner as the linting tool - take 2 (#15085)

### Description

`lintrunner` is a linter runner successfully used by pytorch, onnx and onnx-script. It provides a uniform experience running linters locally and in CI. It supports all major dev systems: Windows, Linux and macOS. The checks are enforced by the `Python format` workflow.

This PR adopts `lintrunner` in onnxruntime and fixes ~2000 flake8 errors in Python code. `lintrunner` now runs all required Python lints, including `ruff` (replacing `flake8`), `black` and `isort`. Future lints like `clang-format` can be added. Most errors are auto-fixed by `ruff` and the fixes should be considered robust. Lints that are more complicated to fix are suppressed with `# noqa` for now and should be fixed in follow-up PRs.

### Notable changes

1. This PR **removed some suboptimal patterns**:
   - `not xxx in` -> `xxx not in` membership checks
   - bare excepts (`except:` -> `except Exception`)
   - unused imports

   The follow-up PR will remove:
   - `import *`
   - mutable values as defaults in function definitions (`def func(a=[])`)
   - more unused imports
   - unused local variables
2. Use `ruff` to replace `flake8`. `ruff` is much (40x) faster than flake8 and is more robust. We are using it successfully in onnx and onnx-script. It also supports auto-fixing many flake8 errors.
3. Removed the legacy flake8 CI flow and updated docs.
4. The added workflow supports SARIF code scanning reports on GitHub, example snapshot: ![image](https://user-images.githubusercontent.com/11205048/212598953-d60ce8a9-f242-4fa8-8674-8696b704604a.png)
5. Removed `onnxruntime-python-checks-ci-pipeline` as redundant.

### Motivation and Context

Unified linting experience in CI and local.
Replacing https://github.com/microsoft/onnxruntime/pull/14306 --------- Signed-off-by: Justin Chu --- .flake8 | 27 - .github/workflows/generate-skip-doc-change.py | 2 +- .github/workflows/lint.yml | 90 ++- .lintrunner.toml | 154 +++++ cgmanifests/generate_cgmanifest.py | 10 +- cgmanifests/print_submodule_info.py | 8 +- docs/Coding_Conventions_and_Standards.md | 49 +- docs/python/inference/conf.py | 6 +- .../python/inference/examples/plot_backend.py | 8 +- .../inference/examples/plot_common_errors.py | 20 +- .../examples/plot_load_and_predict.py | 2 +- .../inference/examples/plot_metadata.py | 32 +- .../inference/examples/plot_pipeline.py | 10 +- .../inference/examples/plot_profiling.py | 6 +- .../examples/plot_train_convert_predict.py | 30 +- onnxruntime/__init__.py | 66 ++- .../ort_flatbuffers_py/fbs/__init__.py | 5 +- .../core/flatbuffers/schema/compile_schema.py | 73 ++- onnxruntime/python/backend/__init__.py | 2 +- onnxruntime/python/backend/backend.py | 14 +- onnxruntime/python/backend/backend_rep.py | 4 +- onnxruntime/python/datasets/__init__.py | 2 +- .../python/onnxruntime_collect_build_info.py | 4 +- .../onnxruntime_inference_collection.py | 23 +- onnxruntime/python/onnxruntime_validation.py | 14 +- onnxruntime/python/providers/tvm/__init__.py | 2 +- onnxruntime/python/providers/tvm/ort.py | 12 +- .../kernels/_kernel_explorer.pyi | 4 +- .../kernels/batched_gemm_test.py | 6 +- .../kernels/gemm_fast_gelu_test.py | 4 +- .../kernels/gemm_softmax_gemm_permute_test.py | 36 +- .../kernel_explorer/kernels/gemm_test.py | 10 +- .../kernels/kernel_explorer.py | 8 +- .../kernels/skip_layer_norm_test.py | 2 +- .../kernels/strided_batched_gemm_test.py | 10 +- .../tools/kernel_explorer/kernels/utils.py | 2 +- .../python/tools/microbench/benchmark.py | 6 +- onnxruntime/python/tools/offline_tuning.py | 12 +- onnxruntime/python/tools/onnxruntime_test.py | 22 +- .../profile_explorer/profile_explorer.py | 13 +- .../tools/pytorch_export_contrib_ops.py | 2 +- .../CalTableFlatBuffers/KeyValue.py | 28 +- .../CalTableFlatBuffers/TrtTable.py | 30 +- .../python/tools/quantization/__init__.py | 26 +- .../python/tools/quantization/calibrate.py | 47 +- .../python/tools/quantization/onnx_model.py | 30 +- .../tools/quantization/onnx_quantizer.py | 41 +- .../quantization/operators/activation.py | 3 +- .../tools/quantization/operators/attention.py | 4 +- .../tools/quantization/operators/binary_op.py | 2 +- .../tools/quantization/operators/concat.py | 10 +- .../tools/quantization/operators/conv.py | 4 +- .../quantization/operators/embed_layernorm.py | 2 +- .../tools/quantization/operators/gemm.py | 22 +- .../tools/quantization/operators/lstm.py | 38 +- .../tools/quantization/operators/matmul.py | 2 +- .../tools/quantization/operators/pad.py | 1 - .../operators/qdq_base_operator.py | 8 +- .../tools/quantization/qdq_loss_debug.py | 2 +- .../tools/quantization/qdq_quantizer.py | 16 +- .../python/tools/quantization/quant_utils.py | 30 +- .../python/tools/quantization/registry.py | 10 +- .../tools/quantization/shape_inference.py | 2 +- .../python/tools/symbolic_shape_infer.py | 236 ++++---- .../python/tools/tensorrt/perf/benchmark.py | 93 ++-- .../tools/tensorrt/perf/benchmark_wrapper.py | 194 +++---- .../tools/tensorrt/perf/build/build_image.py | 6 +- .../tensorrt/perf/build/ort_build_latest.py | 42 +- .../comparison_scripts/compare_latency.py | 2 +- .../python/tools/tensorrt/perf/perf_utils.py | 31 +- .../python/tools/tensorrt/perf/post.py | 2 +- .../perf/setup_scripts/setup_onnx_zoo.py | 2 +- 
.../python/tools/transformers/__init__.py | 4 +- .../python/tools/transformers/benchmark.py | 51 +- .../tools/transformers/benchmark_helper.py | 32 +- .../tools/transformers/bert_perf_test.py | 14 +- .../tools/transformers/bert_test_data.py | 12 +- .../transformers/compare_bert_results.py | 13 +- .../tools/transformers/convert_generation.py | 23 +- .../convert_tf_models_to_pytorch.py | 6 +- .../python/tools/transformers/float16.py | 12 +- .../tools/transformers/fusion_attention.py | 4 +- .../tools/transformers/fusion_bias_add.py | 2 +- .../transformers/fusion_biassplitgelu.py | 3 +- .../tools/transformers/fusion_embedlayer.py | 15 +- .../transformers/fusion_gelu_approximation.py | 2 +- .../transformers/fusion_gpt_attention.py | 8 +- .../fusion_gpt_attention_megatron.py | 6 +- .../fusion_gpt_attention_no_past.py | 10 +- .../tools/transformers/fusion_layernorm.py | 4 +- .../transformers/fusion_qordered_attention.py | 2 +- .../transformers/fusion_qordered_gelu.py | 2 +- .../transformers/fusion_qordered_layernorm.py | 2 +- .../transformers/fusion_qordered_matmul.py | 2 +- .../python/tools/transformers/fusion_shape.py | 2 +- .../transformers/fusion_skiplayernorm.py | 8 +- .../tools/transformers/fusion_transpose.py | 2 +- .../python/tools/transformers/fusion_utils.py | 10 +- .../tools/transformers/io_binding_helper.py | 6 +- .../python/tools/transformers/machine_info.py | 6 +- .../tools/transformers/models/bart/export.py | 12 +- .../utils/chain_enc_dec_with_beamsearch.py | 2 +- .../bart/utils/export_summarization_edinit.py | 4 +- .../export_summarization_enc_dec_past.py | 7 +- .../models/bart/utils/onnx_inference.py | 1 - .../transformers/models/bert/eval_squad.py | 6 +- .../models/gpt2/benchmark_gpt2.py | 8 +- .../models/gpt2/convert_to_onnx.py | 10 +- .../transformers/models/gpt2/gpt2_helper.py | 27 +- .../transformers/models/gpt2/gpt2_parity.py | 21 +- .../transformers/models/gpt2/gpt2_tester.py | 26 +- .../models/gpt2/parity_check_helper.py | 4 +- .../models/longformer/benchmark_longformer.py | 14 +- .../models/longformer/convert_to_onnx.py | 9 +- .../models/longformer/generate_test_data.py | 11 +- .../stable_diffusion/optimize_pipeline.py | 10 +- .../transformers/models/t5/past_helper.py | 2 +- .../transformers/models/t5/t5_decoder.py | 4 +- .../models/t5/t5_encoder_decoder_init.py | 4 +- .../tools/transformers/models/t5/t5_helper.py | 10 +- .../tools/transformers/onnx_exporter.py | 27 +- .../python/tools/transformers/onnx_model.py | 10 +- .../tools/transformers/onnx_model_bart.py | 3 +- .../tools/transformers/onnx_model_bert.py | 5 +- .../transformers/onnx_model_bert_keras.py | 51 +- .../tools/transformers/onnx_model_bert_tf.py | 18 +- .../tools/transformers/onnx_model_gpt2.py | 3 +- .../tools/transformers/onnx_model_t5.py | 3 +- .../tools/transformers/onnx_model_tnlr.py | 4 +- .../python/tools/transformers/optimizer.py | 10 +- .../python/tools/transformers/profiler.py | 4 +- .../tools/transformers/quantize_helper.py | 2 +- .../tools/transformers/shape_infer_helper.py | 2 +- .../tools/transformers/shape_optimizer.py | 13 +- .../transformers/torch_onnx_export_helper.py | 2 +- .../ort_torch_ext/__init__.py | 2 +- .../contrib_ops/attention_lstm_data_gen.py | 14 +- .../multihead_attention_op_test_data_gen.py | 4 +- onnxruntime/test/onnx/gen_test_models.py | 14 +- .../reduction_test_cases_generator.py | 10 +- onnxruntime/test/providers/cpu/rnn/GRU.py | 137 +++-- onnxruntime/test/providers/cpu/rnn/LSTM.py | 214 ++++--- .../test/python/contrib_ops/aten_op_tests.py | 6 +- 
.../contrib_ops/onnx_contrib_ops_helper.py | 12 +- onnxruntime/test/python/helper.py | 2 +- .../python/onnxruntime_test_collective.py | 16 +- .../python/onnxruntime_test_ort_trainer.py | 93 ++-- ...e_test_ort_trainer_with_mixed_precision.py | 6 +- .../test/python/onnxruntime_test_python.py | 163 +++--- .../python/onnxruntime_test_python_azure.py | 13 +- .../python/onnxruntime_test_python_backend.py | 8 +- .../onnxruntime_test_python_backend_mlops.py | 8 +- .../onnxruntime_test_python_cudagraph.py | 18 +- .../onnxruntime_test_python_iobinding.py | 8 +- .../python/onnxruntime_test_python_keras.py | 9 +- .../python/onnxruntime_test_python_mlops.py | 10 +- .../onnxruntime_test_python_sparse_matmul.py | 18 +- ...untime_test_python_symbolic_shape_infer.py | 12 +- .../onnxruntime_test_training_unit_tests.py | 4 +- .../test/python/quantization/op_test_utils.py | 8 +- .../python/quantization/test_calibration.py | 3 +- .../python/quantization/test_conv_dynamic.py | 15 +- .../python/quantization/test_onnx_model.py | 7 +- .../python/quantization/test_op_argmax.py | 11 +- .../python/quantization/test_op_attention.py | 3 +- .../python/quantization/test_op_concat.py | 6 +- .../quantization/test_op_embed_layernorm.py | 3 +- .../python/quantization/test_op_gavgpool.py | 9 +- .../test/python/quantization/test_op_gemm.py | 15 +- .../test_op_instance_normalization.py | 1 - .../python/quantization/test_op_maxpool.py | 9 +- .../test/python/quantization/test_op_pad.py | 11 +- .../python/quantization/test_op_pooling.py | 18 +- .../test/python/quantization/test_op_relu.py | 11 +- .../python/quantization/test_op_reshape.py | 9 +- .../python/quantization/test_op_resize.py | 9 +- .../python/quantization/test_op_softmax.py | 10 +- .../test/python/quantization/test_op_split.py | 6 +- .../quantization/test_op_squeeze_unsqueeze.py | 11 +- .../python/quantization/test_op_transpose.py | 9 +- .../test/python/quantization/test_op_where.py | 7 +- .../test/python/quantization/test_qdq.py | 23 +- .../quantization/test_qdq_loss_debug.py | 5 +- .../python/quantization/test_quant_util.py | 1 - .../quantization/test_quantize_static.py | 3 +- .../quantization/test_symmetric_flag.py | 10 +- .../python/test_pytorch_export_contrib_ops.py | 12 +- .../transformers/gpt2_model_generator.py | 5 +- .../test/python/transformers/model_loader.py | 2 +- .../python/transformers/parity_utilities.py | 9 +- .../transformers/test_attention_fusion.py | 4 +- .../generate_tiny_keras2onnx_bert_models.py | 43 +- .../generate_tiny_gpt2_model.py | 40 +- .../python/transformers/test_gelu_fusions.py | 2 +- .../transformers/test_gemmfastgelu_fusion.py | 4 +- .../python/transformers/test_generation.py | 10 +- .../transformers/test_gpt2_benchmark.py | 1 - .../python/transformers/test_optimizer.py | 7 +- .../test_parity_decoder_attention.py | 28 +- .../python/transformers/test_parity_gelu.py | 10 +- .../test_parity_huggingface_gpt_attention.py | 8 +- .../transformers/test_parity_layernorm.py | 14 +- .../test_parity_neox_attention.py | 1 - .../python/transformers/test_parity_t5_mha.py | 8 +- .../test/python/transformers/test_profiler.py | 1 - onnxruntime/test/testdata/CNTK/gen.py | 46 +- .../test/testdata/capi_symbolic_dims.py | 2 +- .../test/testdata/coreml_argmax_cast_test.py | 2 +- .../testdata/dynamic_quantize_matmul_test.py | 4 +- .../testdata/ep_dynamic_graph_input_test.py | 2 +- .../test/testdata/ep_partitioning_tests.py | 2 +- .../test/testdata/matmul_integer_to_float.py | 4 +- ...ith_external_initializer_come_from_user.py | 16 +- 
.../model_with_external_initializers.py | 18 +- .../test/testdata/model_with_metadata.py | 2 +- .../testdata/nnapi_internal_uint8_support.py | 2 +- .../testdata/nnapi_reshape_flatten_test.py | 2 +- .../testdata/nnapi_sigmoid_input_rank_test.py | 2 +- .../testdata/sparse_initializer_as_output.py | 24 +- .../test/testdata/sparse_to_dense_matmul.py | 30 +- .../lr_scheduler_test_data_generator.py | 5 +- .../sgd_test/sgd_test_data_generator.py | 4 +- .../test_kernel_info_get_const_input.py | 2 +- .../testdata/transform/cast_elimination.py | 3 +- .../transform/computation_reduction.py | 14 +- .../computation_reduction/gathernd/e2e.py | 12 +- .../gathernd/gathernd_add.py | 6 +- .../gathernd/gathernd_div.py | 6 +- .../gathernd/gathernd_gelu.py | 3 +- .../gathernd/gathernd_layernormalization.py | 6 +- .../gathernd/gathernd_matmul.py | 2 +- .../testdata/transform/concat_graph_gen.py | 2 +- .../transform/concat_slice_elimination.py | 4 +- .../test/testdata/transform/cse/generate.py | 2 +- .../testdata/transform/expand_elimination.py | 2 +- .../transform/fusion/attention_gen.py | 6 +- ...stant_folding_with_shape_to_initializer.py | 2 +- .../test/testdata/transform/fusion/div_mul.py | 4 +- .../fusion/dynamic_quantize_matmul.py | 4 +- .../transform/fusion/embed_layer_norm_gen.py | 24 +- .../testdata/transform/fusion/fast_gelu.py | 4 +- .../testdata/transform/fusion/fast_gelu2.py | 4 +- .../transform/fusion/fast_gelu3_with_casts.py | 2 +- .../testdata/transform/fusion/gelu_gen.py | 4 +- .../transform/fusion/gemm_transpose_gen.py | 1 + .../transform/fusion/isinf_reducesum.py | 4 +- .../transform/fusion/layer_norm_t5_gen.py | 4 +- .../fusion/layer_norm_with_cast_2.py | 6 +- .../fusion/layer_norm_with_cast_3.py | 4 +- .../fusion/matmul_integer_to_float.py | 8 +- .../transform/fusion/matmul_scale_gen.py | 3 +- .../testdata/transform/fusion/not_where.py | 4 +- .../transform/fusion/skip_layer_norm_gen.py | 2 +- .../test/testdata/transform/id-elim.py | 4 +- .../test/testdata/transform/id-scan9_sum.py | 4 +- .../bart_mlp_megatron_basic_test.py | 2 +- ...bart_self_attention_megatron_basic_test.py | 4 +- .../model_parallel/mlp_megatron_basic_test.py | 2 +- .../self_attention_megatron_basic_test.py | 2 +- .../test/testdata/transform/noop-add.py | 2 +- .../propagate_cast/gen_propagate_cast.py | 27 +- .../test/testdata/transform/qdq_conv_gen.py | 4 +- .../orttraining/eager/opgen/onnxgen.py | 17 +- orttraining/orttraining/eager/opgen/opgen.py | 2 +- .../orttraining/eager/opgen/opgen/ast.py | 5 +- .../orttraining/eager/opgen/opgen/atenops.py | 86 +-- .../eager/opgen/opgen/custom_ops.py | 2 +- .../eager/opgen/opgen/generator.py | 27 +- .../orttraining/eager/opgen/opgen/lexer.py | 23 +- .../orttraining/eager/opgen/opgen/onnxops.py | 4 +- .../orttraining/eager/opgen/opgen/parser.py | 272 ++++----- .../orttraining/eager/opgen/opgen/writer.py | 2 +- .../eager/opgen/opgen_test/lexer_test.py | 4 +- .../orttraining/eager/test/__main__.py | 2 +- .../test/linux_only_ortmodule_eager_test.py | 10 +- .../orttraining/eager/test/ort_eps_test.py | 27 +- .../orttraining/eager/test/ort_init.py | 3 +- orttraining/orttraining/eager/test/ort_ops.py | 18 +- .../orttraining/eager/test/ort_tensor.py | 7 +- .../test_model_OrtModule/mnist_fc_training.py | 23 +- .../orttraining/eager/test_models/mnist_fc.py | 10 +- .../eager/test_models/mnist_fc_training.py | 3 +- .../eager/test_models/scratchpad.py | 2 +- .../eager/test_models/training_test.py | 4 +- .../orttraining/python/checkpointing_utils.py | 8 +- 
.../orttraining/python/deprecated/__init__.py | 4 +- .../python/deprecated/training_session.py | 10 +- orttraining/orttraining/python/ort_trainer.py | 58 +- orttraining/orttraining/python/pt_patch.py | 6 +- .../orttraining/python/training/__init__.py | 15 +- .../python/training/_checkpoint_storage.py | 7 +- .../orttraining/python/training/_utils.py | 9 +- .../python/training/amp/__init__.py | 2 +- .../python/training/amp/loss_scaler.py | 6 +- .../python/training/api/__init__.py | 8 +- .../python/training/api/lr_scheduler.py | 1 - .../orttraining/python/training/api/module.py | 4 +- .../orttraining/python/training/checkpoint.py | 52 +- .../python/training/experimental/__init__.py | 2 +- .../python/training/experimental/exporter.py | 6 +- .../gradient_graph/_gradient_graph_tools.py | 7 +- .../python/training/model_desc_validation.py | 7 +- .../python/training/onnxblock/__init__.py | 10 +- .../python/training/onnxblock/_graph_utils.py | 2 +- .../training/onnxblock/building_blocks.py | 2 +- .../training/onnxblock/loss/__init__.py | 2 +- .../training/onnxblock/optim/__init__.py | 2 +- .../python/training/optim/__init__.py | 19 +- .../training/optim/_apex_amp_modifier.py | 4 +- .../python/training/optim/_ds_modifier.py | 2 +- .../python/training/optim/_modifier.py | 8 +- .../training/optim/_modifier_registry.py | 2 +- .../training/optim/_multi_tensor_apply.py | 2 +- .../python/training/optim/config.py | 10 +- .../python/training/optim/fp16_optimizer.py | 2 +- .../python/training/optim/fused_adam.py | 9 +- .../python/training/optim/lr_scheduler.py | 5 +- .../python/training/ortmodule/__init__.py | 8 +- .../ortmodule/_custom_autograd_function.py | 12 +- .../_custom_autograd_function_exporter.py | 8 +- .../_custom_autograd_function_runner.py | 14 +- .../ortmodule/_custom_gradient_registry.py | 10 +- .../ortmodule/_custom_op_symbolic_registry.py | 17 +- .../training/ortmodule/_execution_agent.py | 4 +- .../python/training/ortmodule/_fallback.py | 19 +- .../ortmodule/_fallback_exceptions.py | 2 +- .../_gradient_accumulation_manager.py | 5 +- .../ortmodule/_graph_execution_interface.py | 2 +- .../ortmodule/_graph_execution_manager.py | 12 +- .../_graph_execution_manager_factory.py | 6 +- .../python/training/ortmodule/_io.py | 14 +- .../python/training/ortmodule/_logger.py | 7 +- .../training/ortmodule/_runtime_inspector.py | 3 +- .../ortmodule/_torch_module_factory.py | 2 +- .../ortmodule/_torch_module_interface.py | 6 +- .../training/ortmodule/_torch_module_ort.py | 15 +- .../ortmodule/_torch_module_pytorch.py | 8 +- .../training/ortmodule/_training_manager.py | 4 +- .../python/training/ortmodule/_utils.py | 10 +- .../hierarchical_ortmodule/__init__.py | 2 +- .../_hierarchical_ortmodule.py | 8 +- .../experimental/json_config/__init__.py | 2 +- .../json_config/_load_config_from_json.py | 9 +- .../python/training/ortmodule/ortmodule.py | 26 +- .../cpu/torch_interop_utils/__init__.py | 2 +- .../cpu/torch_interop_utils/setup.py | 2 +- .../cuda/fused_ops/setup.py | 6 +- .../cuda/torch_gpu_allocator/setup.py | 2 +- .../orttraining/python/training/orttrainer.py | 56 +- .../python/training/orttrainer_options.py | 13 +- .../python/training/postprocess.py | 23 +- .../training/torchdynamo/ort_backend.py | 19 +- .../training/torchdynamo/register_backend.py | 2 +- .../python/training/utils/data/__init__.py | 2 +- .../python/training/utils/data/sampler.py | 11 +- .../python/training/utils/hooks/__init__.py | 7 +- .../utils/hooks/_statistics_subscriber.py | 7 +- .../training/utils/hooks/_subscriber_base.py | 2 +- 
.../utils/hooks/merge_activation_summary.py | 3 +- .../test/external_custom_ops/setup.py | 18 +- .../test/external_custom_ops/test.py | 11 +- .../test/external_transformers_test.py | 24 +- .../python/_orttraining_ortmodule_models.py | 6 +- .../orttraining/test/python/_test_commons.py | 8 +- .../orttraining/test/python/_test_helpers.py | 34 +- .../orttraining/test/python/launch_test.py | 7 +- .../python/onnxruntime_test_postprocess.py | 30 +- .../python/onnxruntime_test_register_ep.py | 5 +- ...orttraining_ortmodule_distributed_tests.py | 5 +- .../python/orttraining_ortmodule_tests.py | 4 +- .../python/orttraining_run_bert_pretrain.py | 74 +-- ...rttraining_run_frontend_batch_size_test.py | 2 +- .../test/python/orttraining_run_glue.py | 28 +- .../python/orttraining_run_multiple_choice.py | 29 +- .../orttraining_test_bert_postprocess.py | 4 +- .../orttraining_test_checkpoint_storage.py | 25 +- .../python/orttraining_test_data_loader.py | 12 +- .../python/orttraining_test_debuggability.py | 29 +- .../test/python/orttraining_test_dort.py | 2 +- ...aining_test_experimental_gradient_graph.py | 5 +- ...orttraining_test_hierarchical_ortmodule.py | 26 +- .../test/python/orttraining_test_hooks.py | 3 +- .../orttraining_test_layer_norm_transform.py | 38 +- .../test/python/orttraining_test_lort.py | 4 +- .../orttraining_test_model_transform.py | 2 +- .../orttraining_test_onnx_ops_ortmodule.py | 10 +- .../test/python/orttraining_test_onnxblock.py | 18 +- .../python/orttraining_test_ortmodule_api.py | 525 +++++++++--------- .../orttraining_test_ortmodule_autograd.py | 118 ++-- ...rttraining_test_ortmodule_autograd_dist.py | 17 +- ...training_test_ortmodule_bert_classifier.py | 63 ++- ...test_ortmodule_bert_classifier_autocast.py | 58 +- ...t_ortmodule_deepspeed_pipeline_parallel.py | 14 +- ...g_test_ortmodule_deepspeed_zero_stage_1.py | 28 +- ...test_ortmodule_experimental_json_config.py | 34 +- ...t_ortmodule_fairscale_sharded_optimizer.py | 42 +- .../orttraining_test_ortmodule_fallback.py | 31 +- .../python/orttraining_test_ortmodule_poc.py | 25 +- .../orttraining_test_ortmodule_pytorch_ddp.py | 19 +- ...ng_test_ortmodule_torch_lightning_basic.py | 6 +- ...ttraining_test_orttrainer_bert_toy_onnx.py | 52 +- ...ng_test_orttrainer_checkpoint_functions.py | 18 +- .../orttraining_test_orttrainer_frontend.py | 51 +- .../test/python/orttraining_test_ortvalue.py | 26 +- .../orttraining_test_python_bindings.py | 2 +- .../test/python/orttraining_test_sampler.py | 6 +- .../python/orttraining_test_transformers.py | 28 +- .../test/python/orttraining_test_utils.py | 17 +- .../python/orttraining_transformer_trainer.py | 31 +- .../perf_log/ort_module_perf_test_tools.py | 11 +- .../test/python/qat_poc_example/model.py | 7 +- .../test/python/qat_poc_example/qat.py | 1 - .../test/python/qat_poc_example/quantize.py | 3 +- .../test/python/qat_poc_example/train.py | 3 +- .../test/python/utils_multiple_choice.py | 23 +- .../mnist_training.py | 16 +- orttraining/tools/amdgpu/script/rocprof.py | 5 +- .../tools/ci_test/compare_huggingface.py | 4 +- orttraining/tools/ci_test/compare_results.py | 7 +- .../ci_test/download_azure_blob_archive.py | 10 +- .../tools/ci_test/run_batch_size_test.py | 2 +- .../tools/ci_test/run_bert_perf_test.py | 6 +- .../tools/ci_test/run_convergence_test.py | 6 +- .../tools/ci_test/run_gpt2_perf_test.py | 2 +- orttraining/tools/scripts/experiment.py | 19 +- .../tools/scripts/gpt2_model_transform.py | 11 +- .../tools/scripts/layer_norm_transform.py | 47 +- orttraining/tools/scripts/model_transform.py 
| 13 +- .../tools/scripts/nv_run_pretraining.py | 89 ++- .../tools/scripts/opset12_model_transform.py | 8 +- .../scripts/performance_investigation.py | 1 + .../tools/scripts/pipeline_model_split.py | 27 +- orttraining/tools/scripts/sqldb_to_tensors.py | 3 +- orttraining/tools/scripts/watch_experiment.py | 13 +- pyproject.toml | 42 +- .../training/orttrainer/mnist/ort_mnist.py | 2 +- .../orttrainer/mnist/pytorch_mnist.py | 2 +- .../pytorch_transformer/ort_train.py | 4 +- .../pytorch_transformer/pt_model.py | 4 +- .../pytorch_transformer/pt_train.py | 4 +- .../orttrainer/pytorch_transformer/utils.py | 9 +- setup.py | 40 +- .../build_custom_android_package.py | 50 +- tools/ci_build/amd_hipify.py | 4 +- tools/ci_build/build.py | 126 +++-- tools/ci_build/clean_docker_image_cache.py | 6 +- tools/ci_build/gen_def.py | 8 +- tools/ci_build/get_docker_image.py | 8 +- .../github/android/build_aar_package.py | 10 +- .../apple/build_and_assemble_ios_pods.py | 7 +- .../github/apple/build_ios_framework.py | 7 +- .../objectivec/assemble_objc_pod_package.py | 2 +- .../github/apple/package_assembly_utils.py | 6 +- .../github/apple/test_ios_packages.py | 6 +- .../python-checks-ci-pipeline.yml | 19 - .../build_ort_and_check_binary_size.py | 5 +- .../ort_minimal/check_build_binary_size.py | 11 +- .../github/linux/ort_minimal/readelf_utils.py | 10 +- tools/ci_build/github/python_checks/readme.md | 18 - .../github/python_checks/requirements.txt | 1 - .../windows/post_binary_sizes_to_dashboard.py | 2 +- .../post_code_coverage_to_dashboard.py | 2 +- tools/ci_build/op_registration_utils.py | 23 +- tools/ci_build/op_registration_validator.py | 7 +- tools/ci_build/reduce_op_kernels.py | 23 +- tools/ci_build/replace_urls_in_deps.py | 6 +- tools/ci_build/update_tsaoptions.py | 2 +- .../upload_python_package_to_azure_storage.py | 10 +- tools/doc/rename_folders.py | 8 +- .../nuget/generate_nuspec_for_native_nuget.py | 2 +- tools/nuget/validate_package.py | 4 +- tools/python/PythonTools.md | 2 +- tools/python/create_reduced_build_config.py | 1 - tools/python/dump_ort_model.py | 4 +- tools/python/example_operator_perf_test.py | 6 +- ...ptimizer_opset_version_updates_required.py | 7 +- tools/python/gen_contrib_doc.py | 85 ++- tools/python/gen_opkernel_doc.py | 15 +- tools/python/gen_ort_mobile_pkg_doc.py | 6 +- tools/python/onnx_test_data_utils.py | 4 +- tools/python/ort_test_dir_utils.py | 20 +- tools/python/run_CIs_for_external_pr.py | 2 +- tools/python/run_android_emulator.py | 4 +- tools/python/sparsify_initializers.py | 6 +- tools/python/update_version.py | 2 +- tools/python/util/__init__.py | 14 +- tools/python/util/__init__append.py | 2 +- tools/python/util/android/__init__.py | 8 +- tools/python/util/android/android.py | 16 +- .../util/check_onnx_model_mobile_usability.py | 2 +- .../python/util/convert_onnx_models_to_ort.py | 16 +- tools/python/util/get_azcopy.py | 6 +- .../util/mobile_helpers/usability_checker.py | 4 +- tools/python/util/onnx_model_utils.py | 4 +- .../python/util/ort_format_model/__init__.py | 12 +- .../operator_type_usage_processors.py | 25 +- .../ort_format_model/ort_model_processor.py | 10 +- tools/python/util/ort_format_model/types.py | 4 +- tools/python/util/ort_format_model/utils.py | 2 +- tools/python/util/platform_helpers.py | 2 +- tools/python/util/pytorch_export_helpers.py | 4 +- .../util/qdq_helpers/qdq_model_utils.py | 2 +- .../qdq_helpers/test/test_qdq_model_utils.py | 2 +- .../util/reduced_build_config_parser.py | 14 +- tools/python/util/run.py | 4 +- 
.../util/test/test_pytorch_export_helpers.py | 2 +- 506 files changed, 3974 insertions(+), 4099 deletions(-) delete mode 100644 .flake8 create mode 100644 .lintrunner.toml delete mode 100644 tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml delete mode 100644 tools/ci_build/github/python_checks/readme.md delete mode 100644 tools/ci_build/github/python_checks/requirements.txt diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 299ca9aa354c2..0000000000000 --- a/.flake8 +++ /dev/null @@ -1,27 +0,0 @@ -[flake8] -max-line-length = 120 -per-file-ignores = - __init__.py:F401 -format = [flake8 PEP8 ERROR] %(path)s:%(row)d:%(col)d: %(code)s %(text)s -exclude = - # ignore the .git directory - ./.git, - # ignore default build directory - ./build, - # ignore external dependency files - ./cmake/external, - # TODO enable - ./docs/python, - # ignore generated flatbuffers code - ./onnxruntime/core/flatbuffers/ort_flatbuffers_py, - # TODO enable - ./onnxruntime/python/tools, - # ignore test code for now - ./onnxruntime/test, - # TODO enable - ./orttraining, - # ignore server code for now - ./server, - # ignore issues from different git branches - ./.git, -ignore = W503, E203 diff --git a/.github/workflows/generate-skip-doc-change.py b/.github/workflows/generate-skip-doc-change.py index 045bbbb43ebe2..a6223558a96ea 100644 --- a/.github/workflows/generate-skip-doc-change.py +++ b/.github/workflows/generate-skip-doc-change.py @@ -7,7 +7,7 @@ GITHUB_DIR = Path(__file__).resolve().parent.parent -class Skipped_Workflow: +class Skipped_Workflow: # noqa: N801 def __init__(self, workflow_name: str, job_names: list, output_file_name: str): self.workflow_name = workflow_name self.job_names = job_names diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 9944cd29ca152..d1f9efc47d249 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -8,27 +8,11 @@ on: pull_request: jobs: - lint-python: - name: Lint Python + optional-lint: + name: Optional Lint runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: flake8 - uses: reviewdog/action-flake8@v3 - with: - github_token: ${{ secrets.github_token }} - # Change reviewdog reporter if you need [github-pr-check, github-check, github-pr-review]. - reporter: github-pr-check - # Change reporter level if you need. - # GitHub Status Check won't become failure with a warning. - level: error - filter_mode: file - - name: pyflakes - uses: reviewdog/action-pyflakes@v1 - with: - github_token: ${{ secrets.github_token }} - reporter: github-pr-check - level: warning + - uses: actions/checkout@v3 - name: misspell # Check spellings as well uses: reviewdog/action-misspell@v1 with: @@ -44,43 +28,44 @@ jobs: reporter: github-pr-check level: info filter_mode: file - - name: pyright - uses: jordemort/action-pyright@v1 - with: - github_token: ${{ secrets.github_token }} - reporter: github-pr-check - level: warning - filter_mode: added - lib: true - pyright_version: 1.1.291 - - name: pylint - uses: dciborow/action-pylint@0.0.7 - with: - github_token: ${{ secrets.github_token }} - reporter: github-pr-check - level: warning - filter_mode: diff_context - glob_pattern: "**/*.py" lint-python-format: - # Separated black/isort from other Python linters because we want this job to - # fail and not affect other linters - # According to https://black.readthedocs.io/en/stable/integrations/github_actions.html: - # We recommend the use of the @stable tag, but per version tags also exist if you prefer that. 
- # Note that the action’s version you select is independent of the version of Black the action will use. - # The version of Black the action will use can be configured via version. + # Required workflow name: Python format runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 with: + # Version range or exact version of Python to use, using SemVer's version range syntax. Reads from .python-version if unset. python-version: "3.10" - - uses: psf/black@stable + - name: Install dependencies + run: | + python -m pip install -r requirements-dev.txt + python -m pip install lintrunner lintrunner-adapters + lintrunner init + - name: Run lintrunner on all files + run: | + set +e + if ! lintrunner --force-color --all-files --tee-json=lint.json -v; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m main\`.\e[0m" + exit 1 + fi + - name: Produce SARIF + if: always() + run: | + python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif + - name: Upload SARIF file + if: always() + continue-on-error: true + uses: github/codeql-action/upload-sarif@v2 with: - options: "--check --diff --color" - version: "22.12.0" - - uses: isort/isort-action@master + # Path to SARIF file relative to the root of the repository + sarif_file: lintrunner.sarif + category: lintrunner + checkout_path: ${{ github.workspace }} lint-cpp: name: Lint C++ @@ -98,13 +83,6 @@ jobs: --cmake_extra_defines CMAKE_EXPORT_COMPILE_COMMANDS=ON - name: Generate ONNX protobuf files run: cmake --build build/Debug --config Debug --target onnx_proto -# - name: Run clang-tidy -# uses: ZedThree/clang-tidy-review@526cbfb043719639f1ebdeedae0cc1eacd219d8f -# with: -# token: ${{ secrets.github_token }} -# build_dir: "build/Debug" -# config_file: ".clang-tidy" -# lgtm_comment_body: "" - uses: reviewdog/action-cpplint@master with: github_token: ${{ secrets.github_token }} @@ -117,7 +95,7 @@ jobs: name: Lint JavaScript runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: reviewdog/action-eslint@v1 with: reporter: github-pr-check diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 0000000000000..ce123c06ca577 --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,154 @@ +# Configuration for lintrunner https://github.com/suo/lintrunner +# You can install the dependencies and initialize with +# +# ```sh +# pip install lintrunner lintrunner-adapters +# lintrunner init +# ``` +# +# This will install lintrunner on your system and download all the necessary +# dependencies to run linters locally. +# If you want to see what lintrunner init will install, run +# `lintrunner init --dry-run`. +# +# To lint local changes: +# +# ```bash +# lintrunner -m main +# ``` +# +# To lint all files: +# +# ```bash +# lintrunner --all-files +# ``` +# +# To format files: +# +# ```bash +# lintrunner f --all-files +# ``` +# +# To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). +# To update an existing linting rule or create a new one, modify this file or create a +# new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. 
+ +[[linter]] +code = 'RUFF' +include_patterns = [ + '**/*.py', + '**/*.pyi', +] +exclude_patterns = [ + 'cmake/external/**', + # ignore generated flatbuffers code + 'onnxruntime/core/flatbuffers/ort_flatbuffers_py/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ruff_linter', + '--config=pyproject.toml', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'ruff==0.0.252', +] + +[[linter]] +code = 'RUFF-FIX' +include_patterns = [ + '**/*.py', + '**/*.pyi', +] +exclude_patterns = [ + 'cmake/external/**', + # ignore generated flatbuffers code + 'onnxruntime/core/flatbuffers/ort_flatbuffers_py/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ruff_fix_linter', + '--config=pyproject.toml', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'ruff==0.0.252', +] +is_formatter = true + + +[[linter]] +code = 'BLACK-ISORT' +include_patterns = [ + '**/*.py', +] +exclude_patterns = [ + 'cmake/**', + 'orttraining/*', + 'onnxruntime/core/flatbuffers/**', +] +command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'black_isort_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'black==23.1.0', + 'isort==5.10.1', +] +is_formatter = true + +[[linter]] +code = 'PYLINT' +include_patterns = [ + # TODO: Opt in to pylint by adding paths here +] +exclude_patterns = [ +] +command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pylint_linter', + '--rcfile=pyproject.toml', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python3', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'pylint==2.15.5', +] diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index bd2c85762115e..7d775996835da 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -50,7 +50,7 @@ def add_github_dep(name, parsed_url): if segments[3] != "archive": print("unrecognized github url path:" + parsed_url.path) return - git_repo_url = "https://github.com/%s/%s.git" % (org_name, repo_name) + git_repo_url = f"https://github.com/{org_name}/{repo_name}.git" # For example, the path might be like '/myorg/myrepo/archive/5a5f8a5935762397aa68429b5493084ff970f774.zip' # The last segment, segments[4], is '5a5f8a5935762397aa68429b5493084ff970f774.zip' if len(segments) == 5 and re.match(r"[0-9a-f]{40}", PurePosixPath(segments[4]).stem): @@ -72,7 +72,7 @@ def add_github_dep(name, parsed_url): print("unrecognized github url path:" + parsed_url.path) return # Make a REST call to convert to tag to a git commit - url = "https://api.github.com/repos/%s/%s/git/refs/tags/%s" % (org_name, repo_name, tag) + url = f"https://api.github.com/repos/{org_name}/{repo_name}/git/refs/tags/{tag}" print("requesting %s ..." 
% url) res = requests.get(url, auth=(args.username, args.token)) response_json = res.json() @@ -92,7 +92,6 @@ def add_github_dep(name, parsed_url): with open( os.path.join(REPO_DIR, "tools", "ci_build", "github", "linux", "docker", "Dockerfile.manylinux2014_cuda11"), - mode="r", ) as f: for line in f: if not line.strip(): @@ -157,9 +156,8 @@ def normalize_path_separators(path): ], check=True, cwd=REPO_DIR, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True, + capture_output=True, + text=True, ) diff --git a/cgmanifests/print_submodule_info.py b/cgmanifests/print_submodule_info.py index 362d168930451..5603745a60c80 100644 --- a/cgmanifests/print_submodule_info.py +++ b/cgmanifests/print_submodule_info.py @@ -15,15 +15,13 @@ check=True, cwd=path, stdout=subprocess.PIPE, - universal_newlines=True, + text=True, ) url = proc.stdout.strip() -proc = subprocess.run( - ["git", "rev-parse", "HEAD"], check=True, cwd=path, stdout=subprocess.PIPE, universal_newlines=True -) +proc = subprocess.run(["git", "rev-parse", "HEAD"], check=True, cwd=path, stdout=subprocess.PIPE, text=True) commit = proc.stdout.strip() -print("{} {} {}".format(path, url, commit)) +print(f"{path} {url} {commit}") diff --git a/docs/Coding_Conventions_and_Standards.md b/docs/Coding_Conventions_and_Standards.md index f8bc60ba152a2..47555ba9f0dd4 100644 --- a/docs/Coding_Conventions_and_Standards.md +++ b/docs/Coding_Conventions_and_Standards.md @@ -112,15 +112,15 @@ void foo(gsl::span names) { * The following C++ warnings should never be disabled in onnxruntime VC++ projects(Required by [Binskim](https://github.com/microsoft/binskim/blob/d9afb65c89a621411efded74c27999281d87867e/src/BinSkim.Rules/PERules/BA2007.EnableCriticalCompilerWarnings.cs)). 1. [4018](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4018) 'token' : signed/unsigned mismatch 2. [4146](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4146?view=msvc-160) unary minus operator applied to unsigned type, result still unsigned - 3. [4244](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4244?view=msvc-160) 'argument' : conversion from 'type1' to 'type2', possible loss of data. For example, casting a int64_t to size_t. + 3. [4244](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4244?view=msvc-160) 'argument' : conversion from 'type1' to 'type2', possible loss of data. For example, casting a int64_t to size_t. 4. [4267](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4267?view=msvc-160) 'var' : conversion from 'size_t' to 'type', possible loss of data. 5. [4302](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4302?view=msvc-160) 'conversion' : truncation from 'type 1' to 'type 2' - 6. [4308](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4308?view=msvc-160) negative integral constant converted to unsigned type + 6. [4308](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-2-c4308?view=msvc-160) negative integral constant converted to unsigned type 7. 
[4532](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4532?view=msvc-160) 'continue' : jump out of \_\_finally/finally block has undefined behavior during termination handling 8. [4533](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4533?view=msvc-160) initialization of 'variable' is skipped by 'instruction' 9. [4700](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-and-level-4-c4700?view=msvc-160) uninitialized local variable 'name' used 10. [4789](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-1-c4789?view=msvc-160) buffer 'identifier' of size N bytes will be overrun; M bytes will be written starting at offset L - 11. [4995](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4995?view=msvc-160) 'function': name was marked as #pragma deprecated + 11. [4995](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4995?view=msvc-160) 'function': name was marked as #pragma deprecated 12. [4996](https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4996?view=msvc-160) Your code uses a function, class member, variable, or typedef that's marked deprecated #### Clang-format @@ -150,21 +150,54 @@ There is a configuration file in `onnxruntime/VSCodeCoverage.runsettings` that c Using `Show Code Coverage Coloring` will allow you to visually inspect which lines were hit by the tests. See . +## Linting + +This project uses [lintrunner](https://github.com/suo/lintrunner) for linting. It provides a consistent linting experience locally and in CI. You can install the dependencies and initialize with + +```sh +pip install lintrunner lintrunner-adapters +lintrunner init +``` + +This will install lintrunner on your system and download all the necessary +dependencies to run linters locally. +If you want to see what lintrunner init will install, run +`lintrunner init --dry-run`. + +To lint local changes: + +```bash +lintrunner -m main +``` + +To lint all files: + +```bash +lintrunner --all-files +``` + +To format files: + +```bash +lintrunner -a --all-files +``` + +To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). +To update an existing linting rule or create a new one, modify `.lintrunner.toml` or create a +new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. + ## Python Code Style Follow the [Black formatter](https://black.readthedocs.io)'s coding style when possible. A maximum line length of 120 characters is allowed for consistency with the C++ code. Please adhere to the [PEP8 Style Guide](https://www.python.org/dev/peps/pep-0008/). We use [Google's python style guide](https://google.github.io/styleguide/pyguide.html) as the style guide which is an extension to PEP8. -Code can be validated with [flake8](https://pypi.org/project/flake8/) using the configuration file in the root directory called [.flake8](https://github.com/microsoft/onnxruntime/blob/main/.flake8). - Use `pyright`, which is provided as a component of the `pylance` extension in VS Code for static type checking. -Auto-formatting is done with `black` and `isort`. The tools are configured in `pyproject.toml`. From anywhere in the repository, you can run +Auto-formatting is done with `black` and `isort`. 
The tools are configured in `pyproject.toml`. From the root of the repository, you can run ```sh -black . -isort . +lintrunner f --all-files ``` to format Python files. diff --git a/docs/python/inference/conf.py b/docs/python/inference/conf.py index ad56d6a465959..5febf5ef5a4b9 100644 --- a/docs/python/inference/conf.py +++ b/docs/python/inference/conf.py @@ -6,8 +6,8 @@ # Configuration file for the Sphinx documentation builder. import os -import shutil -import sys +import shutil # noqa: F401 +import sys # noqa: F401 import onnxruntime @@ -95,7 +95,7 @@ def setup(app): urllib.request.urlretrieve(url, dest) loc = os.path.split(dest)[-1] if not os.path.exists(loc): - import shutil + import shutil # noqa: F811 shutil.copy(dest, loc) return app diff --git a/docs/python/inference/examples/plot_backend.py b/docs/python/inference/examples/plot_backend.py index ecfc17175d11b..58fb4cd84f82f 100644 --- a/docs/python/inference/examples/plot_backend.py +++ b/docs/python/inference/examples/plot_backend.py @@ -34,8 +34,8 @@ x = np.array([[-1.0, -2.0]], dtype=np.float32) try: label, proba = rep.run(x) - print("label={}".format(label)) - print("probabilities={}".format(proba)) + print(f"label={label}") + print(f"probabilities={proba}") except (RuntimeError, InvalidArgument) as e: print(e) @@ -47,8 +47,8 @@ x = np.array([[-1.0, -2.0]], dtype=np.float32) try: label, proba = rep.run(x) - print("label={}".format(label)) - print("probabilities={}".format(proba)) + print(f"label={label}") + print(f"probabilities={proba}") except (RuntimeError, InvalidArgument) as e: print(e) diff --git a/docs/python/inference/examples/plot_common_errors.py b/docs/python/inference/examples/plot_common_errors.py index e4b2b1e05a3ec..dc7078831a257 100644 --- a/docs/python/inference/examples/plot_common_errors.py +++ b/docs/python/inference/examples/plot_common_errors.py @@ -8,7 +8,7 @@ ============================== This example looks into several common situations -in which *onnxruntime* does not return the model +in which *onnxruntime* does not return the model prediction but raises an exception instead. 
It starts by loading the model trained in example :ref:`l-logreg-example` which produced a logistic regression @@ -37,7 +37,7 @@ sess.run([output_name], {input_name: x}) except Exception as e: print("Unexpected type") - print("{0}: {1}".format(type(e), e)) + print(f"{type(e)}: {e}") ######################### # The model fails to return an output if the name @@ -48,7 +48,7 @@ sess.run(["misspelled"], {input_name: x}) except Exception as e: print("Misspelled output name") - print("{0}: {1}".format(type(e), e)) + print(f"{type(e)}: {e}") ########################### # The output name is optional, it can be replaced by *None* @@ -70,7 +70,7 @@ sess.run([output_name], {"misspelled": x}) except Exception as e: print("Misspelled input name") - print("{0}: {1}".format(type(e), e)) + print(f"{type(e)}: {e}") ######################### # *onnxruntime* does not necessarily fail if the input @@ -85,9 +85,9 @@ ]: try: r = sess.run([output_name], {input_name: x}) - print("Shape={0} and predicted labels={1}".format(x.shape, r)) + print(f"Shape={x.shape} and predicted labels={r}") except (RuntimeError, InvalidArgument) as e: - print("ERROR with Shape={0} - {1}".format(x.shape, e)) + print(f"ERROR with Shape={x.shape} - {e}") for x in [ numpy.array([1.0, 2.0, 3.0, 4.0], dtype=numpy.float32), @@ -98,9 +98,9 @@ ]: try: r = sess.run(None, {input_name: x}) - print("Shape={0} and predicted probabilities={1}".format(x.shape, r[1])) + print(f"Shape={x.shape} and predicted probabilities={r[1]}") except (RuntimeError, InvalidArgument) as e: - print("ERROR with Shape={0} - {1}".format(x.shape, e)) + print(f"ERROR with Shape={x.shape} - {e}") ######################### # It does not fail either if the number of dimension @@ -113,6 +113,6 @@ ]: try: r = sess.run([output_name], {input_name: x}) - print("Shape={0} and predicted labels={1}".format(x.shape, r)) + print(f"Shape={x.shape} and predicted labels={r}") except (RuntimeError, InvalidArgument) as e: - print("ERROR with Shape={0} - {1}".format(x.shape, e)) + print(f"ERROR with Shape={x.shape} - {e}") diff --git a/docs/python/inference/examples/plot_load_and_predict.py b/docs/python/inference/examples/plot_load_and_predict.py index 09d7c9cdb4c88..0b58cdcf8230c 100644 --- a/docs/python/inference/examples/plot_load_and_predict.py +++ b/docs/python/inference/examples/plot_load_and_predict.py @@ -47,7 +47,7 @@ ######################### # Let's compute its outputs (or predictions if it is a machine learned model). -import numpy.random +import numpy.random # noqa: E402 x = numpy.random.random((3, 4, 5)) x = x.astype(numpy.float32) diff --git a/docs/python/inference/examples/plot_metadata.py b/docs/python/inference/examples/plot_metadata.py index c76f2e8d9fa7f..e3ecac0caf0af 100644 --- a/docs/python/inference/examples/plot_metadata.py +++ b/docs/python/inference/examples/plot_metadata.py @@ -9,7 +9,7 @@ model was produced. It is useful when the model is deployed to production to keep track of which instance was used at a specific time. -Let's see how to do that with a simple +Let's see how to do that with a simple logistic regression model trained with *scikit-learn* and converted with *sklearn-onnx*. 
""" @@ -18,29 +18,29 @@ example = get_example("logreg_iris.onnx") -import onnx +import onnx # noqa: E402 model = onnx.load(example) -print("doc_string={}".format(model.doc_string)) -print("domain={}".format(model.domain)) -print("ir_version={}".format(model.ir_version)) -print("metadata_props={}".format(model.metadata_props)) -print("model_version={}".format(model.model_version)) -print("producer_name={}".format(model.producer_name)) -print("producer_version={}".format(model.producer_version)) +print(f"doc_string={model.doc_string}") +print(f"domain={model.domain}") +print(f"ir_version={model.ir_version}") +print(f"metadata_props={model.metadata_props}") +print(f"model_version={model.model_version}") +print(f"producer_name={model.producer_name}") +print(f"producer_version={model.producer_version}") ############################# # With *ONNX Runtime*: -import onnxruntime as rt +import onnxruntime as rt # noqa: E402 sess = rt.InferenceSession(example, providers=rt.get_available_providers()) meta = sess.get_modelmeta() -print("custom_metadata_map={}".format(meta.custom_metadata_map)) -print("description={}".format(meta.description)) -print("domain={}".format(meta.domain, meta.domain)) -print("graph_name={}".format(meta.graph_name)) -print("producer_name={}".format(meta.producer_name)) -print("version={}".format(meta.version)) +print(f"custom_metadata_map={meta.custom_metadata_map}") +print(f"description={meta.description}") +print(f"domain={meta.domain}") +print(f"graph_name={meta.graph_name}") +print(f"producer_name={meta.producer_name}") +print(f"version={meta.version}") diff --git a/docs/python/inference/examples/plot_pipeline.py b/docs/python/inference/examples/plot_pipeline.py index 05dcbdb25b7a6..7e632f0d6a866 100644 --- a/docs/python/inference/examples/plot_pipeline.py +++ b/docs/python/inference/examples/plot_pipeline.py @@ -24,7 +24,7 @@ example1 = get_example("mul_1.onnx") -import onnx +import onnx # noqa: E402 model = onnx.load(example1) # model is a ModelProto protobuf message @@ -40,7 +40,7 @@ # in a different way than before. -from onnx import ModelProto +from onnx import ModelProto # noqa: E402 model = ModelProto() with open(example1, "rb") as fid: @@ -49,7 +49,7 @@ ################################### # We convert it into a graph. -from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph +from onnx.tools.net_drawer import GetOpNodeProducer, GetPydotGraph # noqa: E402 pydot_graph = GetPydotGraph( model.graph, name=model.graph.name, rankdir="LR", node_producer=GetOpNodeProducer("docstring") @@ -58,13 +58,13 @@ ####################################### # Then into an image -import os +import os # noqa: E402 os.system("dot -O -Tpng graph.dot") ################################ # Which we display... -import matplotlib.pyplot as plt +import matplotlib.pyplot as plt # noqa: E402 image = plt.imread("graph.dot.png") plt.imshow(image) diff --git a/docs/python/inference/examples/plot_profiling.py b/docs/python/inference/examples/plot_profiling.py index 3236f954cc052..d35ef725562cf 100644 --- a/docs/python/inference/examples/plot_profiling.py +++ b/docs/python/inference/examples/plot_profiling.py @@ -59,10 +59,10 @@ def change_ir_version(filename, ir_version=6): ########################### # The results are stored un a file in JSON format. # Let's see what it contains. 
-import json +import json # noqa: E402 -with open(prof_file, "r") as f: +with open(prof_file) as f: sess_time = json.load(f) -import pprint +import pprint # noqa: E402 pprint.pprint(sess_time) diff --git a/docs/python/inference/examples/plot_train_convert_predict.py b/docs/python/inference/examples/plot_train_convert_predict.py index b5033b503b3eb..bc6ca0c18df02 100644 --- a/docs/python/inference/examples/plot_train_convert_predict.py +++ b/docs/python/inference/examples/plot_train_convert_predict.py @@ -26,14 +26,14 @@ iris = load_iris() X, y = iris.data, iris.target -from sklearn.model_selection import train_test_split +from sklearn.model_selection import train_test_split # noqa: E402 X_train, X_test, y_train, y_test = train_test_split(X, y) #################################### # Then we fit a model. -from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import LogisticRegression # noqa: E402 clr = LogisticRegression() clr.fit(X_train, y_train) @@ -41,7 +41,7 @@ #################################### # We compute the prediction on the test set # and we show the confusion matrix. -from sklearn.metrics import confusion_matrix +from sklearn.metrics import confusion_matrix # noqa: E402 pred = clr.predict(X_test) print(confusion_matrix(y_test, pred)) @@ -54,8 +54,8 @@ # `sklearn-onnx `_ # to convert the model into ONNX format. -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import FloatTensorType +from skl2onnx import convert_sklearn # noqa: E402 +from skl2onnx.common.data_types import FloatTensorType # noqa: E402 initial_type = [("float_input", FloatTensorType([None, 4]))] onx = convert_sklearn(clr, initial_types=initial_type) @@ -66,12 +66,12 @@ # We load the model with ONNX Runtime and look at # its input and output. -import onnxruntime as rt +import onnxruntime as rt # noqa: E402 sess = rt.InferenceSession("logreg_iris.onnx", providers=rt.get_available_providers()) -print("input name='{}' and shape={}".format(sess.get_inputs()[0].name, sess.get_inputs()[0].shape)) -print("output name='{}' and shape={}".format(sess.get_outputs()[0].name, sess.get_outputs()[0].shape)) +print(f"input name='{sess.get_inputs()[0].name}' and shape={sess.get_inputs()[0].shape}") +print(f"output name='{sess.get_outputs()[0].name}' and shape={sess.get_outputs()[0].shape}") ################################## # We compute the predictions. @@ -79,7 +79,7 @@ input_name = sess.get_inputs()[0].name label_name = sess.get_outputs()[0].name -import numpy +import numpy # noqa: E402 pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0] print(confusion_matrix(pred, pred_onx)) @@ -105,13 +105,13 @@ prob_name = sess.get_outputs()[1].name prob_rt = sess.run([prob_name], {input_name: X_test.astype(numpy.float32)})[0] -import pprint +import pprint # noqa: E402 pprint.pprint(prob_rt[0:3]) ############################### # Let's benchmark. -from timeit import Timer +from timeit import Timer # noqa: E402 def speed(inst, number=10, repeat=20): @@ -119,7 +119,7 @@ def speed(inst, number=10, repeat=20): raw = numpy.array(timer.repeat(repeat, number=number)) ave = raw.sum() / len(raw) / number mi, ma = raw.min() / number, raw.max() / number - print("Average %1.3g min=%1.3g max=%1.3g" % (ave, mi, ma)) + print(f"Average {ave:1.3g} min={mi:1.3g} max={ma:1.3g}") return ave @@ -180,7 +180,7 @@ def sess_predict_proba(x): # +++++++++++++++++++++++++++ # # We first train and save a model in ONNX format. 
-from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier # noqa: E402 rf = RandomForestClassifier() rf.fit(X_train, y_train) @@ -222,13 +222,13 @@ def sess_predict_proba_rf(x): sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees, providers=rt.get_available_providers()) def sess_predict_proba_loop(x): - return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0] + return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0] # noqa: B023 tsk = speed("loop(X_test, rf.predict_proba, 100)", number=5, repeat=5) trt = speed("loop(X_test, sess_predict_proba_loop, 100)", number=5, repeat=5) measures.append({"n_trees": n_trees, "sklearn": tsk, "rt": trt}) -from pandas import DataFrame +from pandas import DataFrame # noqa: E402 df = DataFrame(measures) ax = df.plot(x="n_trees", y="sklearn", label="scikit-learn", c="blue", logy=True) diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py index 5e8b04cdc8417..d052d9644c890 100644 --- a/onnxruntime/__init__.py +++ b/onnxruntime/__init__.py @@ -20,31 +20,29 @@ # meaningful messages to the user. # the saved exception is raised after device version validation. try: - from onnxruntime.capi._pybind_state import ( - ExecutionMode, - ExecutionOrder, - GraphOptimizationLevel, - ModelMetadata, - NodeArg, - OrtAllocatorType, - OrtArenaCfg, - OrtMemoryInfo, - OrtMemType, - OrtSparseFormat, - RunOptions, - SessionIOBinding, - SessionOptions, - create_and_register_allocator, - disable_telemetry_events, - enable_telemetry_events, - get_all_providers, - get_available_providers, - get_device, - get_version_string, - set_default_logger_severity, - set_default_logger_verbosity, - set_seed, - ) + from onnxruntime.capi._pybind_state import ExecutionMode # noqa: F401 + from onnxruntime.capi._pybind_state import ExecutionOrder # noqa: F401 + from onnxruntime.capi._pybind_state import GraphOptimizationLevel # noqa: F401 + from onnxruntime.capi._pybind_state import ModelMetadata # noqa: F401 + from onnxruntime.capi._pybind_state import NodeArg # noqa: F401 + from onnxruntime.capi._pybind_state import OrtAllocatorType # noqa: F401 + from onnxruntime.capi._pybind_state import OrtArenaCfg # noqa: F401 + from onnxruntime.capi._pybind_state import OrtMemoryInfo # noqa: F401 + from onnxruntime.capi._pybind_state import OrtMemType # noqa: F401 + from onnxruntime.capi._pybind_state import OrtSparseFormat # noqa: F401 + from onnxruntime.capi._pybind_state import RunOptions # noqa: F401 + from onnxruntime.capi._pybind_state import SessionIOBinding # noqa: F401 + from onnxruntime.capi._pybind_state import SessionOptions # noqa: F401 + from onnxruntime.capi._pybind_state import create_and_register_allocator # noqa: F401 + from onnxruntime.capi._pybind_state import disable_telemetry_events # noqa: F401 + from onnxruntime.capi._pybind_state import enable_telemetry_events # noqa: F401 + from onnxruntime.capi._pybind_state import get_all_providers # noqa: F401 + from onnxruntime.capi._pybind_state import get_available_providers # noqa: F401 + from onnxruntime.capi._pybind_state import get_device # noqa: F401 + from onnxruntime.capi._pybind_state import get_version_string # noqa: F401 + from onnxruntime.capi._pybind_state import set_default_logger_severity # noqa: F401 + from onnxruntime.capi._pybind_state import set_default_logger_verbosity # noqa: F401 + from onnxruntime.capi._pybind_state import set_seed # noqa: F401 import_capi_exception = None except Exception as e: @@ -55,22 +53,20 @@ if 
import_capi_exception: raise import_capi_exception -from onnxruntime.capi.onnxruntime_inference_collection import ( - InferenceSession, - IOBinding, - OrtDevice, - OrtValue, - SparseTensor, -) +from onnxruntime.capi.onnxruntime_inference_collection import InferenceSession # noqa: F401 +from onnxruntime.capi.onnxruntime_inference_collection import IOBinding # noqa: F401 +from onnxruntime.capi.onnxruntime_inference_collection import OrtDevice # noqa: F401 +from onnxruntime.capi.onnxruntime_inference_collection import OrtValue # noqa: F401 +from onnxruntime.capi.onnxruntime_inference_collection import SparseTensor # noqa: F401 from onnxruntime.capi.training import * # noqa: F403 # TODO: thiagofc: Temporary experimental namespace for new PyTorch front-end -try: - from . import experimental +try: # noqa: SIM105 + from . import experimental # noqa: F401 except ImportError: pass -from onnxruntime.capi.onnxruntime_validation import cuda_version, package_name, version +from onnxruntime.capi.onnxruntime_validation import cuda_version, package_name, version # noqa: F401 if version: __version__ = version diff --git a/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/__init__.py b/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/__init__.py index 58ee00d30bbd5..f81c64382237c 100644 --- a/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/__init__.py +++ b/onnxruntime/core/flatbuffers/ort_flatbuffers_py/fbs/__init__.py @@ -1,6 +1,7 @@ from os.path import dirname, basename, isfile, join, splitext import glob + modules = glob.glob(join(dirname(__file__), "*.py")) -__all__ = [splitext(basename(f))[0] for f in modules if isfile(f) and not f.endswith('__init__.py')] +__all__ = [splitext(basename(f))[0] for f in modules if isfile(f) and not f.endswith("__init__.py")] -from . import * # noqa +from . import * diff --git a/onnxruntime/core/flatbuffers/schema/compile_schema.py b/onnxruntime/core/flatbuffers/schema/compile_schema.py index 55d332682b937..e0c6c399e55c9 100644 --- a/onnxruntime/core/flatbuffers/schema/compile_schema.py +++ b/onnxruntime/core/flatbuffers/schema/compile_schema.py @@ -14,79 +14,94 @@ def update_namespace(schema_path: pathlib.Path, updated_schema_path: pathlib.Path): # create a copy of the schema so we can replace the namespace so that the generated module name doesn't clash # with the 'onnxruntime' package. - with open(schema_path, 'r') as input, open(updated_schema_path, 'w') as output: + with open(schema_path) as input, open(updated_schema_path, "w") as output: for line in input: # convert any line with the namespace to use ort_flatbuffers_py instead of onnxruntime as the top level # namespace. 
this doesn't change how anything works - it just avoids a naming clash with the 'real' # onnxruntime python package - output.write(line.replace('onnxruntime.fbs', 'ort_flatbuffers_py.fbs')) + output.write(line.replace("onnxruntime.fbs", "ort_flatbuffers_py.fbs")) def generate_python(flatc: pathlib.Path, schema_path: pathlib.Path, output_dir: pathlib.Path): # run flatc to generate Python code - cmd = [str(flatc), '--python', str(schema_path)] + cmd = [str(flatc), "--python", str(schema_path)] subprocess.run(cmd, check=True, cwd=output_dir) def create_init_py(output_dir: pathlib.Path): # create an __init__.py that imports all the py files so we can just 'import ort_flatbuffers_py.fbs' # in a script that wants to process an ORT format model - init_py_path = output_dir / 'ort_flatbuffers_py/fbs/__init__.py' - with open(init_py_path, 'w') as init_py: - init_py.write('''from os.path import dirname, basename, isfile, join, splitext + init_py_path = output_dir / "ort_flatbuffers_py/fbs/__init__.py" + with open(init_py_path, "w") as init_py: + init_py.write( + """from os.path import dirname, basename, isfile, join, splitext import glob modules = glob.glob(join(dirname(__file__), "*.py")) __all__ = [splitext(basename(f))[0] for f in modules if isfile(f) and not f.endswith('__init__.py')] -from . import * # noqa -''') +from . import * +""" + ) def generate_cpp(flatc: pathlib.Path, schema_path: pathlib.Path): # run flatc to generate C++ code - cmd = [str(flatc), '--cpp', '--scoped-enums', '--filename-suffix', '.fbs', str(schema_path)] + cmd = [str(flatc), "--cpp", "--scoped-enums", "--filename-suffix", ".fbs", str(schema_path)] subprocess.run(cmd, check=True, cwd=SCRIPT_DIR) def main(): - parser = argparse.ArgumentParser(description='Generate language bindings for the ORT flatbuffers schema.', - usage='Provide the path to the flatbuffers flatc executable. ' - 'Script can be executed from anywhere but must be located in its original ' - 'directory in the ONNX Runtime enlistment.') - - parser.add_argument('-f', '--flatc', required=True, type=pathlib.Path, - help='Path to flatbuffers flatc executable. ' - 'Can be found in the build directory under external/flatbuffers//') - - all_languages = ['python', 'cpp'] - parser.add_argument('-l', '--language', action='append', dest='languages', choices=all_languages, - help='Specify which language bindings to generate.') + parser = argparse.ArgumentParser( + description="Generate language bindings for the ORT flatbuffers schema.", + usage="Provide the path to the flatbuffers flatc executable. " + "Script can be executed from anywhere but must be located in its original " + "directory in the ONNX Runtime enlistment.", + ) + + parser.add_argument( + "-f", + "--flatc", + required=True, + type=pathlib.Path, + help="Path to flatbuffers flatc executable. 
" + "Can be found in the build directory under external/flatbuffers//", + ) + + all_languages = ["python", "cpp"] + parser.add_argument( + "-l", + "--language", + action="append", + dest="languages", + choices=all_languages, + help="Specify which language bindings to generate.", + ) args = parser.parse_args() languages = args.languages if args.languages is not None else all_languages flatc = args.flatc.resolve(strict=True) - schema_path = SCRIPT_DIR / 'ort.fbs' + schema_path = SCRIPT_DIR / "ort.fbs" - if 'python' in languages: + if "python" in languages: with tempfile.TemporaryDirectory() as temp_dir_name: temp_dir = pathlib.Path(temp_dir_name).resolve() - updated_schema_path = temp_dir / 'ort.py.fbs' + updated_schema_path = temp_dir / "ort.py.fbs" update_namespace(schema_path, updated_schema_path) - output_dir = temp_dir / 'out' + output_dir = temp_dir / "out" output_dir.mkdir() generate_python(flatc, updated_schema_path, output_dir) create_init_py(output_dir) # replace generated files in repo - target_dir = SCRIPT_DIR.parent / 'ort_flatbuffers_py' + target_dir = SCRIPT_DIR.parent / "ort_flatbuffers_py" if target_dir.is_dir(): shutil.rmtree(target_dir) - shutil.move(str(output_dir / 'ort_flatbuffers_py'), str(target_dir)) + shutil.move(str(output_dir / "ort_flatbuffers_py"), str(target_dir)) - if 'cpp' in languages: + if "cpp" in languages: generate_cpp(flatc, schema_path) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/onnxruntime/python/backend/__init__.py b/onnxruntime/python/backend/__init__.py index 14a5d6de98bb2..d2da1af6c2ad8 100644 --- a/onnxruntime/python/backend/__init__.py +++ b/onnxruntime/python/backend/__init__.py @@ -3,4 +3,4 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from .backend import is_compatible, prepare, run, supports_device +from .backend import is_compatible, prepare, run, supports_device # noqa: F401 diff --git a/onnxruntime/python/backend/backend.py b/onnxruntime/python/backend/backend.py index 99592e3d5cf3e..1edae383e93e6 100644 --- a/onnxruntime/python/backend/backend.py +++ b/onnxruntime/python/backend/backend.py @@ -9,7 +9,7 @@ import unittest import packaging.version -from onnx import ModelProto, helper, version +from onnx import ModelProto, helper, version # noqa: F401 from onnx.backend.base import Backend from onnx.checker import check_model @@ -27,9 +27,9 @@ class OnnxRuntimeBackend(Backend): `Importing models from ONNX to Caffe2 `_ shows how to use *caffe2* as a backend for a converted model. Note: This is not the official Python API. - """ # noqa: E501 + """ - allowReleasedOpsetsOnly = bool(os.getenv("ALLOW_RELEASED_ONNX_OPSET_ONLY", "1") == "1") + allowReleasedOpsetsOnly = bool(os.getenv("ALLOW_RELEASED_ONNX_OPSET_ONLY", "1") == "1") # noqa: N815 @classmethod def is_compatible(cls, model, device=None, **kwargs): @@ -59,11 +59,11 @@ def is_opset_supported(cls, model): domain = opset.domain if opset.domain else "ai.onnx" try: key = (domain, opset.version) - if not (key in helper.OP_SET_ID_VERSION_MAP): + if key not in helper.OP_SET_ID_VERSION_MAP: error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." 
- " Got Domain '{0}' version '{1}'.".format(domain, opset.version) + " Got Domain '{}' version '{}'.".format(domain, opset.version) ) return False, error_message except AttributeError: @@ -74,7 +74,7 @@ def is_opset_supported(cls, model): error_message = ( "Skipping this test as only released onnx opsets are supported." "To run this test set env variable ALLOW_RELEASED_ONNX_OPSET_ONLY to 0." - " Got Domain '{0}' version '{1}'.".format(domain, opset.version) + " Got Domain '{}' version '{}'.".format(domain, opset.version) ) return False, error_message return True, "" @@ -121,7 +121,7 @@ def prepare(cls, model, device=None, **kwargs): # which may hide test failures. inf.disable_fallback() if device is not None and not cls.supports_device(device): - raise RuntimeError("Incompatible device expected '{0}', got '{1}'".format(device, get_device())) + raise RuntimeError(f"Incompatible device expected '{device}', got '{get_device()}'") return cls.prepare(inf, device, **kwargs) else: # type: ModelProto diff --git a/onnxruntime/python/backend/backend_rep.py b/onnxruntime/python/backend/backend_rep.py index 6dced3aba7f80..c4dddaaba1378 100644 --- a/onnxruntime/python/backend/backend_rep.py +++ b/onnxruntime/python/backend/backend_rep.py @@ -5,7 +5,7 @@ """ Implements ONNX's backend API. """ -from typing import Any, Tuple +from typing import Any, Tuple # noqa: F401 from onnx.backend.base import BackendRep @@ -48,6 +48,6 @@ def run(self, inputs, **kwargs): # type: (Any, **Any) -> Tuple[Any, ...] else: inp = self._session.get_inputs() if len(inp) != 1: - raise RuntimeError("Model expect {0} inputs".format(len(inp))) + raise RuntimeError(f"Model expect {len(inp)} inputs") inps = {inp[0].name: inputs} return self._session.run(None, inps, options) diff --git a/onnxruntime/python/datasets/__init__.py b/onnxruntime/python/datasets/__init__.py index e1ab4a0e44077..ba64aa8a6e159 100644 --- a/onnxruntime/python/datasets/__init__.py +++ b/onnxruntime/python/datasets/__init__.py @@ -13,5 +13,5 @@ def get_example(name): this = os.path.abspath(os.path.dirname(__file__)) full = os.path.join(this, name) if not os.path.exists(full): - raise FileNotFoundError("Unable to find example '{0}'".format(name)) + raise FileNotFoundError(f"Unable to find example '{name}'") return full diff --git a/onnxruntime/python/onnxruntime_collect_build_info.py b/onnxruntime/python/onnxruntime_collect_build_info.py index 6cd67938dd0ba..07ac21a11eb04 100644 --- a/onnxruntime/python/onnxruntime_collect_build_info.py +++ b/onnxruntime/python/onnxruntime_collect_build_info.py @@ -35,7 +35,7 @@ def get_cudart_version(find_cudart_version=None): status = cudart.cudaRuntimeGetVersion(ctypes.byref(version)) if status != 0: return None - except: # noqa + except Exception: return None return version.value @@ -93,7 +93,7 @@ def get_cudnn_supported_cuda_version(find_cudnn_version=None): # cudnn_ver = cudnn.cudnnGetVersion() cuda_ver = cudnn.cudnnGetCudartVersion() return cuda_ver - except: # noqa + except Exception: return None # use set to avoid duplications diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index 0883c528c9f07..6913ab091f165 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -57,7 +57,7 @@ def set_provider_options(name, options): ) if name in provider_name_to_options: - warnings.warn("Duplicate provider '{}' encountered, ignoring.".format(name)) + warnings.warn(f"Duplicate provider 
'{name}' encountered, ignoring.") return normalized_options = {str(key): str(value) for key, value in options.items()} @@ -105,7 +105,6 @@ class Session: """ def __init__(self): - # self._sess is managed by the derived class and relies on bindings from C.InferenceSession self._sess = None self._enable_fallback = True @@ -193,15 +192,15 @@ def run(self, output_names, input_feed, run_options=None): num_inputs = len(input_feed) # the graph may have optional inputs used to override initializers. allow for that. if num_inputs < num_required_inputs: - raise ValueError("Model requires {} inputs. Input Feed contains {}".format(num_required_inputs, num_inputs)) + raise ValueError(f"Model requires {num_required_inputs} inputs. Input Feed contains {num_inputs}") if not output_names: output_names = [output.name for output in self._outputs_meta] try: return self._sess.run(output_names, input_feed, run_options) except C.EPFail as err: if self._enable_fallback: - print("EP Error: {} using {}".format(str(err), self._providers)) - print("Falling back to {} and retrying.".format(self._fallback_providers)) + print(f"EP Error: {str(err)} using {self._providers}") + print(f"Falling back to {self._fallback_providers} and retrying.") self.set_providers(self._fallback_providers) # Fallback only once. self.disable_fallback() @@ -239,15 +238,15 @@ def invoke(sess, output_names, input_dict_ort_values, run_options): num_inputs = len(input_dict_ort_values) # the graph may have optional inputs used to override initializers. allow for that. if num_inputs < num_required_inputs: - raise ValueError("Model requires {} inputs. Input Feed contains {}".format(num_required_inputs, num_inputs)) + raise ValueError(f"Model requires {num_required_inputs} inputs. Input Feed contains {num_inputs}") if not output_names: output_names = [output.name for output in self._outputs_meta] try: return invoke(self._sess, output_names, input_dict_ort_values, run_options) except C.EPFail as err: if self._enable_fallback: - print("EP Error: {} using {}".format(str(err), self._providers)) - print("Falling back to {} and retrying.".format(self._fallback_providers)) + print(f"EP Error: {str(err)} using {self._providers}") + print(f"Falling back to {self._fallback_providers} and retrying.") self.set_providers(self._fallback_providers) # Fallback only once. self.disable_fallback() @@ -352,7 +351,7 @@ def __init__(self, path_or_bytes, sess_options=None, providers=None, provider_op self._model_path = None self._model_bytes = path_or_bytes # TODO: This is bad as we're holding the memory indefinitely else: - raise TypeError("Unable to load from type '{0}'".format(type(path_or_bytes))) + raise TypeError(f"Unable to load from type '{type(path_or_bytes)}'") self._sess_options = sess_options self._sess_options_initial = sess_options @@ -366,8 +365,8 @@ def __init__(self, path_or_bytes, sess_options=None, providers=None, provider_op self._create_inference_session(providers, provider_options, disabled_optimizers) except ValueError: if self._enable_fallback: - print("EP Error using {}".format(providers)) - print("Falling back to {} and retrying.".format(self._fallback_providers)) + print(f"EP Error using {providers}") + print(f"Falling back to {self._fallback_providers} and retrying.") self._create_inference_session(self._fallback_providers, None) # Fallback only once. 
self.disable_fallback() @@ -392,7 +391,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi if providers == [] and len(available_providers) > 1: self.disable_fallback() raise ValueError( - "This ORT build has {} enabled. ".format(available_providers) + f"This ORT build has {available_providers} enabled. " + "Since ORT 1.9, you are required to explicitly set " + "the providers parameter when instantiating InferenceSession. For example, " "onnxruntime.InferenceSession(..., providers={}, ...)".format(available_providers) diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 8b313635527ac..16cbc8e8099e1 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -15,8 +15,8 @@ def check_distro_info(): __my_distro_ver__ = "" __my_system__ = platform.system().lower() - __OS_RELEASE_FILE__ = "/etc/os-release" - __LSB_RELEASE_FILE__ = "/etc/lsb-release" + __OS_RELEASE_FILE__ = "/etc/os-release" # noqa: N806 + __LSB_RELEASE_FILE__ = "/etc/lsb-release" # noqa: N806 if __my_system__ == "windows": __my_distro__ = __my_system__ @@ -67,7 +67,7 @@ def validate_build_package_info(): has_ortmodule = False try: - from onnxruntime.training.ortmodule import ORTModule # noqa + from onnxruntime.training.ortmodule import ORTModule # noqa: F401 has_ortmodule = True except ImportError: @@ -100,9 +100,9 @@ def validate_build_package_info(): from .build_and_package_info import __version__ as version from .build_and_package_info import package_name - try: + try: # noqa: SIM105 from .build_and_package_info import cuda_version - except: # noqa + except Exception: pass if cuda_version: @@ -110,7 +110,7 @@ def validate_build_package_info(): # when the build environment has none or multiple libraries installed try: from .build_and_package_info import cudart_version - except: # noqa + except Exception: warnings.warn("WARNING: failed to get cudart_version from onnxruntime build info.") cudart_version = None @@ -132,7 +132,7 @@ def print_build_package_info(): # TODO: rcom pass - except Exception as e: # noqa + except Exception as e: warnings.warn("WARNING: failed to collect onnxruntime version and build info") print(e) diff --git a/onnxruntime/python/providers/tvm/__init__.py b/onnxruntime/python/providers/tvm/__init__.py index ce4fdf626edb8..4bcbc0bfef586 100644 --- a/onnxruntime/python/providers/tvm/__init__.py +++ b/onnxruntime/python/providers/tvm/__init__.py @@ -7,4 +7,4 @@ JIT interface implementing packed functions that import and compile frontend models """ -from .ort import ANSOR_TYPE, AUTO_TVM_TYPE, onnx_compile +from .ort import ANSOR_TYPE, AUTO_TVM_TYPE, onnx_compile # noqa: F401 diff --git a/onnxruntime/python/providers/tvm/ort.py b/onnxruntime/python/providers/tvm/ort.py index d2b690ddc6c35..e3970ab5de488 100644 --- a/onnxruntime/python/providers/tvm/ort.py +++ b/onnxruntime/python/providers/tvm/ort.py @@ -48,9 +48,7 @@ def get_tvm_executor(irmod, executor, target, params): log.info("Build TVM graph executor") lib = relay.build(irmod, target=target, params=params) else: - log.error( - "Executor type {} is unsupported. ".format(executor) + 'Only "vm" and "graph" types are supported' - ) + log.error(f"Executor type {executor} is unsupported. 
" + 'Only "vm" and "graph" types are supported') return None return lib @@ -90,7 +88,7 @@ def get_tvm_executor(irmod, executor, target, params): "vision.roi_align": ["NHWC", "default"], } log.info("Use tuning file from ", ANSOR_TYPE, ": ", tuning_logfile) - with auto_scheduler.ApplyHistoryBest(tuning_logfile): + with auto_scheduler.ApplyHistoryBest(tuning_logfile): # noqa: SIM117 with tvm.transform.PassContext( opt_level=opt_level, config={ @@ -116,8 +114,8 @@ def get_tvm_executor(irmod, executor, target, params): lib = get_tvm_executor(irmod, executor, tvm_target, params) else: log.error( - "Tuning log type {} is unsupported. ".format(tuning_type) - + "Only {} and {} types are supported".format(ANSOR_TYPE, AUTO_TVM_TYPE) + f"Tuning log type {tuning_type} is unsupported. " + + f"Only {ANSOR_TYPE} and {AUTO_TVM_TYPE} types are supported" ) return None else: @@ -134,7 +132,7 @@ def get_tvm_executor(irmod, executor, target, params): m = graph_executor.GraphModule(lib["default"](ctx)) else: print( - "ERROR: Executor type {} is unsupported. ".format(executor), + f"ERROR: Executor type {executor} is unsupported. ", 'Only "vm" and "graph" types are supported', ) return None diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi b/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi index 79da3c11b9e4b..335e71738e081 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi +++ b/onnxruntime/python/tools/kernel_explorer/kernels/_kernel_explorer.pyi @@ -1,8 +1,8 @@ class DeviceArray: def __init__(self, ndarray) -> None: ... - def UpdateHostNumpyArray(self) -> None: ... + def UpdateHostNumpyArray(self) -> None: ... # noqa: N802 -class blas_op: +class blas_op: # noqa: N801 T: int N: int diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py index 596c83602285e..13882eec92a8b 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/batched_gemm_test.py @@ -100,7 +100,7 @@ def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k, batch): @pytest.mark.parametrize("transa, transb", all_transabs) @pytest.mark.parametrize("dtype", dtypes) def test_gemm_tunable_bert_cases(dtype, transa, transb, m, n, k, batch): - wrapper_name = "BatchedGemmTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb))) + wrapper_name = f"BatchedGemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}" _test_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k, batch) @@ -116,7 +116,7 @@ def test_rocblas_gemm_alpha_beta(dtype, transa, transb, alpha, beta): @pytest.mark.parametrize("transa, transb", all_transabs) @pytest.mark.parametrize("dtype", dtypes) def test_tunable_gemm_alpha_beta(dtype, transa, transb, alpha, beta): - wrapper_name = "BatchedGemmTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb))) + wrapper_name = f"BatchedGemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}" _test_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, 768, 768, 512, 4, alpha=alpha, beta=beta) @@ -165,7 +165,7 @@ def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int, duration_ms = -1 if my_gemm.SelectOp(impl): duration_ms = my_gemm.Profile() - FLOPs = batch * m * k * n * 2 + FLOPs = batch * m * k * n * 2 # noqa: N806 
ke.report(BatchedGemmMetric(impl, dtype, duration_ms, FLOPs, transa, transb, m, n, k, batch)) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py index 96348d3803462..697da9b9e8b22 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_fast_gelu_test.py @@ -81,7 +81,7 @@ def test_gemmfastgelu_unfused_bert_cases(dtype, size, transab): @pytest.mark.parametrize("size", get_gemm_basic_sizes(full=False) + get_gemm_bert_sizes(full=False)) @pytest.mark.parametrize("transab", all_transabs) def test_gemmfastgelu_tunable_bert_cases(dtype, size, transab): - wrapper_name = "GemmFastGeluTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix(transab)) + wrapper_name = f"GemmFastGeluTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix(transab)}" _test_gemmfastgelu(getattr(ke, wrapper_name), dtype, *size, *transab) @@ -90,7 +90,7 @@ def test_gemmfastgelu_tunable_bert_cases(dtype, size, transab): @pytest.mark.parametrize("size", get_gemm_basic_sizes(full=False) + get_gemm_bert_sizes(full=False)) @pytest.mark.parametrize("transab", all_transabs) def test_gemmfastgelu_ck_bert_cases(dtype, size, transab): - wrapper_name = "CKGemmFastGelu_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix(transab)) + wrapper_name = f"CKGemmFastGelu_{dtype_to_suffix(dtype)}_{transab_to_suffix(transab)}" _test_gemmfastgelu(getattr(ke, wrapper_name), dtype, *size, *transab) diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py index e7b59baed3cf8..edd68ed3f837e 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_softmax_gemm_permute_test.py @@ -85,9 +85,9 @@ def _test_gemm_softmax_gemm_permute( raise ValueError np.random.seed(42) - Q = multinormal_distribution(np.prod(q_shape[:-1]), q_shape[-1]).reshape(q_shape).astype(np.float64) - K = multinormal_distribution(np.prod(k_shape[:-1]), k_shape[-1]).reshape(k_shape).astype(np.float64) - V = multinormal_distribution(np.prod(v_shape[:-1]), v_shape[-1]).reshape(v_shape).astype(np.float64) + Q = multinormal_distribution(np.prod(q_shape[:-1]), q_shape[-1]).reshape(q_shape).astype(np.float64) # noqa: N806 + K = multinormal_distribution(np.prod(k_shape[:-1]), k_shape[-1]).reshape(k_shape).astype(np.float64) # noqa: N806 + V = multinormal_distribution(np.prod(v_shape[:-1]), v_shape[-1]).reshape(v_shape).astype(np.float64) # noqa: N806 if bias_shape is not None: attn_bias = np.random.uniform(-0.5, 0.5, size=bias_shape) if mask_shape is not None: @@ -110,13 +110,13 @@ def _test_gemm_softmax_gemm_permute( ref = np.swapaxes(attn, 2, 1) # permute 0213 out = np.empty(out_shape, dtype=dtype) - host_Q = Q.astype(dtype) - host_K = K.astype(dtype) - host_V = V.astype(dtype) + host_Q = Q.astype(dtype) # noqa: N806 + host_K = K.astype(dtype) # noqa: N806 + host_V = V.astype(dtype) # noqa: N806 host_attn_bias = attn_bias.astype(dtype) if attn_bias is not None else None - dev_Q = ke.DeviceArray(host_Q) - dev_K = ke.DeviceArray(host_K) - dev_V = ke.DeviceArray(host_V) + dev_Q = ke.DeviceArray(host_Q) # noqa: N806 + dev_K = ke.DeviceArray(host_K) # noqa: N806 + dev_V = ke.DeviceArray(host_V) # noqa: N806 dev_out = ke.DeviceArray(out) dev_attn_bias = 
ke.DeviceArray(host_attn_bias) if host_attn_bias is not None else None
     dev_attn_mask = ke.DeviceArray(attn_mask) if attn_mask is not None else None
@@ -264,22 +264,22 @@ def profile_gemm_softmax_gemm_permute_func(
         raise ValueError

     np.random.seed(42)
-    Q = multinormal_distribution(np.prod(q_shape[:-1]), q_shape[-1]).reshape(q_shape).astype(np.float64)
-    K = multinormal_distribution(np.prod(k_shape[:-1]), k_shape[-1]).reshape(k_shape).astype(np.float64)
-    V = multinormal_distribution(np.prod(v_shape[:-1]), v_shape[-1]).reshape(v_shape).astype(np.float64)
+    Q = multinormal_distribution(np.prod(q_shape[:-1]), q_shape[-1]).reshape(q_shape).astype(np.float64)  # noqa: N806
+    K = multinormal_distribution(np.prod(k_shape[:-1]), k_shape[-1]).reshape(k_shape).astype(np.float64)  # noqa: N806
+    V = multinormal_distribution(np.prod(v_shape[:-1]), v_shape[-1]).reshape(v_shape).astype(np.float64)  # noqa: N806

     if bias_shape is not None:
         attn_bias = np.random.uniform(-2, 2, size=bias_shape)
     if mask_shape is not None:
         attn_mask = (np.random.randint(0, 100, size=mask_shape) < 95).astype(np.int32)
     out = np.empty(out_shape, dtype=dtype)
-    host_Q = Q.astype(dtype)
-    host_K = K.astype(dtype)
-    host_V = V.astype(dtype)
+    host_Q = Q.astype(dtype)  # noqa: N806
+    host_K = K.astype(dtype)  # noqa: N806
+    host_V = V.astype(dtype)  # noqa: N806
     host_attn_bias = attn_bias.astype(dtype) if attn_bias is not None else None
-    dev_Q = ke.DeviceArray(host_Q)
-    dev_K = ke.DeviceArray(host_K)
-    dev_V = ke.DeviceArray(host_V)
+    dev_Q = ke.DeviceArray(host_Q)  # noqa: N806
+    dev_K = ke.DeviceArray(host_K)  # noqa: N806
+    dev_V = ke.DeviceArray(host_V)  # noqa: N806
     dev_out = ke.DeviceArray(out)
     dev_attn_bias = ke.DeviceArray(host_attn_bias) if host_attn_bias is not None else None
     dev_attn_mask = ke.DeviceArray(attn_mask) if attn_mask is not None else None
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py
index c2e0e38fc0ea5..40f17b7ce28aa 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/gemm_test.py
@@ -83,7 +83,7 @@ def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_ck_gemm_bert_cases(dtype, transa, transb, m, n, k):
-    wrapper_name = "CKGemm_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"CKGemm_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k)


@@ -92,7 +92,7 @@ def test_ck_gemm_bert_cases(dtype, transa, transb, m, n, k):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_gemm_tunable_bert_cases(dtype, transa, transb, m, n, k):
-    wrapper_name = "GemmTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"GemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k)


@@ -109,7 +109,7 @@ def test_rocblas_gemm_alpha_beta(dtype, transa, transb, alpha, beta):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_ck_gemm_alpha_beta(dtype, transa, transb, alpha, beta):
-    wrapper_name = "CKGemm_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"CKGemm_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_gemm(getattr(ke, wrapper_name), dtype, transa, transb, 256, 128, 384, alpha=alpha, beta=beta)


@@ -117,7 +117,7 @@ def test_ck_gemm_alpha_beta(dtype, transa, transb, alpha, beta):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_gemm_tunable_alpha_beta(dtype, transa, transb, alpha, beta):
-    wrapper_name = "GemmTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"GemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_gemm(getattr(ke, wrapper_name), dtype, transa, transb, 128, 512, 384, alpha=alpha, beta=beta)


@@ -165,7 +165,7 @@ def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int,
     duration_ms = -1
     if my_gemm.SelectOp(impl):
         duration_ms = my_gemm.Profile()
-    FLOPs = m * k * n * 2
+    FLOPs = m * k * n * 2  # noqa: N806

     ke.report(GemmMetric(impl, dtype, duration_ms, FLOPs, transa, transb, m, n, k))

diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py b/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py
index d147b335abb1e..e69179bf616bd 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/kernel_explorer.py
@@ -23,7 +23,7 @@
 sys.path.insert(0, build_dir)

 # pylint: disable=wrong-import-position
-import onnxruntime_pybind11_state  # noqa
+import onnxruntime_pybind11_state  # noqa: E402

 # We need to call some functions to properly initialize so pointers in the library
 available_providers = onnxruntime_pybind11_state.get_available_providers()
@@ -52,17 +52,17 @@
         library_to_load.append(path)
         continue

-    raise EnvironmentError(f"cannot found {lib}")
+    raise OSError(f"cannot found {lib}")

 # use RTLD_GLOBAL to bring all symbols to global name space
 libraries = [ctypes.CDLL(lib_path, mode=ctypes.RTLD_GLOBAL) for lib_path in library_to_load]

 # pylint: disable=wrong-import-position, disable=unused-import
-import _kernel_explorer  # noqa
+import _kernel_explorer  # noqa: E402, F401

 # pylint: disable=wrong-import-position, disable=unused-import, disable=wildcard-import
-from _kernel_explorer import *  # noqa
+from _kernel_explorer import *  # noqa: F403, E402


 # Benchmark Reporter
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py
index 006e563ed8995..60d441b30908e 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/skip_layer_norm_test.py
@@ -52,7 +52,7 @@ def run_skip_layer_norm(batch_size: int, seq_len: int, hidden_size: int, dtype:
     skip = np.random.rand(batch_size, seq_len, hidden_size).astype(dtype)
     bias = np.random.rand(hidden_size).astype(dtype)
     gamma = np.random.rand(hidden_size).astype(dtype)
-    beta = np.random.rand((hidden_size)).astype(dtype)
+    beta = np.random.rand(hidden_size).astype(dtype)
     # Because of rocm FMAs calculation issue with float16, epsilon should be larger when hidden_size is small
     epsilon = 0.05 if hidden_size < 8 else 0.0005
     output_y = np.random.rand(batch_size, seq_len, hidden_size).astype(dtype)
diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py b/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py
index bcd9f8db7a414..95b08e452063d 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/strided_batched_gemm_test.py
@@ -104,7 +104,7 @@ def test_rocblas_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_ck_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
-    wrapper_name = "CKStridedBatchedGemm_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"CKStridedBatchedGemm_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_strided_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k, batch)


@@ -114,7 +114,7 @@ def test_ck_gemm_all_cases(dtype, transa, transb, m, n, k, batch):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_gemm_tunable_bert_cases(dtype, transa, transb, m, n, k, batch):
-    wrapper_name = "StridedBatchedGemmTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"StridedBatchedGemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_strided_batched_gemm(getattr(ke, wrapper_name), dtype, transa, transb, m, n, k, batch)


@@ -133,7 +133,7 @@ def test_rocblas_gemm_alpha_beta(dtype, transa, transb, alpha, beta):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_ck_gemm_alpha_beta(dtype, transa, transb, alpha, beta):
-    wrapper_name = "CKStridedBatchedGemm_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"CKStridedBatchedGemm_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_strided_batched_gemm(
         getattr(ke, wrapper_name), dtype, transa, transb, 256, 128, 384, 8, alpha=alpha, beta=beta
     )
@@ -143,7 +143,7 @@ def test_ck_gemm_alpha_beta(dtype, transa, transb, alpha, beta):
 @pytest.mark.parametrize("transa, transb", all_transabs)
 @pytest.mark.parametrize("dtype", dtypes)
 def test_gemm_tunable_alpha_beta(dtype, transa, transb, alpha, beta):
-    wrapper_name = "StridedBatchedGemmTunable_{}_{}".format(dtype_to_suffix(dtype), transab_to_suffix((transa, transb)))
+    wrapper_name = f"StridedBatchedGemmTunable_{dtype_to_suffix(dtype)}_{transab_to_suffix((transa, transb))}"
     _test_strided_batched_gemm(
         getattr(ke, wrapper_name), dtype, transa, transb, 128, 512, 384, 4, alpha=alpha, beta=beta
     )
@@ -197,7 +197,7 @@ def profile_gemm_func(f, dtype: str, transa: bool, transb: bool, m: int, n: int,
     duration_ms = -1
     if my_gemm.SelectOp(impl):
         duration_ms = my_gemm.Profile()
-    FLOPs = batch * m * k * n * 2
+    FLOPs = batch * m * k * n * 2  # noqa: N806

     ke.report(StridedBatchedGemmMetric(impl, dtype, duration_ms, FLOPs, transa, transb, m, n, k, batch))

diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/utils.py b/onnxruntime/python/tools/kernel_explorer/kernels/utils.py
index b005dd98d53e7..4818e487ec7eb 100644
--- a/onnxruntime/python/tools/kernel_explorer/kernels/utils.py
+++ b/onnxruntime/python/tools/kernel_explorer/kernels/utils.py
@@ -48,7 +48,7 @@ def get_gemm_bound(
     machine_eps = 2.0 ** -(24 if dtype == "float32" else 11)

     # The following implements error bound 5.7 in paper I. C. Ipsen and H. Zhou, “Probabilistic error analysis for
-    # Inner Products,” SIAM Journal on Matrix Analysis and Applications, vol. 41, no. 4, pp. 1726–1741, 2020.
+    # Inner Products,” SIAM Journal on Matrix Analysis and Applications, vol. 41, no. 4, pp. 1726-1741, 2020.
     # NOTE: the bound is not tight for float16 when k is large
     if a_b_positive:
         coeff = 1.0
diff --git a/onnxruntime/python/tools/microbench/benchmark.py b/onnxruntime/python/tools/microbench/benchmark.py
index fcf8c6f23f362..a52740d45956c 100644
--- a/onnxruntime/python/tools/microbench/benchmark.py
+++ b/onnxruntime/python/tools/microbench/benchmark.py
@@ -76,7 +76,7 @@ def get_default_provider():

 class Benchmark:
     def __init__(self, model, inputs, outputs, args):
-        self.provider = get_default_provider() if args.provider == None else provider_name(args.provider)
+        self.provider = get_default_provider() if args.provider is None else provider_name(args.provider)
         logger.info(f"Execution provider: {self.provider}")
         self.profiling = args.profiling
         self.model = model
@@ -126,13 +126,13 @@ def benchmark(self):
         io_binding = self.create_io_binding(sess, input_tensors, output_tensors)

         # warm up
-        for iter in range(10):
+        for _iter in range(10):
             sess.run_with_iobinding(io_binding)

         # measure
         max_iters = 100
         start_time = time.time()
-        for iter in range(max_iters):
+        for _iter in range(max_iters):
             sess.run_with_iobinding(io_binding)

         # time is in milliseconds
diff --git a/onnxruntime/python/tools/offline_tuning.py b/onnxruntime/python/tools/offline_tuning.py
index 8dbae5efe8f9f..c032685b70f7c 100644
--- a/onnxruntime/python/tools/offline_tuning.py
+++ b/onnxruntime/python/tools/offline_tuning.py
@@ -124,23 +124,23 @@ def main():
         if tuning_results is None:
             sys.stderr.write(f"{args.input_onnx} does not have tuning results embedded!\n")
             sys.exit(-1)
-        json.dump(tuning_results, open(args.output_json, "w"))
+        json.dump(tuning_results, open(args.output_json, "w"))  # noqa: SIM115
     elif args.cmd == "embed":
         model = onnx.load_model(args.input_onnx)
         merger = Merger()
-        for tuning_results in [json.load(open(f)) for f in args.input_json]:
+        for tuning_results in [json.load(open(f)) for f in args.input_json]:  # noqa: SIM115
             merger.merge(tuning_results)
         model = embed(model, merger.get_merged(), args.force)
         onnx.save_model(model, args.output_onnx)
     elif args.cmd == "merge":
         merger = Merger()
-        for tuning_results in [json.load(open(f)) for f in args.input_json]:
+        for tuning_results in [json.load(open(f)) for f in args.input_json]:  # noqa: SIM115
             merger.merge(tuning_results)
-        json.dump(merger.get_merged(), open(args.output_json, "w"))
+        json.dump(merger.get_merged(), open(args.output_json, "w"))  # noqa: SIM115
     elif args.cmd == "pprint":
         tuning_results = None
-        try:
-            tuning_results = json.load(open(args.json_or_onnx, "r"))
+        try:  # noqa: SIM105
+            tuning_results = json.load(open(args.json_or_onnx))  # noqa: SIM115
         except Exception:
             # it might be an onnx file otherwise, try it latter
             pass
diff --git a/onnxruntime/python/tools/onnxruntime_test.py b/onnxruntime/python/tools/onnxruntime_test.py
index 11759f3ad17d5..b5f3f31c4db94 100644
--- a/onnxruntime/python/tools/onnxruntime_test.py
+++ b/onnxruntime/python/tools/onnxruntime_test.py
@@ -29,7 +29,7 @@
 }


-def generate_feeds(sess, symbolic_dims={}):
+def generate_feeds(sess, symbolic_dims={}):  # noqa: B006
     feeds = {}
     for input_meta in sess.get_inputs():
         # replace any symbolic dimensions
@@ -56,7 +56,7 @@ def generate_feeds(sess, symbolic_dims={}):
         elif input_meta.type == "tensor(bool)":
             feeds[input_meta.name] = np.random.randint(2, size=tuple(shape)).astype("bool")
         else:
-            print("unsupported input type {} for input {}".format(input_meta.type, input_meta.name))
+            print(f"unsupported input type {input_meta.type} for input {input_meta.name}")
sys.exit(-1) return feeds @@ -67,12 +67,12 @@ def run_model( num_iters=1, debug=None, profile=None, - symbolic_dims={}, + symbolic_dims={}, # noqa: B006 feeds=None, override_initializers=True, ): if debug: - print("Pausing execution ready for debugger to attach to pid: {}".format(os.getpid())) + print(f"Pausing execution ready for debugger to attach to pid: {os.getpid()}") print("Press key to continue.") sys.stdin.read(1) @@ -107,22 +107,22 @@ def run_model( elif initializer.type == "tensor(bool)": feeds[initializer.name] = np.random.randint(2, size=tuple(shape)).astype("bool") else: - print("unsupported initializer type {} for initializer {}".format(initializer.type, initializer.name)) + print(f"unsupported initializer type {initializer.type} for initializer {initializer.name}") sys.exit(-1) start = timer() - for i in range(num_iters): + for _i in range(num_iters): outputs = sess.run([], feeds) # fetch all outputs end = timer() - print("model: {}".format(meta.graph_name)) - print("version: {}".format(meta.version)) - print("iterations: {}".format(num_iters)) - print("avg latency: {} ms".format(((end - start) * 1000) / num_iters)) + print(f"model: {meta.graph_name}") + print(f"version: {meta.version}") + print(f"iterations: {num_iters}") + print(f"avg latency: {((end - start) * 1000) / num_iters} ms") if profile: trace_file = sess.end_profiling() - print("trace file written to: {}".format(trace_file)) + print(f"trace file written to: {trace_file}") return 0, feeds, num_iters > 0 and outputs diff --git a/onnxruntime/python/tools/profile_explorer/profile_explorer.py b/onnxruntime/python/tools/profile_explorer/profile_explorer.py index f3430a89e7a34..a76b01610df79 100644 --- a/onnxruntime/python/tools/profile_explorer/profile_explorer.py +++ b/onnxruntime/python/tools/profile_explorer/profile_explorer.py @@ -13,7 +13,7 @@ def _demangle(name, demangler="c++filt"): with sp.Popen([demangler, name], stdin=sp.PIPE, stdout=sp.PIPE) as proc: out, _ = proc.communicate() return out.decode("utf-8").strip() - except: + except Exception: return name @@ -68,7 +68,7 @@ def _json_to_df(profile_path, filter_matcher): cpu_entries = [] gpu_entries = [] - with open(profile_path, "r", encoding="utf-8") as file_obj: + with open(profile_path, encoding="utf-8") as file_obj: data = json.load(file_obj) if isinstance(data, dict): data = data["traceEvents"] @@ -156,7 +156,7 @@ def _print_cpu_top_hitters(frame, args): frame2 = frame[["duration", "count"]].sum() frame["pct"] = 100 * (frame["duration"] / frame2["duration"]) - fields = group_key + ["duration", "pct", "count"] + fields = [*group_key, "duration", "pct", "count"] frame1 = frame[fields].groupby(group_key).sum().reset_index() frame1 = frame1.sort_values(by="duration", ascending=False)[:top] frame1["cumulative_pct"] = frame1["pct"].cumsum() @@ -180,7 +180,7 @@ def _print_gpu_top_hitters(frame, args): frame2 = frame[["duration", "count"]].sum() frame["pct"] = 100 * (frame["duration"] / frame2["duration"]) - fields = group_key + ["duration", "pct", "count"] + fields = [*group_key, "duration", "pct", "count"] frame1 = frame[fields].groupby(group_key).sum().reset_index() frame1 = frame1.sort_values(by="duration", ascending=False)[:top] frame1["cumulative_pct"] = frame1["pct"].cumsum() @@ -207,10 +207,7 @@ def _construct_filter_matcher(args): def _match_item(item): if item in concrete_filter_set: return True - for pattern in fnmatch_filter_set: - if fnmatch.fnmatch(item, pattern): - return True - return False + return any(fnmatch.fnmatch(item, pattern) for pattern in 
fnmatch_filter_set) return _match_item diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index 9af57eda6ae90..aeb78f03dd721 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -11,7 +11,7 @@ # TODO(justinchuby): Create a function to alert users when torch is not installed import torch except ModuleNotFoundError: - raise ModuleNotFoundError( + raise ModuleNotFoundError( # noqa: B904 "This module is only useful in combination with PyTorch. To install PyTorch see https://pytorch.org/." ) diff --git a/onnxruntime/python/tools/quantization/CalTableFlatBuffers/KeyValue.py b/onnxruntime/python/tools/quantization/CalTableFlatBuffers/KeyValue.py index ba846b17eecdc..873d41324b514 100644 --- a/onnxruntime/python/tools/quantization/CalTableFlatBuffers/KeyValue.py +++ b/onnxruntime/python/tools/quantization/CalTableFlatBuffers/KeyValue.py @@ -8,71 +8,71 @@ np = import_numpy() -class KeyValue(object): +class KeyValue: __slots__ = ["_tab"] @classmethod - def GetRootAs(cls, buf, offset=0): + def GetRootAs(cls, buf, offset=0): # noqa: N802 n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) x = KeyValue() x.Init(buf, n + offset) return x @classmethod - def GetRootAsKeyValue(cls, buf, offset=0): + def GetRootAsKeyValue(cls, buf, offset=0): # noqa: N802 """This method is deprecated. Please switch to GetRootAs.""" return cls.GetRootAs(buf, offset) # KeyValue - def Init(self, buf, pos): + def Init(self, buf, pos): # noqa: N802 self._tab = flatbuffers.table.Table(buf, pos) # KeyValue - def Key(self): + def Key(self): # noqa: N802 o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) if o != 0: return self._tab.String(o + self._tab.Pos) return None # KeyValue - def Value(self): + def Value(self): # noqa: N802 o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(6)) if o != 0: return self._tab.String(o + self._tab.Pos) return None -def Start(builder): +def Start(builder): # noqa: N802 builder.StartObject(2) -def KeyValueStart(builder): +def KeyValueStart(builder): # noqa: N802 """This method is deprecated. Please switch to Start.""" return Start(builder) -def AddKey(builder, key): +def AddKey(builder, key): # noqa: N802 builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(key), 0) -def KeyValueAddKey(builder, key): +def KeyValueAddKey(builder, key): # noqa: N802 """This method is deprecated. Please switch to AddKey.""" return AddKey(builder, key) -def AddValue(builder, value): +def AddValue(builder, value): # noqa: N802 builder.PrependUOffsetTRelativeSlot(1, flatbuffers.number_types.UOffsetTFlags.py_type(value), 0) -def KeyValueAddValue(builder, value): +def KeyValueAddValue(builder, value): # noqa: N802 """This method is deprecated. Please switch to AddValue.""" return AddValue(builder, value) -def End(builder): +def End(builder): # noqa: N802 return builder.EndObject() -def KeyValueEnd(builder): +def KeyValueEnd(builder): # noqa: N802 """This method is deprecated. 
Please switch to End.""" return End(builder) diff --git a/onnxruntime/python/tools/quantization/CalTableFlatBuffers/TrtTable.py b/onnxruntime/python/tools/quantization/CalTableFlatBuffers/TrtTable.py index cf5202ee3b359..1c2cad56d1ce0 100644 --- a/onnxruntime/python/tools/quantization/CalTableFlatBuffers/TrtTable.py +++ b/onnxruntime/python/tools/quantization/CalTableFlatBuffers/TrtTable.py @@ -8,27 +8,27 @@ np = import_numpy() -class TrtTable(object): +class TrtTable: __slots__ = ["_tab"] @classmethod - def GetRootAs(cls, buf, offset=0): + def GetRootAs(cls, buf, offset=0): # noqa: N802 n = flatbuffers.encode.Get(flatbuffers.packer.uoffset, buf, offset) x = TrtTable() x.Init(buf, n + offset) return x @classmethod - def GetRootAsTrtTable(cls, buf, offset=0): + def GetRootAsTrtTable(cls, buf, offset=0): # noqa: N802 """This method is deprecated. Please switch to GetRootAs.""" return cls.GetRootAs(buf, offset) # TrtTable - def Init(self, buf, pos): + def Init(self, buf, pos): # noqa: N802 self._tab = flatbuffers.table.Table(buf, pos) # TrtTable - def Dict(self, j): + def Dict(self, j): # noqa: N802 o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) if o != 0: x = self._tab.Vector(o) @@ -42,49 +42,49 @@ def Dict(self, j): return None # TrtTable - def DictLength(self): + def DictLength(self): # noqa: N802 o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) if o != 0: return self._tab.VectorLen(o) return 0 # TrtTable - def DictIsNone(self): + def DictIsNone(self): # noqa: N802 o = flatbuffers.number_types.UOffsetTFlags.py_type(self._tab.Offset(4)) return o == 0 -def Start(builder): +def Start(builder): # noqa: N802 builder.StartObject(1) -def TrtTableStart(builder): +def TrtTableStart(builder): # noqa: N802 """This method is deprecated. Please switch to Start.""" return Start(builder) -def AddDict(builder, dict): +def AddDict(builder, dict): # noqa: N802 builder.PrependUOffsetTRelativeSlot(0, flatbuffers.number_types.UOffsetTFlags.py_type(dict), 0) -def TrtTableAddDict(builder, dict): +def TrtTableAddDict(builder, dict): # noqa: N802 """This method is deprecated. Please switch to AddDict.""" return AddDict(builder, dict) -def StartDictVector(builder, numElems): +def StartDictVector(builder, numElems): # noqa: N802 return builder.StartVector(4, numElems, 4) -def TrtTableStartDictVector(builder, numElems): +def TrtTableStartDictVector(builder, numElems): # noqa: N802 """This method is deprecated. Please switch to Start.""" return StartDictVector(builder, numElems) -def End(builder): +def End(builder): # noqa: N802 return builder.EndObject() -def TrtTableEnd(builder): +def TrtTableEnd(builder): # noqa: N802 """This method is deprecated. 
Please switch to End.""" return End(builder) diff --git a/onnxruntime/python/tools/quantization/__init__.py b/onnxruntime/python/tools/quantization/__init__.py index 4553805c57058..9d397499d45a4 100644 --- a/onnxruntime/python/tools/quantization/__init__.py +++ b/onnxruntime/python/tools/quantization/__init__.py @@ -1,12 +1,16 @@ -from .calibrate import CalibraterBase, CalibrationDataReader, CalibrationMethod, MinMaxCalibrater, create_calibrator -from .qdq_quantizer import QDQQuantizer -from .quant_utils import QuantFormat, QuantType, write_calibration_table -from .quantize import ( - DynamicQuantConfig, - QuantizationMode, - StaticQuantConfig, - quantize, - quantize_dynamic, - quantize_static, +from .calibrate import ( # noqa: F401 + CalibraterBase, + CalibrationDataReader, + CalibrationMethod, + MinMaxCalibrater, + create_calibrator, ) -from .shape_inference import quant_pre_process +from .qdq_quantizer import QDQQuantizer # noqa: F401 +from .quant_utils import QuantFormat, QuantType, write_calibration_table # noqa: F401 +from .quantize import DynamicQuantConfig # noqa: F401 +from .quantize import QuantizationMode # noqa: F401 +from .quantize import StaticQuantConfig # noqa: F401 +from .quantize import quantize # noqa: F401 +from .quantize import quantize_dynamic # noqa: F401 +from .quantize import quantize_static # noqa: F401 +from .shape_inference import quant_pre_process # noqa: F401 diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index b431647313ad4..edb8fd90c2573 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft, Intel Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -81,7 +80,7 @@ def __init__( self.infer_session = None self.execution_providers = ["CPUExecutionProvider"] - def set_execution_providers(self, execution_providers=["CPUExecutionProvider"]): + def set_execution_providers(self, execution_providers=["CPUExecutionProvider"]): # noqa: B006 """ reset the execution providers to execute the collect_data. It triggers to re-creating inference session. """ @@ -110,15 +109,15 @@ def select_tensors_to_calibrate(self, model): value_infos = {vi.name: vi for vi in model.graph.value_info} value_infos.update({ot.name: ot for ot in model.graph.output}) value_infos.update({it.name: it for it in model.graph.input}) - initializer = set(init.name for init in model.graph.initializer) + initializer = {init.name for init in model.graph.initializer} tensors_to_calibrate = set() - tensor_type_to_calibrate = set([TensorProto.FLOAT, TensorProto.FLOAT16]) + tensor_type_to_calibrate = {TensorProto.FLOAT, TensorProto.FLOAT16} for node in model.graph.node: if not self.op_types_to_calibrate or node.op_type in self.op_types_to_calibrate: for tensor_name in itertools.chain(node.input, node.output): - if tensor_name in value_infos.keys(): + if tensor_name in value_infos: vi = value_infos[tensor_name] if ( vi.type.HasField("tensor_type") @@ -176,7 +175,7 @@ def __init__( :param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum. :param averaging_constant: constant smoothing factor to use when computing the moving average. 
""" - super(MinMaxCalibrater, self).__init__( + super().__init__( model, op_types_to_calibrate=op_types_to_calibrate, augmented_model_path=augmented_model_path, @@ -186,7 +185,7 @@ def __init__( self.intermediate_outputs = [] self.calibrate_tensors_range = None self.num_model_outputs = len(self.model.graph.output) - self.model_original_outputs = set(output.name for output in self.model.graph.output) + self.model_original_outputs = {output.name for output in self.model.graph.output} self.moving_average = moving_average if moving_average and (averaging_constant < 0 or averaging_constant > 1): raise ValueError("Invalid averaging constant, which should not be < 0 or > 1.") @@ -292,9 +291,9 @@ def compute_range(self): added_output_names[i].rpartition("_")[0] for i in range(0, len(added_output_names), 2) ] # output names - merged_added_output_dict = dict( - (i, merged_output_dict[i]) for i in merged_output_dict if i not in self.model_original_outputs - ) + merged_added_output_dict = { + i: merged_output_dict[i] for i in merged_output_dict if i not in self.model_original_outputs + } pairs = [] for i in range(0, len(added_output_names), 2): @@ -350,7 +349,7 @@ def __init__( :param num_quantized_bins: number of quantized bins. Default 128. :param percentile: A float number between [0, 100]. Default 99.99. """ - super(HistogramCalibrater, self).__init__( + super().__init__( model, op_types_to_calibrate=op_types_to_calibrate, augmented_model_path=augmented_model_path, @@ -360,7 +359,7 @@ def __init__( self.intermediate_outputs = [] self.calibrate_tensors_range = None self.num_model_outputs = len(self.model.graph.output) - self.model_original_outputs = set(output.name for output in self.model.graph.output) + self.model_original_outputs = {output.name for output in self.model.graph.output} self.collector = None self.method = method self.num_bins = num_bins @@ -413,7 +412,7 @@ def collect_data(self, data_reader: CalibrationDataReader): for k, v in d.items(): merged_dict.setdefault(k, []).append(v) - clean_merged_dict = dict((i, merged_dict[i]) for i in merged_dict if i in self.tensors_to_calibrate) + clean_merged_dict = {i: merged_dict[i] for i in merged_dict if i in self.tensors_to_calibrate} if not self.collector: self.collector = HistogramCollector( @@ -460,7 +459,7 @@ def __init__( :param num_bins: number of bins to create a new histogram for collecting tensor values. :param num_quantized_bins: number of quantized bins. Default 128. """ - super(EntropyCalibrater, self).__init__( + super().__init__( model, op_types_to_calibrate, augmented_model_path, @@ -494,7 +493,7 @@ def __init__( :param num_quantized_bins: number of quantized bins. Default 128. :param percentile: A float number between [0, 100]. Default 99.99. """ - super(PercentileCalibrater, self).__init__( + super().__init__( model, op_types_to_calibrate, augmented_model_path, @@ -633,7 +632,6 @@ def collect_value(self, name_to_arr): ) def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_threshold): - (old_hist, old_hist_edges, old_min, old_max, old_threshold) = old_histogram if new_threshold <= old_threshold: @@ -668,7 +666,7 @@ def merge_histogram(self, old_histogram, data_arr, new_min, new_max, new_thresho def compute_collection_result(self): if not self.histogram_dict or len(self.histogram_dict) == 0: raise ValueError("Histogram has not been collected. 
Please run collect() first.") - print("Finding optimal threshold for each tensor using {} algorithm ...".format(self.method)) + print(f"Finding optimal threshold for each tensor using {self.method} algorithm ...") if self.method == "entropy": return self.compute_entropy() @@ -686,9 +684,9 @@ def compute_percentile(self): thresholds_dict = {} # per tensor thresholds - print("Number of tensors : {}".format(len(histogram_dict))) - print("Number of histogram bins : {}".format(self.num_bins)) - print("Percentile : ({},{})".format(100.0 - percentile, percentile)) + print(f"Number of tensors : {len(histogram_dict)}") + print(f"Number of histogram bins : {self.num_bins}") + print(f"Percentile : ({100.0 - percentile},{percentile})") for tensor, histogram in histogram_dict.items(): hist = histogram[0] @@ -728,13 +726,13 @@ def compute_entropy(self): thresholds_dict = {} # per tensor thresholds - print("Number of tensors : {}".format(len(histogram_dict))) + print(f"Number of tensors : {len(histogram_dict)}") print( "Number of histogram bins : {} (The number may increase depends on the data it collects)".format( self.num_bins ) ) - print("Number of quantized bins : {}".format(self.num_quantized_bins)) + print(f"Number of quantized bins : {self.num_quantized_bins}") for tensor, histogram in histogram_dict.items(): optimal_threshold = self.get_entropy_threshold(histogram, num_quantized_bins) @@ -847,9 +845,8 @@ def create_calibrator( augmented_model_path="augmented_model.onnx", calibrate_method=CalibrationMethod.MinMax, use_external_data_format=False, - extra_options={}, + extra_options={}, # noqa: B006 ): - calibrator = None if calibrate_method == CalibrationMethod.MinMax: # default settings for min-max algorithm @@ -899,4 +896,4 @@ def create_calibrator( calibrator.create_inference_session() return calibrator - raise ValueError("Unsupported calibration method {}".format(calibrate_method)) + raise ValueError(f"Unsupported calibration method {calibrate_method}") diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index 9a91e2ccee5d2..0fff72c542155 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -129,7 +129,7 @@ def get_initializer(self, name): return None def get_initializer_name_set(self): - return set(initializer.name for initializer in self.model.graph.initializer) + return {initializer.name for initializer in self.model.graph.initializer} def remove_initializer(self, tensor): if tensor in self.model.graph.initializer: @@ -260,25 +260,25 @@ def __replace_gemm_with_matmul(graph_path): if node.op_type == "Gemm": alpha = 1.0 beta = 1.0 - transA = 0 - transB = 0 + transA = 0 # noqa: N806 + transB = 0 # noqa: N806 for attr in node.attribute: if attr.name == "alpha": alpha = onnx_helper.get_attribute_value(attr) elif attr.name == "beta": beta = onnx_helper.get_attribute_value(attr) elif attr.name == "transA": - transA = onnx_helper.get_attribute_value(attr) + transA = onnx_helper.get_attribute_value(attr) # noqa: N806 elif attr.name == "transB": - transB = onnx_helper.get_attribute_value(attr) + transB = onnx_helper.get_attribute_value(attr) # noqa: N806 if alpha == 1.0 and beta == 1.0 and transA == 0: - inputB = node.input[1] + inputB = node.input[1] # noqa: N806 if transB == 1: - B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) + B, Bs_graph = ONNXModel.__get_initializer(node.input[1], graph_path) # noqa: N806 if B: # assume B is not used by 
any other node - B_array = onnx_numpy_helper.to_array(B) - B_trans = onnx_numpy_helper.from_array(B_array.T) + B_array = onnx_numpy_helper.to_array(B) # noqa: N806 + B_trans = onnx_numpy_helper.from_array(B_array.T) # noqa: N806 B_trans.name = B.name Bs_graph.initializer.remove(B) for input in Bs_graph.input: @@ -287,7 +287,7 @@ def __replace_gemm_with_matmul(graph_path): break Bs_graph.initializer.extend([B_trans]) else: - inputB += "_Transposed" + inputB += "_Transposed" # noqa: N806 transpose_node = onnx_helper.make_node( "Transpose", inputs=[node.input[1]], @@ -393,16 +393,10 @@ def remove_unused_constant(self): self.remove_initializers(ununsed_weights) def is_graph_output(self, output_name): - for output in self.model.graph.output: - if output.name == output_name: - return True - return False + return any(output.name == output_name for output in self.model.graph.output) def is_graph_input(self, tensor_name: str) -> bool: - for input in self.model.graph.input: - if input.name == tensor_name: - return True - return False + return any(input.name == tensor_name for input in self.model.graph.input) # TODO:use OnnxModel.graph_topological_sort(self.model.graph) from transformers.onnx_model # Currently it breaks Openvino/Linux training gpu pipeline so hold off for 1.8 release diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py index 3c54748ea9df0..6739393ba6ef6 100644 --- a/onnxruntime/python/tools/quantization/onnx_quantizer.py +++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py @@ -49,7 +49,6 @@ def __init__( op_types_to_quantize, extra_options=None, ): - if not model_has_infer_metadata(model): model = save_and_reload_model(model) self.value_infos = {vi.name: vi for vi in model.graph.value_info} @@ -114,8 +113,8 @@ def __init__( self.opset_version = self.check_opset_version() - if not self.mode in QuantizationMode: - raise ValueError("unsupported quantization mode {}".format(self.mode)) + if self.mode not in QuantizationMode: + raise ValueError(f"unsupported quantization mode {self.mode}") self.quantization_params = self.calculate_quantization_params() @@ -164,7 +163,7 @@ def quantize_subgraph(self, subgraph, graph_key): self.extra_options, ) sub_quanitzer.parent = self - sub_quanitzer.graph_scope = "{}{}/".format(self.graph_scope, graph_key) + sub_quanitzer.graph_scope = f"{self.graph_scope}{graph_key}/" sub_quanitzer.quantize_model() return sub_quanitzer.model.model.graph @@ -180,11 +179,11 @@ def quantize_node_with_sub_graph(self, node): ] if len(graph_attrs) == 0: return node - node_name = node.name if node.name != "" else "{}_node_count_{}".format(node.op_type, len(self.new_nodes)) + node_name = node.name if node.name != "" else f"{node.op_type}_node_count_{len(self.new_nodes)}" kwargs = {} for attr in node.attribute: if attr.type == onnx.AttributeProto.GRAPH: - kv = {attr.name: self.quantize_subgraph(attr.g, "{}:{}".format(node_name, attr.name))} + kv = {attr.name: self.quantize_subgraph(attr.g, f"{node_name}:{attr.name}")} elif attr.type == onnx.AttributeProto.GRAPHS: value = [] for subgraph in attr.graphs: @@ -192,7 +191,7 @@ def quantize_node_with_sub_graph(self, node): [ self.quantize_subgraph( subgraph, - "{}:{}:{}".format(node_name, attr.name, len(value)), + f"{node_name}:{attr.name}:{len(value)}", ) ] ) @@ -206,7 +205,7 @@ def check_opset_version(self): ai_onnx_domain = [ opset for opset in self.model.model.opset_import if not opset.domain or opset.domain == "ai.onnx" ] - if 1 != 
len(ai_onnx_domain): + if len(ai_onnx_domain) != 1: raise ValueError("Failed to find proper ai.onnx domain") opset_version = ai_onnx_domain[0].version @@ -231,7 +230,7 @@ def check_opset_version(self): self.fuse_dynamic_quant = True return opset_version - def has_QDQ_nodes(self): + def has_QDQ_nodes(self): # noqa: N802 """ Detect if model already has QuantizeLinear or DequantizeLinear. """ @@ -308,7 +307,7 @@ def is_float_tensor(self, tensor_name): if self.is_input_a_initializer(tensor_name): return self.is_valid_quantize_weight(tensor_name) - if tensor_name in self.value_infos.keys(): + if tensor_name in self.value_infos: vi = self.value_infos[tensor_name] if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: return True @@ -358,7 +357,7 @@ def _get_dynamic_input_quantization_params_int8(self, input_name, nodes_list): parameter nodes_list: new nodes are appended to this list. return: scale_name, zero_point_name, scale_shape, zero_point_shape. """ - qType = onnx_proto.TensorProto.INT8 + qType = onnx_proto.TensorProto.INT8 # noqa: N806 # Reduce min and Reduce max input_scale_name = input_name + "_scale" @@ -441,7 +440,7 @@ def _get_dynamic_input_quantization_params_uint8(self, input_name, nodes_list): parameter nodes_list: new nodes are appended to this list. return: scale_name, zero_point_name, scale_shape, zero_point_shape. """ - qType = onnx_proto.TensorProto.UINT8 + qType = onnx_proto.TensorProto.UINT8 # noqa: N806 # Reduce min and Reduce max input_scale_name = input_name + "_scale" input_zp_name = input_name + "_zero_point" @@ -536,7 +535,7 @@ def _get_quantization_params(self, param_name, use_scale=None, use_zeropoint=Non """ if use_scale is None or use_zeropoint is None: if self.quantization_params is None or param_name not in self.quantization_params: - logging.info('Quantization parameters for tensor:"{}" not specified'.format(param_name)) + logging.info(f'Quantization parameters for tensor:"{param_name}" not specified') return False, "", "", "", "" params = self.quantization_params[param_name] @@ -626,7 +625,7 @@ def _get_quantize_input_nodes(self, node, input_index, qType, given_scale_name=N ) self.quantized_value_map[input_name] = QuantizedValue(input_name, output_name, scale_name, zp_name, qType) - return nodes + [qlinear_node] + return [*nodes, qlinear_node] def set_quant_scale_zp(self, tensor_name, value): assert isinstance(value, tuple) and len(value) == 2, "value must be scale(float) and zeropoint" @@ -672,7 +671,7 @@ def quantize_bias_static(self, bias_name, input_name, weight_name, beta=1.0): elif input_name in self.quantization_params: _, input_scale_name, _, _, _ = self._get_quantization_params(input_name) else: - raise ValueError("Expected {} to be in quantized value map for static quantization".format(input_name)) + raise ValueError(f"Expected {input_name} to be in quantized value map for static quantization") inputscale_initializer = find_by_name(input_scale_name, self.model.initializer()) input_scale = tensor_proto_to_array(inputscale_initializer) @@ -808,7 +807,11 @@ def __quantize_inputs( initializer = find_by_name(node_input, self.model.initializer()) if initializer is not None: if self.per_channel and op_level_per_channel: - (q_weight_name, zp_name, scale_name,) = self.quantize_weight_per_channel( + ( + q_weight_name, + zp_name, + scale_name, + ) = self.quantize_weight_per_channel( initializer.name, self.weight_qType if initializer_use_weight_qType else self.activation_qType, axis, @@ -867,9 +870,7 @@ def 
__quantize_inputs( zero_point_names.append(parent_zero_point_names[0]) # node should not be add this child level here else: - raise ValueError( - "Invalid tensor name to quantize: {} @graph scope{}".format(node_input, self.graph_scope) - ) + raise ValueError(f"Invalid tensor name to quantize: {node_input} @graph scope{self.graph_scope}") return quantized_input_names, zero_point_names, scale_names, nodes @@ -1068,7 +1069,7 @@ def calculate_quantization_params(self): continue self.tensors_range[node.input[0]] = self.tensors_range[node.output[0]] quantization_params = {} - for tensor_name in self.tensors_range.keys(): + for tensor_name in self.tensors_range: rmin, rmax = self.tensors_range[tensor_name] qmin, qmax = get_qmin_qmax_for_qType(self.activation_qType, symmetric=self.is_activation_symmetric) diff --git a/onnxruntime/python/tools/quantization/operators/activation.py b/onnxruntime/python/tools/quantization/operators/activation.py index 1029e7b679b60..4335e458a1701 100644 --- a/onnxruntime/python/tools/quantization/operators/activation.py +++ b/onnxruntime/python/tools/quantization/operators/activation.py @@ -1,5 +1,4 @@ import onnx -from onnx import onnx_pb as onnx_proto from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase @@ -10,7 +9,7 @@ class QLinearActivation(QuantOperatorBase): def __init__(self, onnx_quantizer, onnx_node): super().__init__(onnx_quantizer, onnx_node) - def QuantizeClipRelu(self): + def QuantizeClipRelu(self): # noqa: N802 node = self.node assert node.op_type == "Relu" or node.op_type == "Clip" diff --git a/onnxruntime/python/tools/quantization/operators/attention.py b/onnxruntime/python/tools/quantization/operators/attention.py index 36428b860d060..e2581e00b54fa 100644 --- a/onnxruntime/python/tools/quantization/operators/attention.py +++ b/onnxruntime/python/tools/quantization/operators/attention.py @@ -1,5 +1,5 @@ import onnx -from onnx import onnx_pb as onnx_proto +from onnx import onnx_pb as onnx_proto # noqa: F401 from ..quant_utils import attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase @@ -29,7 +29,7 @@ def quantize(self): # attribute. 
This needs to be removed once the QAttention for varied q,k,v sizes # is implemented for attr in node.attribute: - if "qkv_hidden_sizes" == attr.name: + if attr.name == "qkv_hidden_sizes": return super().quantize() ( diff --git a/onnxruntime/python/tools/quantization/operators/binary_op.py b/onnxruntime/python/tools/quantization/operators/binary_op.py index 3beb96aabe575..69440e5cf738d 100644 --- a/onnxruntime/python/tools/quantization/operators/binary_op.py +++ b/onnxruntime/python/tools/quantization/operators/binary_op.py @@ -1,5 +1,5 @@ import onnx -from onnx import onnx_pb as onnx_proto +from onnx import onnx_pb as onnx_proto # noqa: F401 from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/concat.py b/onnxruntime/python/tools/quantization/operators/concat.py index 998ca5c558743..833932a46ba59 100644 --- a/onnxruntime/python/tools/quantization/operators/concat.py +++ b/onnxruntime/python/tools/quantization/operators/concat.py @@ -1,8 +1,14 @@ import onnx -from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain +from ..quant_utils import ( # noqa: F401 + TENSOR_NAME_QUANT_SUFFIX, + QuantizedValue, + QuantizedValueType, + attribute_to_kwarg, + ms_domain, +) from .base_operator import QuantOperatorBase -from .qdq_base_operator import QDQOperatorBase +from .qdq_base_operator import QDQOperatorBase # noqa: F401 class QLinearConcat(QuantOperatorBase): diff --git a/onnxruntime/python/tools/quantization/operators/conv.py b/onnxruntime/python/tools/quantization/operators/conv.py index 0d137ab2eff14..68d1add0426d0 100644 --- a/onnxruntime/python/tools/quantization/operators/conv.py +++ b/onnxruntime/python/tools/quantization/operators/conv.py @@ -2,9 +2,9 @@ import onnx from onnx import onnx_pb as onnx_proto +from ..quant_utils import BiasToQuantize # noqa: F401 from ..quant_utils import ( TENSOR_NAME_QUANT_SUFFIX, - BiasToQuantize, QuantizedValue, QuantizedValueType, attribute_to_kwarg, @@ -34,7 +34,7 @@ def add_bias(self, nodes, scaled_output): # Add tensors for the shape to be reshaped to weight = find_by_name(node.input[1], model.initializer()) if weight is None: - raise ValueError("Expected {} to be an initializer".format(node.input[1])) + raise ValueError(f"Expected {node.input[1]} to be an initializer") # Add reshape for correct broadcase output = node.output[0] diff --git a/onnxruntime/python/tools/quantization/operators/embed_layernorm.py b/onnxruntime/python/tools/quantization/operators/embed_layernorm.py index 01b5fad1c3c75..8e168c34453a5 100644 --- a/onnxruntime/python/tools/quantization/operators/embed_layernorm.py +++ b/onnxruntime/python/tools/quantization/operators/embed_layernorm.py @@ -1,7 +1,7 @@ import logging import onnx -from onnx import onnx_pb as onnx_proto +from onnx import onnx_pb as onnx_proto # noqa: F401 from ..quant_utils import attribute_to_kwarg, ms_domain from .base_operator import QuantOperatorBase diff --git a/onnxruntime/python/tools/quantization/operators/gemm.py b/onnxruntime/python/tools/quantization/operators/gemm.py index 07e7678a34957..39f9e0950fd3c 100644 --- a/onnxruntime/python/tools/quantization/operators/gemm.py +++ b/onnxruntime/python/tools/quantization/operators/gemm.py @@ -1,27 +1,21 @@ import logging -import numpy as np +import numpy as np # noqa: F401 import onnx from onnx import onnx_pb as onnx_proto -from 
..quant_utils import ( - TENSOR_NAME_QUANT_SUFFIX, - QuantizedValue, - QuantizedValueType, - attribute_to_kwarg, - find_by_name, - get_mul_node, - ms_domain, -) -from .base_operator import QuantOperatorBase +from ..quant_utils import find_by_name # noqa: F401 +from ..quant_utils import get_mul_node # noqa: F401 +from ..quant_utils import TENSOR_NAME_QUANT_SUFFIX, QuantizedValue, QuantizedValueType, attribute_to_kwarg, ms_domain +from .base_operator import QuantOperatorBase # noqa: F401 from .matmul import QOpMatMul from .qdq_base_operator import QDQOperatorBase -def is_B_transposed(gemm_node): - transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] +def is_B_transposed(gemm_node): # noqa: N802 + transB_attribute = [attr for attr in gemm_node.attribute if attr.name == "transB"] # noqa: N806 if len(transB_attribute): - return 0 < onnx.helper.get_attribute_value(transB_attribute[0]) + return onnx.helper.get_attribute_value(transB_attribute[0]) > 0 return False diff --git a/onnxruntime/python/tools/quantization/operators/lstm.py b/onnxruntime/python/tools/quantization/operators/lstm.py index 87552a18a037e..0ab46cb9c42a9 100644 --- a/onnxruntime/python/tools/quantization/operators/lstm.py +++ b/onnxruntime/python/tools/quantization/operators/lstm.py @@ -2,7 +2,7 @@ import onnx from onnx import onnx_pb as onnx_proto -from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain +from ..quant_utils import QuantType, attribute_to_kwarg, ms_domain # noqa: F401 from .base_operator import QuantOperatorBase """ @@ -30,15 +30,15 @@ def quantize(self): return model = self.quantizer.model - W = model.get_initializer(node.input[1]) - R = model.get_initializer(node.input[2]) + W = model.get_initializer(node.input[1]) # noqa: N806 + R = model.get_initializer(node.input[2]) # noqa: N806 if len(W.dims) != 3 or len(R.dims) != 3: super().quantize() return - [W_num_dir, W_4_hidden_size, W_input_size] = W.dims - [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims + [W_num_dir, W_4_hidden_size, W_input_size] = W.dims # noqa: N806 + [R_num_dir, R_4_hidden_size, R_hidden_size] = R.dims # noqa: N806 if self.quantizer.is_per_channel(): del W.dims[0] @@ -53,29 +53,29 @@ def quantize(self): node.input[2], onnx_proto.TensorProto.INT8, 0 ) - W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) - R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) + W_quant_weight = model.get_initializer(quant_input_weight_tuple[0]) # noqa: N806 + R_quant_weight = model.get_initializer(quant_recurrent_weight_tuple[0]) # noqa: N806 - W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) - R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) + W_quant_array = onnx.numpy_helper.to_array(W_quant_weight) # noqa: N806 + R_quant_array = onnx.numpy_helper.to_array(R_quant_weight) # noqa: N806 - W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) - R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) + W_quant_array = numpy.reshape(W_quant_array, (W_num_dir, W_4_hidden_size, W_input_size)) # noqa: N806 + R_quant_array = numpy.reshape(R_quant_array, (R_num_dir, R_4_hidden_size, R_hidden_size)) # noqa: N806 - W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) - R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) + W_quant_array = numpy.transpose(W_quant_array, (0, 2, 1)) # noqa: N806 + R_quant_array = numpy.transpose(R_quant_array, (0, 2, 1)) # noqa: N806 - W_quant_tranposed = 
onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0]) - R_quant_tranposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0]) + W_quant_tranposed = onnx.numpy_helper.from_array(W_quant_array, quant_input_weight_tuple[0]) # noqa: N806 + R_quant_tranposed = onnx.numpy_helper.from_array(R_quant_array, quant_recurrent_weight_tuple[0]) # noqa: N806 model.remove_initializers([W_quant_weight, R_quant_weight]) model.add_initializer(W_quant_tranposed) model.add_initializer(R_quant_tranposed) - W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) - R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) - W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) - R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) + W_quant_zp = model.get_initializer(quant_input_weight_tuple[1]) # noqa: N806 + R_quant_zp = model.get_initializer(quant_recurrent_weight_tuple[1]) # noqa: N806 + W_quant_scale = model.get_initializer(quant_input_weight_tuple[2]) # noqa: N806 + R_quant_scale = model.get_initializer(quant_recurrent_weight_tuple[2]) # noqa: N806 if self.quantizer.is_per_channel(): W_quant_zp.dims[:] = [W_num_dir, W_4_hidden_size] diff --git a/onnxruntime/python/tools/quantization/operators/matmul.py b/onnxruntime/python/tools/quantization/operators/matmul.py index 1bbc73cfdc68d..5cd59f337df0b 100644 --- a/onnxruntime/python/tools/quantization/operators/matmul.py +++ b/onnxruntime/python/tools/quantization/operators/matmul.py @@ -24,7 +24,7 @@ def should_quantize(self): # do not quantize non-constant B matrices for matmul if self.quantizer.q_matmul_const_b_only: if not self.quantizer.find_initializer_in_path(self.node.input[1]): - print("Ignore MatMul due to non constant B: {}[{}]".format(self.quantizer.graph_scope, self.node.name)) + print(f"Ignore MatMul due to non constant B: {self.quantizer.graph_scope}[{self.node.name}]") return False return True diff --git a/onnxruntime/python/tools/quantization/operators/pad.py b/onnxruntime/python/tools/quantization/operators/pad.py index e7eeac2cec3ef..2d1690e545263 100644 --- a/onnxruntime/python/tools/quantization/operators/pad.py +++ b/onnxruntime/python/tools/quantization/operators/pad.py @@ -1,4 +1,3 @@ -import numpy as np import onnx from ..quant_utils import ( diff --git a/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py b/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py index 0fe05df5191fa..69448c73d30b2 100644 --- a/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py +++ b/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py @@ -1,16 +1,14 @@ import itertools -from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray -from .base_operator import QuantOperatorBase +from ..quant_utils import QuantizedValue, QuantizedValueType, attribute_to_kwarg, quantize_nparray # noqa: F401 +from .base_operator import QuantOperatorBase # noqa: F401 class QDQOperatorBase: def __init__(self, onnx_quantizer, onnx_node): self.quantizer = onnx_quantizer self.node = onnx_node - self.disable_qdq_for_node_output = ( - True if onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization else False - ) + self.disable_qdq_for_node_output = onnx_node.op_type in onnx_quantizer.op_types_to_exclude_output_quantization def quantize(self): node = self.node diff --git a/onnxruntime/python/tools/quantization/qdq_loss_debug.py 
b/onnxruntime/python/tools/quantization/qdq_loss_debug.py index a3adf675d890c..5eed354a05f89 100644 --- a/onnxruntime/python/tools/quantization/qdq_loss_debug.py +++ b/onnxruntime/python/tools/quantization/qdq_loss_debug.py @@ -10,7 +10,7 @@ A use case is to debug quantization induced accuracy drop. An AI engineer can run the original float32 model and the quantized model with the same inputs, then compare the corresponding activations between the two models to find -where the divergence is. +where the divergence is. Example Usage: diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index a970e72aa1f86..5f3cc3ff93160 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -105,7 +105,7 @@ def __init__( self.quantize_bias = True if "QuantizeBias" not in extra_options else extra_options["QuantizeBias"] # The default behavior is that multiple nodes can share a QDQ pair as their inputs. - # In TRT, QDQ pair can’t be shared between nodes, so it will create dedicated QDQ pairs for each node. + # In TRT, QDQ pair can`t be shared between nodes, so it will create dedicated QDQ pairs for each node. self.dedicated_qdq_pair = ( False if "DedicatedQDQPair" not in extra_options else extra_options["DedicatedQDQPair"] ) @@ -127,7 +127,7 @@ def _is_tensor_quantizable(self, tensor_name): if weight is not None: if weight.data_type == onnx_proto.TensorProto.FLOAT: return True - elif tensor_name in self.value_infos.keys(): + elif tensor_name in self.value_infos: vi = self.value_infos[tensor_name] if vi.type.HasField("tensor_type") and vi.type.tensor_type.elem_type == TensorProto.FLOAT: return True @@ -186,9 +186,7 @@ def quantize_weight_tensor_per_channel(self, tensor_name, axis): tensor_type=QDQQuantTensorType.WEIGHT, axis=axis ) else: - logging.warning( - "only support per-channel quantization on weight. Tensor: {} is not quantized.".format(tensor_name) - ) + logging.warning(f"only support per-channel quantization on weight. 
Tensor: {tensor_name} is not quantized.") def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta=1.0): weight = find_by_name(bias_name, self.model.initializer()) @@ -196,7 +194,7 @@ def quantize_bias_tensor(self, bias_name, input_name, weight_name, beta=1.0): if weight.data_type == onnx_proto.TensorProto.FLOAT: self.bias_to_quantize.append((bias_name, input_name, weight_name, beta)) else: - logging.warning("Expected {} to be a weight".format(bias_name)) + logging.warning(f"Expected {bias_name} to be a weight") def remove_node(self, node): self.nodes_to_remove.append(node) @@ -231,7 +229,7 @@ def quantize_model(self): def try_replacing_upstream_output(self, upstream_output_name, output_name): if ( - output_name in self.quantization_params.keys() + output_name in self.quantization_params and len(self.model.input_name_to_nodes()[upstream_output_name]) == 1 and not self.model.is_graph_output(upstream_output_name) and not self.model.is_graph_input(upstream_output_name) @@ -369,7 +367,7 @@ def _add_qdq_pair_for_activation(self, tensor_name, scale_name, zp_name): def _quantize_normal_tensors(self): for tensor_name, tensor_info in self.tensors_to_quantize.copy().items(): - if tensor_name in self.quantized_value_map.keys(): + if tensor_name in self.quantized_value_map: continue if not tensor_info.is_shared: @@ -409,7 +407,7 @@ def _quantize_sharing_param_tensors(self): def _quantize_bias_tensors(self): for bias_name, input_name, weight_name, beta in self.bias_to_quantize: - if bias_name in self.quantized_value_map.keys(): + if bias_name in self.quantized_value_map: continue # Quantize the input self.quantize_bias_static(bias_name, input_name, weight_name, beta) diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 2ceefeadcd1e5..02d62dd41b894 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -56,7 +56,7 @@ def from_string(mode): try: return QuantizationMode[mode] except KeyError: - raise ValueError() + raise ValueError() # noqa: B904 class QuantizedValueType(Enum): @@ -71,7 +71,7 @@ def from_string(v): try: return QuantizedValueType[v] except KeyError: - raise ValueError() + raise ValueError() # noqa: B904 class QuantType(Enum): @@ -86,7 +86,7 @@ def from_string(t): try: return QuantType[t] except KeyError: - raise ValueError() + raise ValueError() # noqa: B904 class QuantFormat(Enum): @@ -101,7 +101,7 @@ def from_string(format): try: return QuantFormat[format] except KeyError: - raise ValueError() + raise ValueError() # noqa: B904 ONNX_TYPE_TO_NP_TYPE = { @@ -111,9 +111,7 @@ def from_string(format): def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None): - assert ( - qType in ONNX_TYPE_TO_NP_TYPE - ), "Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType) + assert qType in ONNX_TYPE_TO_NP_TYPE, f"Unexpected data type {qType} requested. Only INT8 and UINT8 are supported." 
    dtype = ONNX_TYPE_TO_NP_TYPE[qType]
     cliplow = max(0 if dtype == numpy.uint8 else -127, -127 if low is None else low)
     cliphigh = min(255 if dtype == numpy.uint8 else 127, 255 if high is None else high)
@@ -204,7 +202,7 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
     return rmin, rmax, zero_point, scale, quantized_data


-def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False):
+def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False):  # noqa: N802
     """
     Return qmin and qmax, the minimum and maximum value representable by the given qType
     :parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.UINT8
@@ -218,11 +216,11 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False):
         else:
             (qmin, qmax) = (-64, 64) if reduce_range else (-128, 127)
     else:
-        raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))
+        raise ValueError(f"Unexpected data type {qType} requested. Only INT8 and UINT8 are supported.")
     return qmin, qmax


-def get_qrange_for_qType(qType, reduce_range=False, symmetric=False):
+def get_qrange_for_qType(qType, reduce_range=False, symmetric=False):  # noqa: N802
     """
     Helper function to get the quantization range for a type.
         parameter qType: quantization type.
@@ -245,8 +243,8 @@ def __init__(
         rmaxs,
         zero_points,
         scales,
-        data=[],
-        quantized_data=[],
+        data=[],  # noqa: B006
+        quantized_data=[],  # noqa: B006
         axis=None,
     ):
         self.name = name
@@ -265,7 +263,7 @@ def __init__(

 class QuantizedValue:
     """
-    Represents a linearly quantized value (input\output\intializer)
+    Represents a linearly quantized value (input\\output\\intializer)
     """

     def __init__(
@@ -303,7 +301,7 @@ def attribute_to_kwarg(attribute):
     :return: attribute in {key: value} format.
""" if attribute.type == 0: - raise ValueError("attribute {} does not have type specified.".format(attribute.name)) + raise ValueError(f"attribute {attribute.name} does not have type specified.") # Based on attribute type definitions from AttributeProto # definition in https://github.com/onnx/onnx/blob/main/onnx/onnx.proto @@ -328,7 +326,7 @@ def attribute_to_kwarg(attribute): elif attribute.type == 10: value = attribute.graphs else: - raise ValueError("attribute {} has unsupported type {}.".format(attribute.name, attribute.type)) + raise ValueError(f"attribute {attribute.name} has unsupported type {attribute.type}.") return {attribute.name: value} @@ -403,7 +401,7 @@ def write_calibration_table(calibration_cache): import onnxruntime.quantization.CalTableFlatBuffers.KeyValue as KeyValue import onnxruntime.quantization.CalTableFlatBuffers.TrtTable as TrtTable - logging.info("calibration cache: {}".format(calibration_cache)) + logging.info(f"calibration cache: {calibration_cache}") with open("calibration.json", "w") as file: file.write(json.dumps(calibration_cache)) # use `json.loads` to do the reverse diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py index 2a0e922f9b756..188cbfb96acce 100644 --- a/onnxruntime/python/tools/quantization/registry.py +++ b/onnxruntime/python/tools/quantization/registry.py @@ -84,20 +84,20 @@ } -def CreateDefaultOpQuantizer(onnx_quantizer, node): +def CreateDefaultOpQuantizer(onnx_quantizer, node): # noqa: N802 return QuantOperatorBase(onnx_quantizer, node) -def CreateOpQuantizer(onnx_quantizer, node): +def CreateOpQuantizer(onnx_quantizer, node): # noqa: N802 registry = IntegerOpsRegistry if onnx_quantizer.mode == QuantizationMode.IntegerOps else QLinearOpsRegistry - if node.op_type in registry.keys(): + if node.op_type in registry: op_quantizer = registry[node.op_type](onnx_quantizer, node) if op_quantizer.should_quantize(): return op_quantizer return QuantOperatorBase(onnx_quantizer, node) -def CreateQDQQuantizer(onnx_quantizer, node): - if node.op_type in QDQRegistry.keys(): +def CreateQDQQuantizer(onnx_quantizer, node): # noqa: N802 + if node.op_type in QDQRegistry: return QDQRegistry[node.op_type](onnx_quantizer, node) return QDQOperatorBase(onnx_quantizer, node) diff --git a/onnxruntime/python/tools/quantization/shape_inference.py b/onnxruntime/python/tools/quantization/shape_inference.py index 7df2dec59bf42..9aaac95a8dc5b 100644 --- a/onnxruntime/python/tools/quantization/shape_inference.py +++ b/onnxruntime/python/tools/quantization/shape_inference.py @@ -89,7 +89,7 @@ def quant_pre_process( sess_option.optimized_model_filepath = opt_model_path sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC _ = onnxruntime.InferenceSession(input_model_path, sess_option, providers=["CPUExecutionProvider"]) - except Exception as e: + except Exception: logger.error( "ONNX Runtime Model Optimization Failed! Consider rerun with option `--skip_optimization'." 
) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index ce221cb2eb43f..685d3b19500ef 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -53,7 +53,7 @@ def get_shape_from_value_info(vi): if cls_type is None: return None if is_sequence(vi.type): - if "tensor_type" == vi.type.sequence_type.elem_type.WhichOneof("value"): + if vi.type.sequence_type.elem_type.WhichOneof("value") == "tensor_type": return get_shape_from_type_proto(vi.type.sequence_type.elem_type) else: return None @@ -297,19 +297,14 @@ def _apply_suggested_merge(self, graph_input_only=False): def _preprocess(self, in_mp): self.out_mp_ = onnx.ModelProto() self.out_mp_.CopyFrom(in_mp) - self.graph_inputs_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) - self.initializers_ = dict([(i.name, i) for i in self.out_mp_.graph.initializer]) - self.known_vi_ = dict([(i.name, i) for i in list(self.out_mp_.graph.input)]) + self.graph_inputs_ = {i.name: i for i in list(self.out_mp_.graph.input)} + self.initializers_ = {i.name: i for i in self.out_mp_.graph.initializer} + self.known_vi_ = {i.name: i for i in list(self.out_mp_.graph.input)} self.known_vi_.update( - dict( - [ - ( - i.name, - helper.make_tensor_value_info(i.name, i.data_type, list(i.dims)), - ) - for i in self.out_mp_.graph.initializer - ] - ) + { + i.name: helper.make_tensor_value_info(i.name, i.data_type, list(i.dims)) + for i in self.out_mp_.graph.initializer + } ) def _merge_symbols(self, dims): @@ -331,7 +326,7 @@ def _merge_symbols(self, dims): return unique_dims[int_dim] else: if self.verbose_ > 0: - logger.debug("dim {} has been merged with dim {}".format(unique_dims[1:], unique_dims[0])) + logger.debug(f"dim {unique_dims[1:]} has been merged with dim {unique_dims[0]}") return dims[0] else: return None @@ -367,7 +362,7 @@ def _broadcast_shapes(self, shape1, shape2): self._add_suggested_merge([dim1, dim2], apply=True) else: logger.warning("unsupported broadcast between " + str(dim1) + " " + str(dim2)) - new_shape = [new_dim] + new_shape + new_shape = [new_dim, *new_shape] return new_shape def _get_shape(self, node, idx): @@ -403,7 +398,7 @@ def _get_sympy_shape(self, node, idx): else sympy.Symbol(d, integer=True, nonnegative=True) ) else: - assert None != d + assert None is not d sympy_shape.append(d) return sympy_shape @@ -422,7 +417,7 @@ def _try_get_value(self, node, idx): def _update_computed_dims(self, new_sympy_shape): for i, new_dim in enumerate(new_sympy_shape): - if not is_literal(new_dim) and not type(new_dim) == str: + if not is_literal(new_dim) and type(new_dim) != str: str_dim = str(new_dim) if str_dim in self.suggested_merge_: if is_literal(self.suggested_merge_[str_dim]): @@ -430,7 +425,7 @@ def _update_computed_dims(self, new_sympy_shape): new_sympy_shape[i] = self.symbolic_dims_[self.suggested_merge_[str_dim]] else: # add new_dim if it's a computational expression - if not str(new_dim) in self.symbolic_dims_: + if str(new_dim) not in self.symbolic_dims_: self.symbolic_dims_[str(new_dim)] = new_dim def _onnx_infer_single_node(self, node): @@ -499,15 +494,13 @@ def _onnx_infer_single_node(self, node): def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph_id=True): if self.verbose_ > 2: - logger.debug( - "Inferencing subgraph of node {} with output({}...): {}".format(node.name, node.output[0], node.op_type) - ) + logger.debug(f"Inferencing subgraph of node {node.name} with 
output({node.output[0]}...): {node.op_type}") # node inputs are not passed directly to the subgraph # it's up to the node dispatcher to prepare subgraph input # for example, with Scan/Loop, subgraph input shape would be trimmed from node input shape # besides, inputs in subgraph could shadow implicit inputs - subgraph_inputs = set([i.name for i in list(subgraph.initializer) + list(subgraph.input)]) - subgraph_implicit_input = set([name for name in self.known_vi_.keys() if not name in subgraph_inputs]) + subgraph_inputs = {i.name for i in list(subgraph.initializer) + list(subgraph.input)} + subgraph_implicit_input = {name for name in self.known_vi_ if name not in subgraph_inputs} tmp_graph = helper.make_graph( list(subgraph.node), "tmp", @@ -528,11 +521,10 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph if inc_subgraph_id: self.subgraph_id_ += 1 - all_shapes_inferred = False symbolic_shape_inference._preprocess(self.tmp_mp_) symbolic_shape_inference.suggested_merge_ = self.suggested_merge_.copy() while symbolic_shape_inference.run_: - all_shapes_inferred = symbolic_shape_inference._infer_impl(self.sympy_data_.copy()) + symbolic_shape_inference._infer_impl(self.sympy_data_.copy()) symbolic_shape_inference._update_output_from_vi() if use_node_input: # if subgraph uses node input, it needs to update to merged dims @@ -546,9 +538,9 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph subgraph.node.extend(symbolic_shape_inference.out_mp_.graph.node) # for new symbolic dims from subgraph output, add to main graph symbolic dims subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output] - subgraph_new_symbolic_dims = set( - [d for s in subgraph_shapes if s for d in s if type(d) == str and not d in self.symbolic_dims_] - ) + subgraph_new_symbolic_dims = { + d for s in subgraph_shapes if s for d in s if type(d) == str and d not in self.symbolic_dims_ + } new_dims = {} for d in subgraph_new_symbolic_dims: assert d in symbolic_shape_inference.symbolic_dims_ @@ -617,7 +609,7 @@ def _pass_on_shape_and_type(self, node): ) def _new_symbolic_dim(self, prefix, dim): - new_dim = "{}_d{}".format(prefix, dim) + new_dim = f"{prefix}_d{dim}" if new_dim in self.suggested_merge_: v = self.suggested_merge_[new_dim] new_symbolic_dim = sympy.Integer(int(v)) if is_literal(v) else v @@ -643,12 +635,12 @@ def _new_symbolic_shape(self, rank, node, out_idx=0): def _compute_conv_pool_shape(self, node, channels_last=False): sympy_shape = self._get_sympy_shape(node, 0) if len(node.input) > 1: - W_shape = self._get_sympy_shape(node, 1) + W_shape = self._get_sympy_shape(node, 1) # noqa: N806 rank = len(W_shape) - 2 # number of spatial axes kernel_shape = W_shape[-rank - 1 : -1] if channels_last else W_shape[-rank:] sympy_shape[3 if channels_last else 1] = W_shape[0] else: - W_shape = None + W_shape = None # noqa: N806 kernel_shape = get_attribute(node, "kernel_shape") rank = len(kernel_shape) @@ -733,7 +725,7 @@ def _compute_matmul_shape(self, node, output_dtype=None): else: lhs_reduce_dim = -1 rhs_reduce_dim = -2 - new_shape = self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]) + [lhs_shape[-2]] + [rhs_shape[-1]] + new_shape = [*self._broadcast_shapes(lhs_shape[:-2], rhs_shape[:-2]), lhs_shape[-2]] + [rhs_shape[-1]] # merge reduce dim self._check_merged_dims( [lhs_shape[lhs_reduce_dim], rhs_shape[rhs_reduce_dim]], @@ -774,7 +766,7 @@ def _fuse_tensor_type(self, node, out_idx, dst_type, src_type): else: 
dst_tensor_type.CopyFrom(src_tensor_type) - def _infer_ArrayFeatureExtractor(self, node): + def _infer_ArrayFeatureExtractor(self, node): # noqa: N802 data_shape = self._get_shape(node, 0) indices_shape = self._get_shape(node, 1) vi = self.known_vi_[node.output[0]] @@ -788,28 +780,28 @@ def _infer_ArrayFeatureExtractor(self, node): def _infer_symbolic_compute_ops(self, node): funcs = { - "Add": lambda l: l[0] + l[1], - "Div": lambda l: l[0] // l[1], # integer div in sympy - "Equal": lambda l: l[0] == l[1], - "Floor": lambda l: sympy.floor(l[0]), - "Max": lambda l: l[1] + "Add": lambda l: l[0] + l[1], # noqa: E741 + "Div": lambda l: l[0] // l[1], # integer div in sympy # noqa: E741 + "Equal": lambda l: l[0] == l[1], # noqa: E741 + "Floor": lambda l: sympy.floor(l[0]), # noqa: E741 + "Max": lambda l: l[1] # noqa: E741 if is_literal(l[0]) and int(l[0]) < -self.int_max_ else (l[0] if is_literal(l[1]) and int(l[1]) < -self.int_max_ else sympy.Max(l[0], l[1])), - "Min": lambda l: l[1] + "Min": lambda l: l[1] # noqa: E741 if is_literal(l[0]) and int(l[0]) > self.int_max_ else (l[0] if is_literal(l[1]) and int(l[1]) > self.int_max_ else sympy.Min(l[0], l[1])), - "Mul": lambda l: l[0] * l[1], - "Sub": lambda l: l[0] - l[1], - "Where": lambda l: l[1] if l[0] else l[2], - "Neg": lambda l: -l[0], + "Mul": lambda l: l[0] * l[1], # noqa: E741 + "Sub": lambda l: l[0] - l[1], # noqa: E741 + "Where": lambda l: l[1] if l[0] else l[2], # noqa: E741 + "Neg": lambda l: -l[0], # noqa: E741 } assert node.op_type in funcs self._compute_on_sympy_data(node, funcs[node.op_type]) - def _infer_Cast(self, node): + def _infer_Cast(self, node): # noqa: N802 self._pass_on_sympy_data(node) - def _infer_CategoryMapper(self, node): + def _infer_CategoryMapper(self, node): # noqa: N802 input_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type if input_type == onnx.TensorProto.STRING: output_type = onnx.TensorProto.INT64 @@ -818,12 +810,12 @@ def _infer_CategoryMapper(self, node): vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_type, self._get_shape(node, 0))) - def _infer_Compress(self, node): + def _infer_Compress(self, node): # noqa: N802 input_shape = self._get_shape(node, 0) # create a new symbolic dimension for Compress output compress_len = str(self._new_symbolic_dim_from_output(node)) axis = get_attribute(node, "axis") - if axis == None: + if axis is None: # when axis is not specified, input is flattened before compress so output is 1D output_shape = [compress_len] else: @@ -838,11 +830,11 @@ def _infer_Compress(self, node): ) ) - def _infer_Concat(self, node): + def _infer_Concat(self, node): # noqa: N802 if any([i in self.sympy_data_ or i in self.initializers_ for i in node.input]): values = self._get_int_values(node) if all([v is not None for v in values]): - assert 0 == get_attribute(node, "axis") + assert get_attribute(node, "axis") == 0 self.sympy_data_[node.output[0]] = [] for i in range(len(node.input)): value = values[i] @@ -879,7 +871,7 @@ def _infer_Concat(self, node): ) ) - def _infer_ConcatFromSequence(self, node): + def _infer_ConcatFromSequence(self, node): # noqa: N802 seq_shape = self._get_shape(node, 0) new_axis = 1 if get_attribute(node, "new_axis") else 0 axis = handle_negative_axis(get_attribute(node, "axis"), len(seq_shape) + new_axis) @@ -898,11 +890,11 @@ def _infer_ConcatFromSequence(self, node): ) ) - def _infer_Constant(self, node): + def _infer_Constant(self, node): # noqa: N802 t = get_attribute(node, "value") 
self.sympy_data_[node.output[0]] = numpy_helper.to_array(t) - def _infer_ConstantOfShape(self, node): + def _infer_ConstantOfShape(self, node): # noqa: N802 sympy_shape = self._get_int_values(node)[0] vi = self.known_vi_[node.output[0]] if sympy_shape is not None: @@ -927,7 +919,7 @@ def _infer_ConstantOfShape(self, node): ) ) - def _infer_Conv(self, node): + def _infer_Conv(self, node): # noqa: N802 sympy_shape = self._compute_conv_pool_shape(node) self._update_computed_dims(sympy_shape) vi = self.known_vi_[node.output[0]] @@ -939,7 +931,7 @@ def _infer_Conv(self, node): ) ) - def _infer_NhwcConv(self, node): + def _infer_NhwcConv(self, node): # noqa: N802 sympy_shape = self._compute_conv_pool_shape(node, channels_last=True) self._update_computed_dims(sympy_shape) vi = self.known_vi_[node.output[0]] @@ -951,7 +943,7 @@ def _infer_NhwcConv(self, node): ) ) - def _infer_Einsum(self, node): + def _infer_Einsum(self, node): # noqa: N802 # ref:https://github.com/onnx/onnx/blob/623dfaa0151b2e4ce49779c3ec31cbd78c592b80/onnx/defs/math/defs.cc#L3275 equation = get_attribute(node, "equation") equation = equation.replace(b" ", b"") @@ -1013,7 +1005,7 @@ def _infer_Einsum(self, node): vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_sympy_shape)) - def _infer_Expand(self, node): + def _infer_Expand(self, node): # noqa: N802 expand_to_shape = as_list(self._try_get_value(node, 1), keep_none=True) if expand_to_shape is not None: # new_shape's dim can come from shape value @@ -1029,7 +1021,7 @@ def _infer_Expand(self, node): ) ) - def _infer_Gather(self, node): + def _infer_Gather(self, node): # noqa: N802 data_shape = self._get_shape(node, 0) axis = handle_negative_axis(get_attribute(node, "axis", 0), len(data_shape)) indices_shape = self._get_shape(node, 1) @@ -1042,7 +1034,7 @@ def _infer_Gather(self, node): ) ) # for 1D input, do some sympy compute - if node.input[0] in self.sympy_data_ and len(data_shape) == 1 and 0 == get_attribute(node, "axis", 0): + if node.input[0] in self.sympy_data_ and len(data_shape) == 1 and get_attribute(node, "axis", 0) == 0: idx = self._try_get_value(node, 1) if idx is not None: data = self.sympy_data_[node.input[0]] @@ -1055,7 +1047,7 @@ def _infer_Gather(self, node): assert idx == 0 or idx == -1 self.sympy_data_[node.output[0]] = data - def _infer_GatherElements(self, node): + def _infer_GatherElements(self, node): # noqa: N802 indices_shape = self._get_shape(node, 1) vi = self.known_vi_[node.output[0]] vi.CopyFrom( @@ -1066,11 +1058,11 @@ def _infer_GatherElements(self, node): ) ) - def _infer_GatherND(self, node): + def _infer_GatherND(self, node): # noqa: N802 data_shape = self._get_shape(node, 0) data_rank = len(data_shape) indices_shape = self._get_shape(node, 1) - indices_rank = len(indices_shape) + len(indices_shape) last_index_dimension = indices_shape[-1] assert is_literal(last_index_dimension) and last_index_dimension <= data_rank new_shape = indices_shape[:-1] + data_shape[last_index_dimension:] @@ -1083,7 +1075,7 @@ def _infer_GatherND(self, node): ) ) - def _infer_If(self, node): + def _infer_If(self, node): # noqa: N802 # special case for constant condition, in case there are mismatching shape from the non-executed branch subgraphs = [ get_attribute(node, "then_branch"), @@ -1111,7 +1103,7 @@ def _infer_If(self, node): if subgraph.output[i_out].name in subgraph_infer.sympy_data_: self.sympy_data_[vi.name] = subgraph_infer.sympy_data_[subgraph.output[i_out].name] - def _infer_Loop(self, 
node): + def _infer_Loop(self, node): # noqa: N802 subgraph = get_attribute(node, "body") assert len(subgraph.input) == len(node.input) num_loop_carried = len(node.input) - 2 # minus the length and initial loop condition @@ -1172,25 +1164,25 @@ def _infer_Loop(self, node): vi_dim.extend(list(subgraph_vi_dim)) vi.name = node.output[i] - def _infer_MatMul(self, node): + def _infer_MatMul(self, node): # noqa: N802 self._compute_matmul_shape(node) - def _infer_MatMulInteger(self, node): + def _infer_MatMulInteger(self, node): # noqa: N802 self._compute_matmul_shape(node, onnx.TensorProto.INT32) - def _infer_NonMaxSuppression(self, node): + def _infer_NonMaxSuppression(self, node): # noqa: N802 selected = str(self._new_symbolic_dim_from_output(node)) vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, [selected, 3])) - def _infer_NonZero(self, node): + def _infer_NonZero(self, node): # noqa: N802 input_rank = self._get_shape_rank(node, 0) # create a new symbolic dimension for NonZero output nz_len = str(self._new_symbolic_dim_from_output(node, 0, 1)) vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], vi.type.tensor_type.elem_type, [input_rank, nz_len])) - def _infer_OneHot(self, node): + def _infer_OneHot(self, node): # noqa: N802 sympy_shape = self._get_sympy_shape(node, 0) depth = self._try_get_value(node, 1) axis = get_attribute(node, "axis", -1) @@ -1209,7 +1201,7 @@ def _infer_OneHot(self, node): ) ) - def _infer_Pad(self, node): + def _infer_Pad(self, node): # noqa: N802 if get_opset(self.out_mp_) <= 10: pads = get_attribute(node, "pads") else: @@ -1234,7 +1226,7 @@ def _infer_Pad(self, node): helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape)) ) - def _infer_Pool(self, node): + def _infer_Pool(self, node): # noqa: N802 sympy_shape = self._compute_conv_pool_shape(node) self._update_computed_dims(sympy_shape) for o in node.output: @@ -1404,7 +1396,7 @@ def _infer_aten_argmax(self, node): def _infer_aten_group_norm(self, node): self._propagate_shape_and_type(node) input_shape = self._get_shape(node, 0) - N = input_shape[0] if input_shape is not None and len(input_shape) != 0 else None + N = input_shape[0] if input_shape is not None and len(input_shape) != 0 else None # noqa: N806 group = self._try_get_value(node, 6) output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type for i in [1, 2]: @@ -1439,7 +1431,7 @@ def _infer_aten_upsample(self, node): vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_shape)) - def _infer_BatchNormalization(self, node): + def _infer_BatchNormalization(self, node): # noqa: N802 self._propagate_shape_and_type(node) # this works for opsets < 14 and 14 since we check i < len(node.output) in the loop @@ -1448,7 +1440,7 @@ def _infer_BatchNormalization(self, node): # all of these parameters have the same shape as the 1st input self._propagate_shape_and_type(node, input_index=1, output_index=i) - def _infer_Range(self, node): + def _infer_Range(self, node): # noqa: N802 vi = self.known_vi_[node.output[0]] input_data = self._get_int_values(node) if all([i is not None for i in input_data]): @@ -1467,7 +1459,7 @@ def _infer_Range(self, node): ) ) - def _infer_ReduceSum(self, node): + def _infer_ReduceSum(self, node): # noqa: N802 keep_dims = get_attribute(node, "keepdims", 1) if get_opset(self.out_mp_) >= 13 and len(node.input) > 1: # 
ReduceSum changes axes to input[1] in opset 13 @@ -1500,7 +1492,7 @@ def _infer_ReduceSum(self, node): ) ) - def _infer_ReduceProd(self, node): + def _infer_ReduceProd(self, node): # noqa: N802 axes = get_attribute(node, "axes") keep_dims = get_attribute(node, "keepdims", 1) if keep_dims == 0 and axes == [0]: @@ -1508,7 +1500,7 @@ def _infer_ReduceProd(self, node): if data is not None: self.sympy_data_[node.output[0]] = sympy_reduce_product(data) - def _infer_RelativePositionBias(self, node): + def _infer_RelativePositionBias(self, node): # noqa: N802 seq_len = self._try_get_value(node, 1) real_seq_len = self._try_get_value(node, 2) if seq_len is None or real_seq_len is None: @@ -1521,7 +1513,7 @@ def _infer_RelativePositionBias(self, node): vi = self.known_vi_[node.output[0]] vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_shape)) - def _infer_Reshape(self, node): + def _infer_Reshape(self, node): # noqa: N802 shape_value = self._try_get_value(node, 1) vi = self.known_vi_[node.output[0]] if shape_value is None: @@ -1573,7 +1565,7 @@ def _infer_Reshape(self, node): self._pass_on_sympy_data(node) - def _infer_Resize(self, node): + def _infer_Resize(self, node): # noqa: N802 vi = self.known_vi_[node.output[0]] input_sympy_shape = self._get_sympy_shape(node, 0) if get_opset(self.out_mp_) <= 10: @@ -1621,7 +1613,7 @@ def _infer_Resize(self, node): ) ) - def _infer_Scan(self, node): + def _infer_Scan(self, node): # noqa: N802 subgraph = get_attribute(node, "body") num_scan_inputs = get_attribute(node, "num_scan_inputs") scan_input_axes = get_attribute(node, "scan_input_axes", [0] * num_scan_inputs) @@ -1656,7 +1648,7 @@ def _infer_Scan(self, node): vi.CopyFrom(subgraph.output[i]) vi.name = o - def _infer_ScatterElements(self, node): + def _infer_ScatterElements(self, node): # noqa: N802 data_shape = self._get_shape(node, 0) vi = self.known_vi_[node.output[0]] vi.CopyFrom( @@ -1667,7 +1659,7 @@ def _infer_ScatterElements(self, node): ) ) - def _infer_SequenceAt(self, node): + def _infer_SequenceAt(self, node): # noqa: N802 # need to create new symbolic dimension if sequence shape has None: seq_shape = self._get_shape(node, 0) vi = self.known_vi_[node.output[0]] @@ -1679,7 +1671,7 @@ def _infer_SequenceAt(self, node): new_dim.dim_param = str(self._new_symbolic_dim_from_output(node, 0, di)) vi.type.tensor_type.shape.dim[di].CopyFrom(new_dim) - def _infer_SequenceInsert(self, node): + def _infer_SequenceInsert(self, node): # noqa: N802 # workaround bug in onnx's shape inference vi_seq = self.known_vi_[node.input[0]] vi_tensor = self.known_vi_[node.input[1]] @@ -1688,17 +1680,17 @@ def _infer_SequenceInsert(self, node): vi_out_seq.name = node.output[0] self._fuse_tensor_type(node, 0, vi_out_seq.type, vi_tensor.type) - def _infer_Shape(self, node): + def _infer_Shape(self, node): # noqa: N802 self.sympy_data_[node.output[0]] = self._get_sympy_shape(node, 0) - def _infer_Size(self, node): + def _infer_Size(self, node): # noqa: N802 sympy_shape = self._get_sympy_shape(node, 0) self.sympy_data_[node.output[0]] = sympy_reduce_product(sympy_shape) self.known_vi_[node.output[0]].CopyFrom( helper.make_tensor_value_info(node.output[0], onnx.TensorProto.INT64, []) ) - def _infer_Slice(self, node): + def _infer_Slice(self, node): # noqa: N802 # SymPy fails to prove that `x_0 + ... + x_n >= 0` if one of `x_i` is a `sympy.Min(a, b)`, # even when the relation holds for both `a` and `b`. 
# @@ -1761,7 +1753,7 @@ def handle_negative_index(index, bound): return index return bound + index except TypeError: - logger.warning("Cannot determine if {} < 0".format(index)) + logger.warning(f"Cannot determine if {index} < 0") return index if get_opset(self.out_mp_) <= 9: @@ -1817,9 +1809,7 @@ def handle_negative_index(index, bound): if not less_equal(e, new_sympy_shape[i]): e = new_sympy_shape[i] except Exception: - logger.warning( - "Unable to determine if {} <= {}, treat as equal".format(e, new_sympy_shape[i]) - ) + logger.warning(f"Unable to determine if {e} <= {new_sympy_shape[i]}, treat as equal") e = new_sympy_shape[i] s = handle_negative_index(s, new_sympy_shape[i]) @@ -1853,7 +1843,7 @@ def handle_negative_index(index, bound): ): self.sympy_data_[node.output[0]] = input_sympy_data[starts[0] : ends[0] : steps[0]] - def _infer_SoftmaxCrossEntropyLoss(self, node): + def _infer_SoftmaxCrossEntropyLoss(self, node): # noqa: N802 vi = self.known_vi_[node.output[0]] elem_type = self.known_vi_[node.input[0]].type.tensor_type.elem_type @@ -1870,7 +1860,7 @@ def _infer_SoftmaxCrossEntropyLoss(self, node): vi = self.known_vi_[node.output[1]] vi.CopyFrom(helper.make_tensor_value_info(vi.name, elem_type, data_shape)) - def _infer_Split_Common(self, node, make_value_info_func): + def _infer_Split_Common(self, node, make_value_info_func): # noqa: N802 input_sympy_shape = self._get_sympy_shape(node, 0) axis = handle_negative_axis(get_attribute(node, "axis", 0), len(input_sympy_shape)) split = get_attribute(node, "split") @@ -1892,13 +1882,13 @@ def _infer_Split_Common(self, node, make_value_info_func): ) self.known_vi_[vi.name] = vi - def _infer_Split(self, node): + def _infer_Split(self, node): # noqa: N802 self._infer_Split_Common(node, helper.make_tensor_value_info) - def _infer_SplitToSequence(self, node): + def _infer_SplitToSequence(self, node): # noqa: N802 self._infer_Split_Common(node, helper.make_sequence_value_info) - def _infer_Squeeze(self, node): + def _infer_Squeeze(self, node): # noqa: N802 input_shape = self._get_shape(node, 0) op_set = get_opset(self.out_mp_) @@ -1946,7 +1936,7 @@ def _infer_Squeeze(self, node): ) self._pass_on_sympy_data(node) - def _infer_Tile(self, node): + def _infer_Tile(self, node): # noqa: N802 repeats_value = self._try_get_value(node, 1) new_sympy_shape = [] if repeats_value is not None: @@ -1966,7 +1956,7 @@ def _infer_Tile(self, node): ) ) - def _infer_TopK(self, node): + def _infer_TopK(self, node): # noqa: N802 rank = self._get_shape_rank(node, 0) axis = handle_negative_axis(get_attribute(node, "axis", -1), rank) new_shape = self._get_shape(node, 0) @@ -1976,7 +1966,7 @@ def _infer_TopK(self, node): else: k = self._get_int_values(node)[1] - if k == None: + if k is None: k = self._new_symbolic_dim_from_output(node) else: k = as_scalar(k) @@ -1995,7 +1985,7 @@ def _infer_TopK(self, node): vi = self.known_vi_[node.output[i_o]] vi.CopyFrom(helper.make_tensor_value_info(node.output[i_o], vi.type.tensor_type.elem_type, new_shape)) - def _infer_Transpose(self, node): + def _infer_Transpose(self, node): # noqa: N802 if node.input[0] in self.sympy_data_: data_shape = self._get_shape(node, 0) perm = get_attribute(node, "perm", reversed(list(range(len(data_shape))))) @@ -2004,7 +1994,7 @@ def _infer_Transpose(self, node): np.transpose(np.array(input_data).reshape(*data_shape), axes=tuple(perm)).flatten().tolist() ) - def _infer_Unsqueeze(self, node): + def _infer_Unsqueeze(self, node): # noqa: N802 input_shape = self._get_shape(node, 0) op_set = 
get_opset(self.out_mp_) @@ -2039,7 +2029,7 @@ def _infer_Unsqueeze(self, node): self._pass_on_sympy_data(node) - def _infer_ZipMap(self, node): + def _infer_ZipMap(self, node): # noqa: N802 map_key_type = None if get_attribute(node, "classlabels_int64s") is not None: map_key_type = onnx.TensorProto.INT64 @@ -2054,7 +2044,7 @@ def _infer_ZipMap(self, node): vi = self.known_vi_[node.output[0]] vi.CopyFrom(new_vi) - def _infer_Attention(self, node): + def _infer_Attention(self, node): # noqa: N802 shape = self._get_shape(node, 0) shape_weights = self._get_shape(node, 1) shape_bias = self._try_get_shape(node, 2) @@ -2091,10 +2081,10 @@ def _infer_Attention(self, node): vi = self.known_vi_[node.output[1]] vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, past_shape)) - def _infer_BiasGelu(self, node): + def _infer_BiasGelu(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_MultiHeadAttention(self, node): + def _infer_MultiHeadAttention(self, node): # noqa: N802 # Output 0 has shape (batch_size, sequence_length, v_hidden_size) # Q, K and V without packing: # Input 0 (query) has shape (batch_size, sequence_length, hidden_size) @@ -2169,26 +2159,26 @@ def _infer_MultiHeadAttention(self, node): vi = self.known_vi_[node.output[2]] vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, present_shape)) - def _infer_FastGelu(self, node): + def _infer_FastGelu(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_Gelu(self, node): + def _infer_Gelu(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_GemmFastGelu(self, node): + def _infer_GemmFastGelu(self, node): # noqa: N802 self._compute_matmul_shape(node) - def _infer_LayerNormalization(self, node): + def _infer_LayerNormalization(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_LongformerAttention(self, node): + def _infer_LongformerAttention(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_EmbedLayerNormalization(self, node): + def _infer_EmbedLayerNormalization(self, node): # noqa: N802 input_ids_shape = self._get_shape(node, 0) word_embedding_shape = self._get_shape(node, 2) assert len(input_ids_shape) == 2 and len(word_embedding_shape) == 2 - output_shape = input_ids_shape + [word_embedding_shape[1]] + output_shape = [*input_ids_shape, word_embedding_shape[1]] word_embedding_dtype = self.known_vi_[node.input[2]].type.tensor_type.elem_type vi = self.known_vi_[node.output[0]] @@ -2204,7 +2194,7 @@ def _infer_EmbedLayerNormalization(self, node): vi = self.known_vi_[node.output[2]] vi.CopyFrom(helper.make_tensor_value_info(node.output[2], word_embedding_dtype, output_shape)) - def _infer_SkipLayerNormalization(self, node): + def _infer_SkipLayerNormalization(self, node): # noqa: N802 self._propagate_shape_and_type(node) # If the SkipLayerNormalization node contains the optional @@ -2212,10 +2202,10 @@ def _infer_SkipLayerNormalization(self, node): if len(node.output) > 3: self._propagate_shape_and_type(node, 0, 3) - def _infer_GroupNorm(self, node): + def _infer_GroupNorm(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_BiasSplitGelu(self, node): + def _infer_BiasSplitGelu(self, node): # noqa: N802 input_shape = self._get_shape(node, 0) bias_shape = self._get_shape(node, 1) if input_shape and bias_shape and isinstance(bias_shape[0], int): @@ -2225,10 +2215,10 @@ def _infer_BiasSplitGelu(self, node): output_dtype = self.known_vi_[node.input[0]].type.tensor_type.elem_type 
vi.CopyFrom(helper.make_tensor_value_info(vi.name, output_dtype, output_shape)) - def _infer_BiasAdd(self, node): + def _infer_BiasAdd(self, node): # noqa: N802 self._propagate_shape_and_type(node) - def _infer_PythonOp(self, node): + def _infer_PythonOp(self, node): # noqa: N802 output_tensor_types = get_attribute(node, "output_tensor_types") assert output_tensor_types output_tensor_ranks = get_attribute(node, "output_tensor_ranks") @@ -2275,7 +2265,7 @@ def _is_none_dim(self, dim_value): return False if "unk__" not in dim_value: return False - if dim_value in self.symbolic_dims_.keys(): + if dim_value in self.symbolic_dims_: return False return True @@ -2327,9 +2317,9 @@ def _infer_impl(self, start_sympy_data=None): prereq_for_node = {} # map from node to all its inputs, including implicit ones in subgraph def get_prereq(node): - names = set(i for i in node.input if i) + names = {i for i in node.input if i} subgraphs = [] - if "If" == node.op_type: + if node.op_type == "If": subgraphs = [ get_attribute(node, "then_branch"), get_attribute(node, "else_branch"), @@ -2355,7 +2345,7 @@ def get_prereq(node): # topological sort nodes, note there might be dead nodes so we check if all graph outputs are reached to terminate sorted_nodes = [] - sorted_known_vi = set([i.name for i in list(self.out_mp_.graph.input) + list(self.out_mp_.graph.initializer)]) + sorted_known_vi = {i.name for i in list(self.out_mp_.graph.input) + list(self.out_mp_.graph.initializer)} if any([o.name in sorted_known_vi for o in self.out_mp_.graph.output]): # Loop/Scan will have some graph output in graph inputs, so don't do topological sort sorted_nodes = self.out_mp_.graph.node @@ -2441,7 +2431,7 @@ def get_prereq(node): if self.verbose_ > 2: if out_type_kind == "sequence_type": seq_cls_type = out_type.sequence_type.elem_type.WhichOneof("value") - if "tensor_type" == seq_cls_type: + if seq_cls_type == "tensor_type": logger.debug( " {}: sequence of {} {}".format( node.output[i_o], @@ -2452,9 +2442,9 @@ def get_prereq(node): ) ) else: - logger.debug(" {}: sequence of {}".format(node.output[i_o], seq_cls_type)) + logger.debug(f" {node.output[i_o]}: sequence of {seq_cls_type}") else: - logger.debug(" {}: {}".format(node.output[i_o], out_type_kind)) + logger.debug(f" {node.output[i_o]}: {out_type_kind}") continue out_shape = get_shape_from_value_info(vi) @@ -2540,7 +2530,7 @@ def get_prereq(node): self.run_ = False # create new dynamic dims for ops not handled by symbolic shape inference - if self.run_ == False and not node.op_type in self.dispatcher_ and not known_aten_op: + if self.run_ is False and node.op_type not in self.dispatcher_ and not known_aten_op: is_unknown_op = out_type_undefined and (out_shape is None or len(out_shape) == 0) if is_unknown_op: # unknown op to ONNX, maybe from higher opset or other domain diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 7bb23084e1ca9..70ce993896ffa 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -95,10 +95,7 @@ def split_and_sort_output(string_list): def is_dynamic(model): inp = model.graph.input[0] - for dim in inp.type.tensor_type.shape.dim: - if not dim.HasField("dim_value"): - return True - return False + return any(not dim.HasField("dim_value") for dim in inp.type.tensor_type.shape.dim) def get_model_inputs(model): @@ -168,12 +165,12 @@ def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_input # save 
engine engine_suffix = "_trtexec_fp16.engine" if fp16 else "_trtexec.engine" engine_name = model_name + engine_suffix - save_command = command + ["--saveEngine=" + engine_name] + save_command = [*command, "--saveEngine=" + engine_name] logger.info(save_command) out = get_output(save_command) # load engine and inference - load_command = command + ["--loadEngine=" + engine_name] + load_command = [*command, "--loadEngine=" + engine_name] logger.info(load_command) mem_usage = None @@ -205,7 +202,7 @@ def run_trt_standalone(trtexec, model_name, model_path, test_data_dir, all_input avg_latency_match = re.search("mean = (.*?) ms", target) if avg_latency_match: result["average_latency_ms"] = avg_latency_match.group(1) # extract number - percentile_match = re.search("percentile\(90%\) = (.*?) ms", target) + percentile_match = re.search("percentile\\(90%\\) = (.*?) ms", target) if percentile_match: result["latency_90_percentile"] = percentile_match.group(1) # extract number if mem_usage: @@ -222,18 +219,17 @@ def get_latency_result(runtimes, batch_size): result = { "test_times": len(runtimes), - "latency_variance": "{:.2f}".format(latency_variance), - "latency_90_percentile": "{:.2f}".format(np.percentile(runtimes, 90) * 1000.0), - "latency_95_percentile": "{:.2f}".format(np.percentile(runtimes, 95) * 1000.0), - "latency_99_percentile": "{:.2f}".format(np.percentile(runtimes, 99) * 1000.0), - "average_latency_ms": "{:.2f}".format(latency_ms), - "QPS": "{:.2f}".format(throughput), + "latency_variance": f"{latency_variance:.2f}", + "latency_90_percentile": f"{np.percentile(runtimes, 90) * 1000.0:.2f}", + "latency_95_percentile": f"{np.percentile(runtimes, 95) * 1000.0:.2f}", + "latency_99_percentile": f"{np.percentile(runtimes, 99) * 1000.0:.2f}", + "average_latency_ms": f"{latency_ms:.2f}", + "QPS": f"{throughput:.2f}", } return result def get_ort_session_inputs_and_outputs(name, session, ort_input): - sess_inputs = {} sess_outputs = None @@ -428,7 +424,6 @@ def inference_ort( def inference_ort_and_get_prediction(name, session, ort_inputs): - ort_outputs = [] for ort_input in ort_inputs: sess_inputs, sess_outputs = get_ort_session_inputs_and_outputs(name, session, ort_input) @@ -478,7 +473,7 @@ def get_acl_version(): # outputs: [[test_data_0_output_0.pb, test_data_0_output_1.pb ...], [test_data_1_output_0.pb, test_data_1_output_1.pb ...] ...] 
####################################################################################################################################### def load_onnx_model_zoo_test_data(path, all_inputs_shape, fp16): - logger.info("Parsing test data in {} ...".format(path)) + logger.info(f"Parsing test data in {path} ...") output = get_output(["find", path, "-name", "test_data*", "-type", "d"]) test_data_set_dir = split_and_sort_output(output) logger.info(test_data_set_dir) @@ -516,7 +511,7 @@ def load_onnx_model_zoo_test_data(path, all_inputs_shape, fp16): all_inputs_shape.append(input_data_pb[-1].shape) logger.info(all_inputs_shape[-1]) inputs.append(input_data_pb) - logger.info("Loaded {} inputs successfully.".format(len(inputs))) + logger.info(f"Loaded {len(inputs)} inputs successfully.") # load outputs output = get_output(["find", ".", "-name", "output*"]) @@ -538,7 +533,7 @@ def load_onnx_model_zoo_test_data(path, all_inputs_shape, fp16): logger.info(np.array(output_data_pb[-1]).shape) outputs.append(output_data_pb) - logger.info("Loaded {} outputs successfully.".format(len(outputs))) + logger.info(f"Loaded {len(outputs)} outputs successfully.") os.chdir(pwd) return inputs, outputs @@ -547,8 +542,7 @@ def load_onnx_model_zoo_test_data(path, all_inputs_shape, fp16): def generate_onnx_model_random_input(test_times, ref_input): inputs = [] - for i in range(test_times): - + for _i in range(test_times): input_data = [] for tensor in ref_input: shape = tensor.shape @@ -594,9 +588,9 @@ def validate(all_ref_outputs, all_outputs, rtol, atol, percent_mismatch): logger.info("No reference output provided.") return True, None - logger.info("Reference {} results.".format(len(all_ref_outputs))) - logger.info("Predicted {} results.".format(len(all_outputs))) - logger.info("rtol: {}, atol: {}".format(rtol, atol)) + logger.info(f"Reference {len(all_ref_outputs)} results.") + logger.info(f"Predicted {len(all_outputs)} results.") + logger.info(f"rtol: {rtol}, atol: {atol}") for i in range(len(all_outputs)): ref_outputs = all_ref_outputs[i] @@ -643,7 +637,7 @@ def update_metrics_map(model_to_metrics, model_name, ep_to_operator): if ep not in model_to_metrics[model_name]: model_to_metrics[model_name][ep] = {} - if ep == cuda or ep == cuda_fp16: + if ep in (cuda, cuda_fp16): model_to_metrics[model_name][ep]["ratio_of_ops_in_cuda_not_fallback_cpu"] = calculate_cuda_op_percentage( op_map ) @@ -743,7 +737,6 @@ def update_metrics_map_ori(model_to_metrics, name, ep_to_operator): # ################################################################################################### def update_fail_model_map(model_to_fail_ep, model_name, ep, e_type, e): - if model_name in model_to_fail_ep and ep in model_to_fail_ep[model_name]: return @@ -766,7 +759,6 @@ def update_fail_model_map(model_to_fail_ep, model_name, ep, e_type, e): def update_fail_model_map_ori(model_to_fail_ep, fail_results, model_name, ep, e_type, e): - if model_name in model_to_fail_ep and ep in model_to_fail_ep[model_name]: return @@ -785,7 +777,6 @@ def update_fail_model_map_ori(model_to_fail_ep, fail_results, model_name, ep, e_ def skip_ep(model_name, ep, model_to_fail_ep): - if model_name not in model_to_fail_ep: return False @@ -821,7 +812,7 @@ def write_map_to_file(result, file_name): if os.path.exists(file_name): existed_result = read_map_from_file(file_name) - for model, ep_list in result.items(): + for model, _ep_list in result.items(): if model in existed_result: existed_result[model] = {**existed_result[model], **result[model]} else: @@ -969,7 +960,6 @@ def 
find_test_data_directory(path): def parse_models_info_from_directory(path, models): - test_data_dir = find_test_data_directory(path) if test_data_dir: @@ -996,7 +986,6 @@ def parse_models_info_from_directory(path, models): def parse_models_info_from_file(root_dir, path, models): - # default working directory root_working_directory = root_dir + "perf/" @@ -1004,7 +993,6 @@ def parse_models_info_from_file(root_dir, path, models): data = json.load(f) for row in data: - if "root_working_directory" in row: root_working_directory = row["root_working_directory"] continue @@ -1120,10 +1108,10 @@ def create_session(model_path, providers, provider_options, session_options): status = run_symbolic_shape_inference(model_path, new_model_path) if not status[0]: # symbolic shape inference error e = status[1] - raise Exception(e) + raise Exception(e) # noqa: B904 return time_and_create_session(new_model_path, providers, provider_options, session_options) else: - raise Exception(e) + raise Exception(e) # noqa: B904 def calculate_gain(value, ep1, ep2): @@ -1134,19 +1122,19 @@ def calculate_gain(value, ep1, ep2): def add_improvement_information(model_to_latency): - for key, value in model_to_latency.items(): + for _key, value in model_to_latency.items(): if trt in value and cuda in value: gain = calculate_gain(value, trt, cuda) - value[trt_cuda_gain] = "{:.2f} %".format(gain) + value[trt_cuda_gain] = f"{gain:.2f} %" if trt_fp16 in value and cuda_fp16 in value: gain = calculate_gain(value, trt_fp16, cuda_fp16) - value[trt_cuda_fp16_gain] = "{:.2f} %".format(gain) + value[trt_cuda_fp16_gain] = f"{gain:.2f} %" if trt in value and standalone_trt in value: gain = calculate_gain(value, trt, standalone_trt) - value[trt_native_gain] = "{:.2f} %".format(gain) + value[trt_native_gain] = f"{gain:.2f} %" if trt_fp16 in value and standalone_trt_fp16 in value: gain = calculate_gain(value, trt_fp16, standalone_trt_fp16) - value[trt_native_fp16_gain] = "{:.2f} %".format(gain) + value[trt_native_fp16_gain] = f"{gain:.2f} %" def output_details(results, csv_filename): @@ -1185,7 +1173,6 @@ def output_details(results, csv_filename): def output_fail(model_to_fail_ep, csv_filename): - with open(csv_filename, mode="w", newline="") as csv_file: column_names = ["model", "ep", "error type", "error message"] @@ -1220,17 +1207,16 @@ def add_status_dict(status_dict, model_name, ep, status): def build_status(status_dict, results, is_fail): - if is_fail: for model, model_info in results.items(): - for ep, ep_info in model_info.items(): + for ep, _ep_info in model_info.items(): model_name = model ep = ep status = "Fail" add_status_dict(status_dict, model_name, ep, status) else: for model, value in results.items(): - for ep, ep_info in value.items(): + for ep, _ep_info in value.items(): model_name = model ep = ep status = "Pass" @@ -1240,7 +1226,6 @@ def build_status(status_dict, results, is_fail): def output_status(results, csv_filename): - need_write_header = True if os.path.exists(csv_filename): need_write_header = False @@ -1329,7 +1314,7 @@ def output_session_creation(results, csv_filename): with open(csv_filename, mode="a", newline="") as csv_file: session_1 = [p + session_ending for p in ort_provider_list] session_2 = [p + second_session_ending for p in ort_provider_list] - column_names = [model_title] + session_1 + session_2 + column_names = [model_title, *session_1, *session_2] csv_writer = csv.writer(csv_file) csv_writer = csv.writer(csv_file) @@ -1533,7 +1518,6 @@ def output_metrics(model_to_metrics, csv_filename): results = [] 
for model, ep_info in model_to_metrics.items(): - result = {} result_fp16 = {} result["model_name"] = model @@ -1663,7 +1647,6 @@ def test_models_eps(args, models): ep_results = {"latency": {}, "metrics": {}, "session": {}} for exec_provider in ep_list: - # Skip model + EP combinations that have already failed in a previous run. if skip_ep(name, exec_provider, model_to_fail_ep): continue @@ -1752,7 +1735,6 @@ def run_model_on_ep( # use float16.py for cuda fp16 only if cuda_fp16 == exec_provider: - # handle model if "model_path_fp16" in model_info: model_path = os.path.normpath(os.path.join(model_work_dir, model_info["model_path_fp16"])) @@ -1942,7 +1924,6 @@ def benchmark_model_on_ep( return if result: - ep_results["latency"][exec_provider] = {} ep_results["latency"][exec_provider]["average_latency_ms"] = result["average_latency_ms"] ep_results["latency"][exec_provider]["latency_90_percentile"] = result["latency_90_percentile"] @@ -2073,12 +2054,10 @@ def __call__(self, parser, namespace, values, option_string): try: k, v = kv.split("=") except ValueError: - parser.error("argument {opt_str}: Expected '=' between key and value".format(opt_str=option_string)) + parser.error(f"argument {option_string}: Expected '=' between key and value") if k in dict_arg: - parser.error( - "argument {opt_str}: Specified duplicate key '{dup_key}'".format(opt_str=option_string, dup_key=k) - ) + parser.error(f"argument {option_string}: Specified duplicate key '{k}'") dict_arg[k] = v @@ -2287,17 +2266,17 @@ def main(): perf_end_time = datetime.now() logger.info("Done running the perf.") - logger.info("\nTotal time for benchmarking all models: {}".format(perf_end_time - perf_start_time)) + logger.info(f"\nTotal time for benchmarking all models: {perf_end_time - perf_start_time}") logger.info(list(models.keys())) - logger.info("\nTotal models: {}".format(len(models))) + logger.info(f"\nTotal models: {len(models)}") fail_model_cnt = 0 - for key, value in models.items(): + for key, _value in models.items(): if key in model_to_fail_ep: fail_model_cnt += 1 - logger.info("Fail models: {}".format(fail_model_cnt)) - logger.info("Success models: {}".format(len(models) - fail_model_cnt)) + logger.info(f"Fail models: {fail_model_cnt}") + logger.info(f"Success models: {len(models) - fail_model_cnt}") path = os.path.join(os.getcwd(), args.perf_result_path) if not os.path.exists(path): diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 918add64ce5f3..2948b9a8bda65 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -1,15 +1,15 @@ -import argparse -import copy -import csv +import argparse # noqa: F401 +import copy # noqa: F401 +import csv # noqa: F401 import json -import logging +import logging # noqa: F401 import os import pprint import re -import coloredlogs -from benchmark import * -from perf_utils import * +import coloredlogs # noqa: F401 +from benchmark import * # noqa: F403 +from perf_utils import * # noqa: F403 def write_model_info_to_file(model, path): @@ -19,35 +19,35 @@ def write_model_info_to_file(model, path): def get_ep_list(comparison): if comparison == "acl": - ep_list = [cpu, acl] + ep_list = [cpu, acl] # noqa: F405 else: # test with cuda and trt ep_list = [ - cpu, - cuda, - trt, - standalone_trt, - cuda_fp16, - trt_fp16, - standalone_trt_fp16, + cpu, # noqa: F405 + cuda, # noqa: F405 + trt, # noqa: F405 + standalone_trt, # noqa: F405 
+ cuda_fp16, # noqa: F405 + trt_fp16, # noqa: F405 + standalone_trt_fp16, # noqa: F405 ] return ep_list def resolve_trtexec_path(workspace): - trtexec_options = get_output(["find", workspace, "-name", "trtexec"]) + trtexec_options = get_output(["find", workspace, "-name", "trtexec"]) # noqa: F405 trtexec_path = re.search(r".*/bin/trtexec", trtexec_options).group(0) - logger.info("using trtexec {}".format(trtexec_path)) + logger.info(f"using trtexec {trtexec_path}") # noqa: F405 return trtexec_path def dict_to_args(dct): - return ",".join(["{}={}".format(k, v) for k, v in dct.items()]) + return ",".join([f"{k}={v}" for k, v in dct.items()]) def main(): - args = parse_arguments() - setup_logger(False) + args = parse_arguments() # noqa: F405 + setup_logger(False) # noqa: F405 pp = pprint.PrettyPrinter(indent=4) # create ep list to iterate through @@ -59,25 +59,25 @@ def main(): trtexec = resolve_trtexec_path(args.workspace) models = {} - parse_models_helper(args, models) + parse_models_helper(args, models) # noqa: F405 model_to_fail_ep = {} - benchmark_fail_csv = fail_name + csv_ending - benchmark_metrics_csv = metrics_name + csv_ending - benchmark_success_csv = success_name + csv_ending - benchmark_latency_csv = latency_name + csv_ending - benchmark_status_csv = status_name + csv_ending - benchmark_session_csv = session_name + csv_ending - specs_csv = specs_name + csv_ending + benchmark_fail_csv = fail_name + csv_ending # noqa: F405 + benchmark_metrics_csv = metrics_name + csv_ending # noqa: F405 + benchmark_success_csv = success_name + csv_ending # noqa: F405 + benchmark_latency_csv = latency_name + csv_ending # noqa: F405 + benchmark_status_csv = status_name + csv_ending # noqa: F405 + benchmark_session_csv = session_name + csv_ending # noqa: F405 + specs_csv = specs_name + csv_ending # noqa: F405 - validate = is_validate_mode(args.running_mode) - benchmark = is_benchmark_mode(args.running_mode) + validate = is_validate_mode(args.running_mode) # noqa: F405 + benchmark = is_benchmark_mode(args.running_mode) # noqa: F405 for model, model_info in models.items(): - logger.info("\n" + "=" * 40 + "=" * len(model)) - logger.info("=" * 20 + model + "=" * 20) - logger.info("=" * 40 + "=" * len(model)) + logger.info("\n" + "=" * 40 + "=" * len(model)) # noqa: F405 + logger.info("=" * 20 + model + "=" * 20) # noqa: F405 + logger.info("=" * 40 + "=" * len(model)) # noqa: F405 model_info["model_name"] = model @@ -103,7 +103,7 @@ def main(): if args.track_memory: command.append("-z") - if ep == standalone_trt or ep == standalone_trt_fp16: + if ep in (standalone_trt, standalone_trt_fp16): # noqa: F405 command.extend(["--trtexec", trtexec]) if len(args.cuda_ep_options): @@ -133,9 +133,9 @@ def main(): ] ) - p = subprocess.run(command, stderr=subprocess.PIPE) - logger.info("Completed subprocess %s ", " ".join(p.args)) - logger.info("Return code: %d", p.returncode) + p = subprocess.run(command, stderr=subprocess.PIPE) # noqa: F405 + logger.info("Completed subprocess %s ", " ".join(p.args)) # noqa: F405 + logger.info("Return code: %d", p.returncode) # noqa: F405 if p.returncode != 0: error_type = "runtime error" @@ -144,10 +144,10 @@ def main(): if p.stderr: error_message += "\nSTDERR:\n" + p.stderr.decode("utf-8") - logger.error(error_message) - update_fail_model_map(model_to_fail_ep, model, ep, error_type, error_message) - write_map_to_file(model_to_fail_ep, FAIL_MODEL_FILE) - logger.info(model_to_fail_ep) + logger.error(error_message) # noqa: F405 + update_fail_model_map(model_to_fail_ep, model, ep, 
error_type, error_message) # noqa: F405 + write_map_to_file(model_to_fail_ep, FAIL_MODEL_FILE) # noqa: F405 + logger.info(model_to_fail_ep) # noqa: F405 os.remove(model_list_file) @@ -158,76 +158,76 @@ def main(): Path(path).mkdir(parents=True, exist_ok=True) if validate: - logger.info("\n=========================================") - logger.info("=========== Models/EPs metrics ==========") - logger.info("=========================================") + logger.info("\n=========================================") # noqa: F405 + logger.info("=========== Models/EPs metrics ==========") # noqa: F405 + logger.info("=========================================") # noqa: F405 - if os.path.exists(METRICS_FILE): - model_to_metrics = read_map_from_file(METRICS_FILE) - output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv)) - logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv)) + if os.path.exists(METRICS_FILE): # noqa: F405 + model_to_metrics = read_map_from_file(METRICS_FILE) # noqa: F405 + output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv)) # noqa: F405 + logger.info(f"\nSaved model metrics results to {benchmark_metrics_csv}") # noqa: F405 if benchmark: - logger.info("\n=========================================") - logger.info("======= Models/EPs session creation =======") - logger.info("=========================================") - - if os.path.exists(SESSION_FILE): - model_to_session = read_map_from_file(SESSION_FILE) - pretty_print(pp, model_to_session) - output_session_creation(model_to_session, os.path.join(path, benchmark_session_csv)) - logger.info("\nSaved session creation results to {}".format(benchmark_session_csv)) - - logger.info("\n=========================================================") - logger.info("========== Failing Models/EPs (accumulated) ==============") - logger.info("==========================================================") - - if os.path.exists(FAIL_MODEL_FILE) or len(model_to_fail_ep) > 1: - model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE) - output_fail(model_to_fail_ep, os.path.join(path, benchmark_fail_csv)) - logger.info(model_to_fail_ep) - logger.info("\nSaved model failing results to {}".format(benchmark_fail_csv)) - - logger.info("\n=======================================================") - logger.info("=========== Models/EPs Status (accumulated) ===========") - logger.info("=======================================================") + logger.info("\n=========================================") # noqa: F405 + logger.info("======= Models/EPs session creation =======") # noqa: F405 + logger.info("=========================================") # noqa: F405 + + if os.path.exists(SESSION_FILE): # noqa: F405 + model_to_session = read_map_from_file(SESSION_FILE) # noqa: F405 + pretty_print(pp, model_to_session) # noqa: F405 + output_session_creation(model_to_session, os.path.join(path, benchmark_session_csv)) # noqa: F405 + logger.info(f"\nSaved session creation results to {benchmark_session_csv}") # noqa: F405 + + logger.info("\n=========================================================") # noqa: F405 + logger.info("========== Failing Models/EPs (accumulated) ==============") # noqa: F405 + logger.info("==========================================================") # noqa: F405 + + if os.path.exists(FAIL_MODEL_FILE) or len(model_to_fail_ep) > 1: # noqa: F405 + model_to_fail_ep = read_map_from_file(FAIL_MODEL_FILE) # noqa: F405 + output_fail(model_to_fail_ep, os.path.join(path, benchmark_fail_csv)) # noqa: 
F405 + logger.info(model_to_fail_ep) # noqa: F405 + logger.info(f"\nSaved model failing results to {benchmark_fail_csv}") # noqa: F405 + + logger.info("\n=======================================================") # noqa: F405 + logger.info("=========== Models/EPs Status (accumulated) ===========") # noqa: F405 + logger.info("=======================================================") # noqa: F405 model_status = {} - if os.path.exists(LATENCY_FILE): - model_latency = read_map_from_file(LATENCY_FILE) + if os.path.exists(LATENCY_FILE): # noqa: F405 + model_latency = read_map_from_file(LATENCY_FILE) # noqa: F405 is_fail = False - model_status = build_status(model_status, model_latency, is_fail) - if os.path.exists(FAIL_MODEL_FILE): - model_fail = read_map_from_file(FAIL_MODEL_FILE) + model_status = build_status(model_status, model_latency, is_fail) # noqa: F405 + if os.path.exists(FAIL_MODEL_FILE): # noqa: F405 + model_fail = read_map_from_file(FAIL_MODEL_FILE) # noqa: F405 is_fail = True - model_status = build_status(model_status, model_fail, is_fail) + model_status = build_status(model_status, model_fail, is_fail) # noqa: F405 - pretty_print(pp, model_status) + pretty_print(pp, model_status) # noqa: F405 - output_status(model_status, os.path.join(path, benchmark_status_csv)) - logger.info("\nSaved model status results to {}".format(benchmark_status_csv)) + output_status(model_status, os.path.join(path, benchmark_status_csv)) # noqa: F405 + logger.info(f"\nSaved model status results to {benchmark_status_csv}") # noqa: F405 - logger.info("\n=========================================================") - logger.info("=========== Models/EPs latency (accumulated) ===========") - logger.info("=========================================================") + logger.info("\n=========================================================") # noqa: F405 + logger.info("=========== Models/EPs latency (accumulated) ===========") # noqa: F405 + logger.info("=========================================================") # noqa: F405 - if os.path.exists(LATENCY_FILE): - model_to_latency = read_map_from_file(LATENCY_FILE) - add_improvement_information(model_to_latency) + if os.path.exists(LATENCY_FILE): # noqa: F405 + model_to_latency = read_map_from_file(LATENCY_FILE) # noqa: F405 + add_improvement_information(model_to_latency) # noqa: F405 - pretty_print(pp, model_to_latency) + pretty_print(pp, model_to_latency) # noqa: F405 - output_latency(model_to_latency, os.path.join(path, benchmark_latency_csv)) - logger.info("\nSaved model latency results to {}".format(benchmark_latency_csv)) + output_latency(model_to_latency, os.path.join(path, benchmark_latency_csv)) # noqa: F405 + logger.info(f"\nSaved model latency results to {benchmark_latency_csv}") # noqa: F405 - logger.info("\n===========================================") - logger.info("=========== System information ===========") - logger.info("===========================================") - info = get_system_info(args) - pretty_print(pp, info) - logger.info("\n") - output_specs(info, os.path.join(path, specs_csv)) - logger.info("\nSaved hardware specs to {}".format(specs_csv)) + logger.info("\n===========================================") # noqa: F405 + logger.info("=========== System information ===========") # noqa: F405 + logger.info("===========================================") # noqa: F405 + info = get_system_info(args) # noqa: F405 + pretty_print(pp, info) # noqa: F405 + logger.info("\n") # noqa: F405 + output_specs(info, os.path.join(path, specs_csv)) # noqa: F405 + 
logger.info(f"\nSaved hardware specs to {specs_csv}") # noqa: F405 if __name__ == "__main__": diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 5cd72fa944fcf..8cc917cbc7ff1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -82,11 +82,7 @@ def is_valid_ver_str(version: str, min_comps: int = 0, max_comps: int = 0) -> bo if num_comps > max_comps > 0: return False - for num in ver_nums: - if not num.isdecimal(): - return False - - return True + return all(num.isdecimal() for num in ver_nums) def docker_build_trt(args: argparse.Namespace): diff --git a/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py b/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py index 2efacd1965f40..6e20071683d90 100755 --- a/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/ort_build_latest.py @@ -1,8 +1,9 @@ +import argparse import os import subprocess -import argparse import tarfile + def parse_arguments(): parser = argparse.ArgumentParser() @@ -15,52 +16,73 @@ def parse_arguments(): args = parser.parse_args() return args + def archive_wheel_file(save_path, ort_wheel_file): if not os.path.exists(save_path): os.mkdir(save_path) subprocess.run(["cp", ort_wheel_file, save_path], check=True) + def install_new_ort_wheel(ort_master_path): - ort_wheel_path = os.path.join(ort_master_path, "build", "Linux", "Release", "dist") + ort_wheel_path = os.path.join(ort_master_path, "build", "Linux", "Release", "dist") p1 = subprocess.run(["find", ort_wheel_path, "-name", "*.whl"], stdout=subprocess.PIPE, check=True) stdout = p1.stdout.decode("utf-8").strip() ort_wheel = stdout.split("\n")[0] subprocess.run(["python3", "-m", "pip", "install", "--force-reinstall", ort_wheel], check=True) return ort_wheel + def main(): args = parse_arguments() - cmake_tar = "cmake-3.18.4-Linux-x86_64.tar.gz" + cmake_tar = "cmake-3.18.4-Linux-x86_64.tar.gz" if not os.path.exists(cmake_tar): - p = subprocess.run(["wget", "-c", "https://cmake.org/files/v3.18/" + cmake_tar], check=True) + subprocess.run(["wget", "-c", "https://cmake.org/files/v3.18/" + cmake_tar], check=True) tar = tarfile.open(cmake_tar) tar.extractall() tar.close() - + os.environ["PATH"] = os.path.join(os.path.abspath("cmake-3.18.4-Linux-x86_64"), "bin") + ":" + os.environ["PATH"] - os.environ["CUDACXX"] = os.path.join(args.cuda_home, "bin", "nvcc") + os.environ["CUDACXX"] = os.path.join(args.cuda_home, "bin", "nvcc") - ort_master_path = args.ort_master_path + ort_master_path = args.ort_master_path pwd = os.getcwd() os.chdir(ort_master_path) if args.use_archived: ort_wheel_file = args.use_archived subprocess.run(["python3", "-m", "pip", "install", "--force-reinstall", ort_wheel_file], check=True) - + else: subprocess.run(["git", "fetch"], check=True) subprocess.run(["git", "checkout", args.branch], check=True) subprocess.run(["git", "pull", "origin", args.branch], check=True) - subprocess.run(["./build.sh", "--config", "Release", "--use_tensorrt", "--tensorrt_home", args.tensorrt_home, "--cuda_home", args.cuda_home, "--cudnn", "/usr/lib/x86_64-linux-gnu", "--build_wheel", "--skip_tests", "--parallel"], check=True) + subprocess.run( + [ + "./build.sh", + "--config", + "Release", + "--use_tensorrt", + "--tensorrt_home", + args.tensorrt_home, + "--cuda_home", + args.cuda_home, + "--cudnn", + 
"/usr/lib/x86_64-linux-gnu", + "--build_wheel", + "--skip_tests", + "--parallel", + ], + check=True, + ) ort_wheel_file = install_new_ort_wheel(ort_master_path) - + if args.save: archive_wheel_file(args.save, ort_wheel_file) os.chdir(pwd) + if __name__ == "__main__": main() diff --git a/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py b/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py index 93df53c9825db..b44a672e7723b 100644 --- a/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py +++ b/onnxruntime/python/tools/tensorrt/perf/comparison_scripts/compare_latency.py @@ -48,7 +48,7 @@ def main(): condition_fp16 = get_table_condition(common, "fp16", args.ep, args.tolerance) common["greater"] = np.where((condition_fp32 | condition_fp16), True, False) - greater = common[common["greater"] == True].drop(["greater"], axis=1) + greater = common[common["greater"] is True].drop(["greater"], axis=1) # arrange columns keys = list(greater.keys().sort_values()) diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 61cac72b271c1..868a5973e8d35 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -1,11 +1,11 @@ import json -import logging +import logging # noqa: F401 import pprint import re import subprocess import sys -import coloredlogs +import coloredlogs # noqa: F401 debug = False debug_verbose = False @@ -62,7 +62,7 @@ trt_fp16, standalone_trt_fp16, ] -table_headers = [model_title] + provider_list +table_headers = [model_title, *provider_list] # graph options disable = "disable" @@ -96,7 +96,7 @@ def is_validate_mode(running_mode): def is_standalone(ep): - return ep == standalone_trt or ep == standalone_trt_fp16 + return ep in (standalone_trt, standalone_trt_fp16) def get_output(command): @@ -119,10 +119,9 @@ def pretty_print(pp, json_object): def parse_single_file(f): - try: data = json.load(f) - except Exception as e: + except Exception: return None model_run_flag = False @@ -131,7 +130,7 @@ def parse_single_file(f): provider_op_map_first_run = {} # ep -> map of operator to duration for row in data: - if not "cat" in row: + if "cat" not in row: continue if row["cat"] == "Session": @@ -146,7 +145,7 @@ def parse_single_file(f): if "name" in row and "args" in row and re.search(".*kernel_time", row["name"]): args = row["args"] - if not "op_name" in args or not "provider" in args: + if "op_name" not in args or "provider" not in args: continue provider = args["provider"] @@ -172,7 +171,7 @@ def parse_single_file(f): op_map = provider_op_map[provider] # avoid duplicated metrics - if not row["name"] in op_map: + if row["name"] not in op_map: op_map[row["name"]] = row["dur"] provider_op_map[provider] = op_map @@ -245,9 +244,9 @@ def calculate_trt_op_percentage(trt_op_map, cuda_op_map): # ratio_of_ops_in_trt = (total_ops - total_cuda_and_cpu_ops) / total_ops if debug: - print("total_cuda_and_cpu_ops: {}".format(total_cuda_and_cpu_ops)) - print("total_ops: {}".format(total_ops)) - print("ratio_of_ops_in_trt: {}".format(ratio_of_ops_in_trt)) + print(f"total_cuda_and_cpu_ops: {total_cuda_and_cpu_ops}") + print(f"total_ops: {total_ops}") + print(f"ratio_of_ops_in_trt: {ratio_of_ops_in_trt}") return ((total_ops - total_cuda_and_cpu_ops), total_ops, ratio_of_ops_in_trt) @@ -280,7 +279,7 @@ def calculate_trt_latency_percentage(trt_op_map): op_map = trt_op_map[ep] total_time = 0 - for key, 
value in op_map.items(): + for _key, value in op_map.items(): total_time += int(value) if ep == "TensorrtExecutionProvider": @@ -294,9 +293,9 @@ def calculate_trt_latency_percentage(trt_op_map): ratio_of_trt_execution_time = total_trt_execution_time / total_execution_time if debug: - print("total_trt_execution_time: {}".format(total_trt_execution_time)) - print("total_execution_time: {}".format(total_execution_time)) - print("ratio_of_trt_execution_time: {}".format(ratio_of_trt_execution_time)) + print(f"total_trt_execution_time: {total_trt_execution_time}") + print(f"total_execution_time: {total_execution_time}") + print(f"ratio_of_trt_execution_time: {ratio_of_trt_execution_time}") return (total_trt_execution_time, total_execution_time, ratio_of_trt_execution_time) diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index 0957f9c051a2a..350e8b3914ab7 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -331,7 +331,7 @@ def get_session(session, model_group): """ session_columns = session.keys() - session_db_columns = [model_title] + ort_provider_list + [p + second for p in ort_provider_list] + session_db_columns = [model_title, *ort_provider_list] + [p + second for p in ort_provider_list] session = adjust_columns(session, session_columns, session_db_columns, model_group) return session diff --git a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py index b54f315c77a6d..4f763ad84426d 100644 --- a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py +++ b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py @@ -70,7 +70,7 @@ def write_json(models): def main(): links = [] - with open("links.txt", "r") as fh: + with open("links.txt") as fh: links = [link.rstrip() for link in fh.readlines()] model_list = [] diff --git a/onnxruntime/python/tools/transformers/__init__.py b/onnxruntime/python/tools/transformers/__init__.py index 4200447eefee5..edfb82b253b81 100644 --- a/onnxruntime/python/tools/transformers/__init__.py +++ b/onnxruntime/python/tools/transformers/__init__.py @@ -8,9 +8,9 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) -import convert_to_onnx +import convert_to_onnx # noqa: E402, F401 # added for backward compatible -import gpt2_helper +import gpt2_helper # noqa: E402, F401 sys.path.append(os.path.join(os.path.dirname(__file__), "models", "t5")) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 23f1be3eeed2f..78ff542f0037d 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -45,16 +45,16 @@ import os import timeit from datetime import datetime -from enum import Enum +from enum import Enum # noqa: F401 import numpy -import onnx +import onnx # noqa: F401 import psutil +from benchmark_helper import allocateOutputBuffers # noqa: F401 from benchmark_helper import ( ConfigModifier, OptimizerInfo, Precision, - allocateOutputBuffers, create_onnxruntime_session, get_latency_result, inference_ort, @@ -76,7 +76,7 @@ logger = logging.getLogger("") -from huggingface_models import MODEL_CLASSES, MODELS +from huggingface_models import MODEL_CLASSES, MODELS # noqa: E402 cpu_count = psutil.cpu_count(logical=False) @@ -84,8 +84,8 @@ if "OMP_NUM_THREADS" not in os.environ: 
os.environ["OMP_NUM_THREADS"] = str(cpu_count) -import torch -from transformers import AutoConfig, AutoModel, AutoTokenizer, GPT2Model, LxmertConfig +import torch # noqa: E402 +from transformers import AutoConfig, AutoModel, AutoTokenizer, GPT2Model, LxmertConfig # noqa: E402, F401 def run_onnxruntime( @@ -178,7 +178,12 @@ def run_onnxruntime( fusion_options, ) if "tf" in model_source: - (onnx_model_file, is_valid_onnx_model, vocab_size, max_sequence_length,) = export_onnx_model_from_tf( + ( + onnx_model_file, + is_valid_onnx_model, + vocab_size, + max_sequence_length, + ) = export_onnx_model_from_tf( model_name, MODELS[model_name][1], MODELS[model_name][2], @@ -257,9 +262,7 @@ def run_onnxruntime( "datetime": str(datetime.now()), } - logger.info( - "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length]) - ) + logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}") if disable_ort_io_binding: result = inference_ort( @@ -359,7 +362,7 @@ def run_pytorch( if max_input_size is not None and sequence_length > max_input_size: continue - logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) + logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}") input_ids = torch.randint( low=0, high=config.vocab_size - 1, @@ -373,7 +376,7 @@ def run_pytorch( ) inference(input_ids) - runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1) + runtimes = timeit.repeat(lambda: inference(input_ids), repeat=repeat_times, number=1) # noqa: B023 result = { "engine": "torchscript" if torchscript else "torch2" if torch2 else "torch", @@ -491,9 +494,7 @@ def run_tensorflow( if max_input_size is not None and sequence_length > max_input_size: continue - logger.info( - "Run Tensorflow on {} with input shape {}".format(model_name, [batch_size, sequence_length]) - ) + logger.info(f"Run Tensorflow on {model_name} with input shape {[batch_size, sequence_length]}") import random @@ -505,18 +506,18 @@ def run_tensorflow( # Disable both for better inference perf @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def encoder_forward(): - return model(input_ids, training=False) + return model(input_ids, training=False) # noqa: B023 @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def encoder_decoder_forward(): - return model(input_ids, decoder_input_ids=input_ids, training=False) + return model(input_ids, decoder_input_ids=input_ids, training=False) # noqa: B023 @run_with_tf_optimizations(do_eager_mode=False, use_xla=False) def lxmert_forward(): - feats = tf.random.normal([1, 1, config.visual_feat_dim]) - pos = tf.random.normal([1, 1, config.visual_pos_dim]) - return model( - input_ids, + feats = tf.random.normal([1, 1, config.visual_feat_dim]) # noqa: B023 + pos = tf.random.normal([1, 1, config.visual_pos_dim]) # noqa: B023 + return model( # noqa: B023 + input_ids, # noqa: B023 visual_feats=feats, visual_pos=pos, training=False, @@ -530,7 +531,7 @@ def lxmert_forward(): inference() - runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1) + runtimes = timeit.repeat(lambda: inference(), repeat=repeat_times, number=1) # noqa: B023 result = { "engine": "tensorflow", @@ -766,7 +767,7 @@ def main(): logger.error("int8 is for CPU only") return - args.num_threads = sorted(set(cpu_count if x <= 0 else x for x in args.num_threads)) + args.num_threads = sorted({cpu_count if x <= 0 else x for x in 
args.num_threads}) logger.info(f"Arguments: {args}") @@ -891,8 +892,8 @@ def main(): args.model_source, args, ) - except: - logger.error(f"Exception", exc_info=True) + except Exception: + logger.error("Exception", exc_info=True) time_stamp = datetime.now().strftime("%Y%m%d-%H%M%S") if model_fusion_statistics: diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py index bf8dd931c6927..fc2ec8ad8fd56 100644 --- a/onnxruntime/python/tools/transformers/benchmark_helper.py +++ b/onnxruntime/python/tools/transformers/benchmark_helper.py @@ -81,7 +81,7 @@ def create_onnxruntime_session( num_threads=-1, enable_profiling=False, verbose=False, - provider_options={}, # map execution provider name to its option + provider_options={}, # map execution provider name to its option # noqa: B006 ): session = None try: @@ -133,7 +133,7 @@ def create_onnxruntime_session( providers = [(name, provider_options[name]) if name in provider_options else name for name in providers] session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=providers) - except: + except Exception: logger.error("Exception", exc_info=True) return session @@ -185,12 +185,12 @@ def get_latency_result(latency_list, batch_size): return { "test_times": len(latency_list), - "latency_variance": "{:.2f}".format(latency_variance), - "latency_90_percentile": "{:.2f}".format(numpy.percentile(latency_list, 90) * 1000.0), - "latency_95_percentile": "{:.2f}".format(numpy.percentile(latency_list, 95) * 1000.0), - "latency_99_percentile": "{:.2f}".format(numpy.percentile(latency_list, 99) * 1000.0), - "average_latency_ms": "{:.2f}".format(latency_ms), - "QPS": "{:.2f}".format(throughput), + "latency_variance": f"{latency_variance:.2f}", + "latency_90_percentile": f"{numpy.percentile(latency_list, 90) * 1000.0:.2f}", + "latency_95_percentile": f"{numpy.percentile(latency_list, 95) * 1000.0:.2f}", + "latency_99_percentile": f"{numpy.percentile(latency_list, 99) * 1000.0:.2f}", + "average_latency_ms": f"{latency_ms:.2f}", + "QPS": f"{throughput:.2f}", } @@ -282,12 +282,16 @@ def output_summary(results, csv_filename, args): def output_fusion_statistics(model_fusion_statistics, csv_filename): with open(csv_filename, mode="a", newline="", encoding="ascii") as csv_file: - column_names = ["model_filename", "datetime", "transformers", "torch"] + list( - next(iter(model_fusion_statistics.values())).keys() - ) + column_names = [ + "model_filename", + "datetime", + "transformers", + "torch", + *list(next(iter(model_fusion_statistics.values())).keys()), + ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) csv_writer.writeheader() - for key in model_fusion_statistics.keys(): + for key in model_fusion_statistics: model_fusion_statistics[key]["datetime"] = str(datetime.now()) model_fusion_statistics[key]["transformers"] = transformers.__version__ model_fusion_statistics[key]["torch"] = torch.__version__ @@ -325,7 +329,7 @@ def inference_ort_with_io_binding( # Bind inputs and outputs to onnxruntime session io_binding = ort_session.io_binding() # Bind inputs to device - for name in ort_inputs.keys(): + for name in ort_inputs: np_input = torch.from_numpy(ort_inputs[name]).to(device) input_type = ( IO_BINDING_DATA_TYPE_MAP[str(ort_inputs[name].dtype)] @@ -371,7 +375,7 @@ def inference_ort_with_io_binding( return result -def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, device): +def allocateOutputBuffers(output_buffers, output_buffer_max_sizes, 
device): # noqa: N802 # Allocate output tensors with the largest test size needed. So the allocated memory can be reused # for each test run. diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 022ee076770be..1829fcabe6d97 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -173,7 +173,7 @@ def onnxruntime_inference_with_io_binding(session, all_inputs, output_names, tes results = [] latency_list = [] device = "cuda" if test_setting.use_gpu else "cpu" - for test_case_id, inputs in enumerate(all_inputs): + for _test_case_id, inputs in enumerate(all_inputs): result = session.run(output_names, inputs) results.append(result) outputs = {} @@ -201,7 +201,7 @@ def onnxruntime_inference(session, all_inputs, output_names): results = [] latency_list = [] - for test_case_id, inputs in enumerate(all_inputs): + for _test_case_id, inputs in enumerate(all_inputs): start_time = timeit.default_timer() result = session.run(output_names, inputs) latency = timeit.default_timer() - start_time @@ -212,7 +212,7 @@ def onnxruntime_inference(session, all_inputs, output_names): def to_string(model_path, session, test_setting): sess_options = session.get_session_options() - option = "model={},".format(os.path.basename(model_path)) + option = f"model={os.path.basename(model_path)}," option += "graph_optimization_level={},intra_op_num_threads={},".format( sess_options.graph_optimization_level, sess_options.intra_op_num_threads ).replace("GraphOptimizationLevel.ORT_", "") @@ -240,13 +240,13 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op all_latency_list = [] if test_setting.use_io_binding: - for i in range(test_setting.test_times): + for _i in range(test_setting.test_times): results, latency_list = onnxruntime_inference_with_io_binding( session, all_inputs, output_names, test_setting ) all_latency_list.extend(latency_list) else: - for i in range(test_setting.test_times): + for _i in range(test_setting.test_times): results, latency_list = onnxruntime_inference(session, all_inputs, output_names) all_latency_list.extend(latency_list) @@ -305,7 +305,7 @@ def run_perf_tests(model_setting, test_setting, perf_results, all_inputs): cpu_count = psutil.cpu_count(logical=False) logical_cores = psutil.cpu_count(logical=True) - candidate_threads = list(set([logical_cores, cpu_count])) + candidate_threads = list({logical_cores, cpu_count}) for i in range(1, min(16, logical_cores)): if i not in candidate_threads: candidate_threads.append(i) @@ -517,7 +517,7 @@ def main(): with open(summary_file, "w+", newline="") as tsv_file: tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n") headers = None - for (key, perf_result) in sorted_results: + for key, perf_result in sorted_results: params = key.split(",") if headers is None: headers = [ diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index 12c2145fe3eb0..430aa96b84ba6 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -134,7 +134,7 @@ def output_test_data(directory: str, inputs: Dict[str, np.ndarray]): index = 0 for name, data in inputs.items(): tensor = numpy_helper.from_array(data, name) - with open(os.path.join(directory, "input_{}.pb".format(index)), "wb") as file: + with open(os.path.join(directory, 
f"input_{index}.pb"), "wb") as file: file.write(tensor.SerializeToString()) index += 1 @@ -175,7 +175,7 @@ def fake_test_data( random.seed(random_seed) all_inputs = [] - for test_case in range(test_cases): + for _test_case in range(test_cases): input_1 = fake_input_ids_data(input_ids, batch_size, sequence_length, dictionary_size) inputs = {input_ids.name: input_1} @@ -302,7 +302,7 @@ def find_bert_inputs( return input_ids, segment_ids, input_mask if len(graph_inputs) != 3: - raise ValueError("Expect the graph to have 3 inputs. Got {}".format(len(graph_inputs))) + raise ValueError(f"Expect the graph to have 3 inputs. Got {len(graph_inputs)}") embed_nodes = onnx_model.get_nodes_by_op_type("EmbedLayerNormalization") if len(embed_nodes) == 1: @@ -317,7 +317,7 @@ def find_bert_inputs( if "mask" in input_name_lower: input_mask = input if input_mask is None: - raise ValueError(f"Failed to find attention mask input") + raise ValueError("Failed to find attention mask input") return input_ids, segment_ids, input_mask @@ -504,7 +504,7 @@ def create_and_save_test_data( result = session.run(output_names, inputs) for i, output_name in enumerate(output_names): tensor_result = numpy_helper.from_array(np.asarray(result[i]), output_name) - with open(os.path.join(directory, "output_{}.pb".format(i)), "wb") as file: + with open(os.path.join(directory, f"output_{i}.pb"), "wb") as file: file.write(tensor_result.SerializeToString()) @@ -515,7 +515,7 @@ def main(): if output_dir is None: # Default output directory is a sub-directory under the directory of model. p = Path(args.model) - output_dir = os.path.join(p.parent, "batch_{}_seq_{}".format(args.batch_size, args.sequence_length)) + output_dir = os.path.join(p.parent, f"batch_{args.batch_size}_seq_{args.sequence_length}") if output_dir is not None: # create the output directory if not existed diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py index 07f5dba88065f..33562acfd2242 100644 --- a/onnxruntime/python/tools/transformers/compare_bert_results.py +++ b/onnxruntime/python/tools/transformers/compare_bert_results.py @@ -52,9 +52,9 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4): diff_count += 1 if verbose: - print("case {} output {}".format(test_case_id, i)) - print("baseline={}\ntreatment={}".format(results[i].tolist(), treatment_output)) - print("rel_diff={} abs_diff={}".format(rel_diff, abs_diff)) + print(f"case {test_case_id} output {i}") + print(f"baseline={results[i].tolist()}\ntreatment={treatment_output}") + print(f"rel_diff={rel_diff} abs_diff={abs_diff}") if diff_count == 0: print( @@ -69,9 +69,9 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4): ) ) - print("maximum absolute difference={}".format(max_abs_diff)) + print(f"maximum absolute difference={max_abs_diff}") - print("maximum relative difference={}".format(max_rel_diff)) + print(f"maximum relative difference={max_rel_diff}") def run_test( @@ -90,7 +90,6 @@ def run_test( segment_ids_name, input_mask_name, ): - # Try deduce input names from optimized model. 
input_ids, segment_ids, input_mask = get_bert_inputs( optimized_model, input_ids_name, segment_ids_name, input_mask_name @@ -127,7 +126,7 @@ def run_test( optimized_model, all_inputs, use_gpu, disable_optimization=False ) if verbose: - print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000)) + print(f"treatment average latency: {statistics.mean(treatment_latency) * 1000} ms") # Validate the output of baseline and treatment, to make sure the results are similar. compare(baseline_results, treatment_results, verbose, rtol, atol) diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py index acbee9c429154..3e959e1db65b6 100644 --- a/onnxruntime/python/tools/transformers/convert_generation.py +++ b/onnxruntime/python/tools/transformers/convert_generation.py @@ -64,10 +64,10 @@ from models.gpt2.convert_to_onnx import main as convert_gpt2_to_onnx # noqa: E402 sys.path.append(os.path.join(os.path.dirname(__file__), "models", "t5")) -from benchmark_helper import setup_logger +from benchmark_helper import setup_logger # noqa: E402 from models.t5.convert_to_onnx import export_onnx_models as export_t5_onnx_models # noqa: E402 from models.t5.t5_helper import PRETRAINED_MT5_MODELS, PRETRAINED_T5_MODELS # noqa: E402 -from onnx_model import OnnxModel +from onnx_model import OnnxModel # noqa: E402 logger = logging.getLogger("") @@ -646,7 +646,7 @@ def verify_gpt2_subgraph(graph: onnx.GraphProto, precision: Precision): ValueError: Output name is not expected. ValueError: Output data type is not expected. """ - is_float16 = Precision.FLOAT16 == precision + is_float16 = precision == Precision.FLOAT16 input_count = len(graph.input) layer_count = input_count - 3 @@ -702,7 +702,7 @@ def verify_t5_decoder_subgraph(graph: onnx.GraphProto, precision: Precision): ValueError: Output name is not expected. ValueError: Output data type is not expected. """ - is_float16 = Precision.FLOAT16 == precision + is_float16 = precision == Precision.FLOAT16 float_type = TensorProto.FLOAT16 if is_float16 else TensorProto.FLOAT input_count = len(graph.input) @@ -778,7 +778,7 @@ def verify_t5_encoder_decoder_init_subgraph(graph: onnx.GraphProto, precision: P ValueError: Output name is not expected. ValueError: Output data type is not expected. """ - is_float16 = Precision.FLOAT16 == precision + is_float16 = precision == Precision.FLOAT16 layer_count = (len(graph.output) - 2) // 4 assert layer_count >= 1 @@ -982,7 +982,7 @@ def _attribute_to_pair(attribute): :return: attribute in {key: value} format. 
""" if attribute.type == 0: - raise ValueError("attribute {} does not have type specified.".format(attribute.name)) + raise ValueError(f"attribute {attribute.name} does not have type specified.") # Based on attribute type definitions from AttributeProto # definition in https://github.com/onnx/onnx/blob/master/onnx/onnx.proto @@ -1007,7 +1007,7 @@ def _attribute_to_pair(attribute): elif attribute.type == 10: value = attribute.graphs else: - raise ValueError("attribute {} has unsupported type {}.".format(attribute.name, attribute.type)) + raise ValueError(f"attribute {attribute.name} has unsupported type {attribute.type}.") return (attribute.name, value) @@ -1086,7 +1086,7 @@ def update_decoder_subgraph_use_decoder_masked_self_attention( """ if is_beam_search: new_inputs = [] - for i, vi in enumerate(subg.input): + for _i, vi in enumerate(subg.input): new_inputs.extend([vi]) # Add 2 BeamSearch specific inputs @@ -1251,7 +1251,6 @@ def generate_gpt2_init_decoder( # Try without the Casts before and after the MatMuls if logits_matmul_to_residual_add_path is None: - # Normalization Node is : LayerNormalization logits_matmul_to_residual_add_path = gpt2_init_decoder_model.match_parent_path( logits_matmul_node, @@ -1280,7 +1279,7 @@ def generate_gpt2_init_decoder( residual_add_node = logits_matmul_to_residual_add_path[-1] # If the last node in the pattern is SkipLayerNormalization, we need to adjust our pattern searches accordingly - is_skiplayernorm_path = True if residual_add_node.op_type == "SkipLayerNormalization" else False + is_skiplayernorm_path = residual_add_node.op_type == "SkipLayerNormalization" # Regular LayerNormalization path if not is_skiplayernorm_path: @@ -1601,7 +1600,7 @@ def convert_generation_model(args: argparse.Namespace, generation_type: Generati args.decoder_onnx, args.use_external_data_format ): # Can't proceed further - better to raise an exception - raise ValueError(f"Could not update the input shapes for the non-initial decoder subgraph.") + raise ValueError("Could not update the input shapes for the non-initial decoder subgraph.") # If the user explicitly requests running shape inference or if we padded/mutated # weight(s)/input shape(s) in the decoder, we want to run shape inference to capture the new @@ -2337,7 +2336,7 @@ def test_t5_model(args: argparse.Namespace, sentences: Optional[List[str]] = Non for i, sequence in enumerate(beam_outputs.sequences): decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True) torch_decoded_sequences.append(decoded_sequence) - print("{}: {}".format(i, decoded_sequence)) + print(f"{i}: {decoded_sequence}") print("-" * 50) print("Testing beam search with onnxruntime...") diff --git a/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py b/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py index a035790b50954..7d1bd6bdbcc77 100644 --- a/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py +++ b/onnxruntime/python/tools/transformers/convert_tf_models_to_pytorch.py @@ -90,7 +90,7 @@ def download_tf_checkpoint(model_name, tf_models_dir="tf_models"): import re - if re.search(".zip$", tf_ckpt_url) != None: + if re.search(".zip$", tf_ckpt_url) is not None: zip_dir = download_compressed_file(tf_ckpt_url, ckpt_dir) # unzip file @@ -102,7 +102,7 @@ def download_tf_checkpoint(model_name, tf_models_dir="tf_models"): return get_ckpt_prefix_path(ckpt_dir) - elif re.search(".tar.gz$", tf_ckpt_url) != None: + elif re.search(".tar.gz$", tf_ckpt_url) is not None: tar_dir = 
download_compressed_file(tf_ckpt_url, ckpt_dir) # untar file @@ -190,7 +190,7 @@ def tf2pt_pipeline_test(): import torch logger = logging.getLogger("") - for model_name in TFMODELS.keys(): + for model_name in TFMODELS: config, model = tf2pt_pipeline(model_name) assert config.model_type is TFMODELS[model_name][0] diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py index b830205fdbf89..ad8679038c962 100644 --- a/onnxruntime/python/tools/transformers/float16.py +++ b/onnxruntime/python/tools/transformers/float16.py @@ -16,7 +16,7 @@ import logging import os import tempfile -from typing import Dict, List +from typing import Dict import numpy as np import onnx @@ -53,17 +53,17 @@ def between(a, b, c): positive_max = np_array[np.where(np_array > 0)].max() positive_min = np_array[np.where(np_array > 0)].min() if positive_max >= max_finite_val: - logger.info("the float32 number {} will be truncated to {}".format(positive_max, max_finite_val)) + logger.info(f"the float32 number {positive_max} will be truncated to {max_finite_val}") if positive_min <= min_positive_val: - logger.info("the float32 number {} will be truncated to {}".format(positive_min, min_positive_val)) + logger.info(f"the float32 number {positive_min} will be truncated to {min_positive_val}") if np_array[np.where(np_array < 0)].shape[0] > 0: negative_max = np_array[np.where(np_array < 0)].max() negative_min = np_array[np.where(np_array < 0)].min() if negative_min <= -max_finite_val: - logger.info("the float32 number {} will be truncated to {}".format(negative_min, -max_finite_val)) + logger.info(f"the float32 number {negative_min} will be truncated to {-max_finite_val}") if negative_max >= -min_positive_val: - logger.info("the float32 number {} will be truncated to {}".format(negative_max, -min_positive_val)) + logger.info(f"the float32 number {negative_max} will be truncated to {-min_positive_val}") np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array) np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array) @@ -373,7 +373,7 @@ def convert_float_to_float16( queue = next_level - for key, value in fp32_initializers.items(): + for _key, value in fp32_initializers.items(): # By default, to avoid precision loss, do not convert an initializer to fp16 when it is used only by fp32 nodes. 
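        # The loop variable above is renamed to `_key` because only the value is
        # read; a leading underscore is the usual way to mark an intentionally
        # unused variable (the pattern flake8-bugbear's B007 check looks for).
        # A minimal sketch of the pattern, with a hypothetical dict:
        #   sizes = {"weight": 1024, "bias": 16}
        #   total = 0
        #   for _name, size in sizes.items():
        #       total += size  # the key is never used, hence `_name`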
if force_fp16_initializers or value.fp16_nodes: value.initializer = convert_tensor_float_to_float16(value.initializer, min_positive_val, max_finite_val) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index c05424e39cc38..093d3c430416d 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -112,7 +112,7 @@ def __init__( num_heads: int, attention_mask: AttentionMask, use_multi_head_attention: bool = False, - search_op_types: List[str] = ["SkipLayerNormalization", "LayerNormalization"], + search_op_types: List[str] = ["SkipLayerNormalization", "LayerNormalization"], # noqa: B006 ): attention_op_name = "MultiHeadAttention" if use_multi_head_attention else "Attention" super().__init__(model, attention_op_name, search_op_types) @@ -454,7 +454,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return other_inputs = [] - for i, input in enumerate(start_node.input): + for _i, input in enumerate(start_node.input): if input not in output_name_to_node: continue diff --git a/onnxruntime/python/tools/transformers/fusion_bias_add.py b/onnxruntime/python/tools/transformers/fusion_bias_add.py index cdf54a3629726..8489af0940983 100644 --- a/onnxruntime/python/tools/transformers/fusion_bias_add.py +++ b/onnxruntime/python/tools/transformers/fusion_bias_add.py @@ -36,7 +36,7 @@ def fuse(self, add_node, input_name_to_nodes: Dict, output_name_to_node: Dict): skip_layer_norm = nodes[-1] # Check skip connection is from SkipLayerNormalization output - if not (add_node.input[1] in skip_layer_norm.output): + if add_node.input[1] not in skip_layer_norm.output: return bias_index, bias_value = self.model.get_constant_input(bias_node) diff --git a/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py b/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py index 106d3de25d39d..67a7c0fb9ceb3 100644 --- a/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py +++ b/onnxruntime/python/tools/transformers/fusion_biassplitgelu.py @@ -79,7 +79,8 @@ def fuse(self, gelu_node, input_name_to_nodes: Dict, output_name_to_node: Dict): ): # end index of slice_before_mul is start index of slice_before_gelu return - subgraph_nodes = start_index_nodes + [ + subgraph_nodes = [ + *start_index_nodes, end_index_nodes[0], mul_after_gelu, gelu_node, diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py index 16c15e7a0523e..f8396d4919947 100644 --- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py +++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py @@ -112,7 +112,12 @@ def check_attention_subgraph( logger.debug("No Attention like subgraph in children of LayerNormalization") return False else: - if children_types != ["Add", "MatMul", "MatMul", "MatMul",] and children_types != [ + if children_types != [ + "Add", + "MatMul", + "MatMul", + "MatMul", + ] and children_types != [ "MatMul", "MatMul", "MatMul", @@ -245,11 +250,11 @@ def match_position_embedding_bert(self, position_embedding_gather, input_ids, ou / Add (optional, B=0) / | Gather (segment_ids) Unsqueeze (axes=0) - \ | | - \ Gather Slice (data[1,512], starts=0, ends=*, axes=1, steps=1) - \ / | + \\ | | + \\ Gather Slice (data[1,512], starts=0, ends=*, axes=1, steps=1) + \\ / | Add Gather - \ / + \\ / Add | LayerNormalization diff --git 
a/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py b/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py index ba231e9e05ea4..085723ce75c61 100644 --- a/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py +++ b/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from logging import getLogger +from logging import getLogger # noqa: F401 from fusion_base import Fusion from onnx import helper diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py index b8a1cbb9f2044..964d49e8fecdd 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py @@ -7,7 +7,7 @@ import numpy as np from fusion_base import Fusion from fusion_utils import FusionUtils -from onnx import TensorProto, helper, numpy_helper +from onnx import TensorProto, helper, numpy_helper # noqa: F401 from onnx_model import OnnxModel logger = getLogger(__name__) @@ -47,7 +47,7 @@ def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): logger.debug("match_past_pattern_1: expect Gather for past") return None - if not self.model.find_constant_input(gather, 1) == 1: + if self.model.find_constant_input(gather, 1) != 1: logger.debug("match_past_pattern_1: expect indices=1 for Gather of past") return None past = gather.input[0] @@ -62,7 +62,7 @@ def match_past_pattern_1(self, concat_k, concat_v, output_name_to_node): return None gather_past_k = past_k_nodes[-1] - if not self.model.find_constant_input(gather_past_k, 0) == 1: + if self.model.find_constant_input(gather_past_k, 0) != 1: logger.debug("match_past_pattern_1: expect indices=0 for Gather k of past") return None past_k = gather_past_k.input[0] @@ -335,7 +335,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # (2) SkipLayerNorm fusion was turned ON but upstream layer's LayerNorm + Add was not # fused into a SkipLayerNorm. This can happen if the shapes to the Add node are different. # So, keep the following check if SkipLayerNorm fusion is turned ON or OFF. 
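        # On the `not in` rewrite just below: Python parses `not x in y` as
        # `not (x in y)`, so the new spelling is semantically identical and only
        # improves readability (flake8 E713). Illustration with hypothetical data:
        #   inputs = ["layernorm_out", "residual"]
        #   assert (not "past" in inputs) == ("past" not in inputs)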
- if another_input is not None and not another_input in layernorm_before_attention.input: + if another_input is not None and another_input not in layernorm_before_attention.input: logger.debug("Upstream Add and (Skip)LayerNormalization shall have one same input") return diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py index 1c0b0b7074745..4d5aac78efb76 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_megatron.py @@ -5,10 +5,10 @@ from logging import getLogger import numpy as np -from fusion_base import Fusion +from fusion_base import Fusion # noqa: F401 from fusion_gpt_attention import FusionGptAttentionPastBase -from fusion_utils import FusionUtils -from onnx import TensorProto, helper, numpy_helper +from fusion_utils import FusionUtils # noqa: F401 +from onnx import TensorProto, helper, numpy_helper # noqa: F401 from onnx_model import OnnxModel logger = getLogger(__name__) diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py index 8176be523bcca..3a83de5f40bba 100644 --- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py +++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py @@ -4,10 +4,10 @@ # -------------------------------------------------------------------------- from logging import getLogger -import numpy as np +import numpy as np # noqa: F401 from fusion_base import Fusion -from fusion_utils import FusionUtils -from onnx import TensorProto, helper, numpy_helper +from fusion_utils import FusionUtils # noqa: F401 +from onnx import TensorProto, helper, numpy_helper # noqa: F401 from onnx_model import OnnxModel logger = getLogger(__name__) @@ -146,9 +146,9 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # fused into a SkipLayerNorm. This can happen if the shapes to the Add node are different. # So, keep the following check if SkipLayerNorm fusion is turned ON or OFF. if another_input is not None: - if not another_input in layernorm_before_attention.input: + if another_input not in layernorm_before_attention.input: # match openai-gpt - if not another_input in layernorm_before_attention.output: + if another_input not in layernorm_before_attention.output: logger.debug("Add and (Skip)LayerNormalization shall have one same input") return diff --git a/onnxruntime/python/tools/transformers/fusion_layernorm.py b/onnxruntime/python/tools/transformers/fusion_layernorm.py index 893d3283691be..e817e1a5892ff 100644 --- a/onnxruntime/python/tools/transformers/fusion_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_layernorm.py @@ -84,7 +84,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): return pow_node = parent_nodes[3] - if not self.model.find_constant_input(pow_node, 2.0) == 1: + if self.model.find_constant_input(pow_node, 2.0) != 1: return mul_node = input_name_to_nodes[div_node.output[0]][0] @@ -106,7 +106,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): input_name_to_nodes, output_name_to_node, ): - logger.debug(f"It is not safe to fuse LayerNormalization node. Skip") + logger.debug("It is not safe to fuse LayerNormalization node. 
Skip") return weight_input = mul_node.input[1 - self.model.input_index(div_node.output[0], mul_node)] diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_attention.py b/onnxruntime/python/tools/transformers/fusion_qordered_attention.py index b3d8743414b91..fb020298bc210 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_attention.py @@ -128,7 +128,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # Identify the root input to the Attention node other_inputs = [] - for i, input in enumerate(start_node.input): + for _i, input in enumerate(start_node.input): if input not in output_name_to_node: continue diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py index a92c8f94d49af..6c44bb11e24dc 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_gelu.py @@ -81,7 +81,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): input_name_to_nodes, output_name_to_node, ): - logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip") + logger.debug("It is not safe to fuse QOrderedGelu node. Skip") return self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py index f8198bcaa1419..cf2b357721757 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py @@ -83,7 +83,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): input_name_to_nodes, output_name_to_node, ): - logger.debug(f"It is not safe to fuse QOrderedLayerNormalization node. Skip") + logger.debug("It is not safe to fuse QOrderedLayerNormalization node. Skip") return self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py b/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py index 2fbd3262684ce..681160479faef 100644 --- a/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py +++ b/onnxruntime/python/tools/transformers/fusion_qordered_matmul.py @@ -170,7 +170,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): if not self.model.is_safe_to_fuse_nodes( subgraph_nodes, downstream_quantize_node.output, input_name_to_nodes, output_name_to_node ): - logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip") + logger.debug("It is not safe to fuse QOrderedMatMul node. 
Skip") return # Deal with the case where-in the Attention subgraph is not fused diff --git a/onnxruntime/python/tools/transformers/fusion_shape.py b/onnxruntime/python/tools/transformers/fusion_shape.py index a6a74719b9c42..11d6b7a8d3cf4 100644 --- a/onnxruntime/python/tools/transformers/fusion_shape.py +++ b/onnxruntime/python/tools/transformers/fusion_shape.py @@ -58,7 +58,7 @@ def fuse( Gather(indices=0) Gather(indices=1) | | Unsqueeze(axes=0) Unsqueeze(axes=0) - \ / + \\ / Concat | diff --git a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py index 34606135b9727..2737369361d7d 100644 --- a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py +++ b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py @@ -42,7 +42,7 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): return for add_input in add.input: - if self.model.get_initializer(add_input) != None: + if self.model.get_initializer(add_input) is not None: return # The number of input node of add should be 2 @@ -159,15 +159,15 @@ def fuse(self, node, input_name_to_nodes, output_name_to_node): bias_weight = NumpyHelper.to_array(initializer) break if bias_weight is None: - logger.debug(f"Bias weight not found") + logger.debug("Bias weight not found") return if len(bias_weight.shape) != 1: - logger.debug(f"Bias weight is not 1D") + logger.debug("Bias weight is not 1D") return subgraph_nodes = [node, add] if not self.model.is_safe_to_fuse_nodes(subgraph_nodes, node.output, input_name_to_nodes, output_name_to_node): - logger.debug(f"Skip fusing SkipLayerNormalization with Bias since it is not safe") + logger.debug("Skip fusing SkipLayerNormalization with Bias since it is not safe") return self.nodes_to_remove.extend(subgraph_nodes) diff --git a/onnxruntime/python/tools/transformers/fusion_transpose.py b/onnxruntime/python/tools/transformers/fusion_transpose.py index 8c4f867bdb461..286adf0fce42c 100644 --- a/onnxruntime/python/tools/transformers/fusion_transpose.py +++ b/onnxruntime/python/tools/transformers/fusion_transpose.py @@ -69,7 +69,7 @@ def fuse( assert len(parent_permutation) == len(permutation) output_permutation = [] - for j, index in enumerate(permutation): + for _j, index in enumerate(permutation): output_permutation.append(parent_permutation[index]) if cast_node is None: diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 0945be6cc6898..afc968fab46c1 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -130,9 +130,7 @@ def check_node_attribute(node, attribute_name: str, expected_value, default_valu value = helper.get_attribute_value(attr) if isinstance(expected_value, list): - return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( - expected_value, value, equal_nan=False - ) + return (isinstance(value, (ndarray, list))) and array_equal(expected_value, value, equal_nan=False) else: return value == expected_value @@ -172,7 +170,7 @@ def check_qdq_node_for_fusion(node: NodeProto, model: OnnxModel, allow_per_tenso Returns: bool: whether the check is passed or not """ - if not node.op_type in {"QuantizeLinear", "DequantizeLinear"}: + if node.op_type not in {"QuantizeLinear", "DequantizeLinear"}: logger.debug(f"Provided node is not a Q/DQ node. 
Op Type: {node.op_type}") scale = model.get_constant_value(node.input[1]) @@ -219,9 +217,7 @@ def check_node_input_value(self, node, input_index: int, expected_value): value = self.model.get_constant_value(node.input[input_index]) if isinstance(expected_value, list): - return (isinstance(value, ndarray) or isinstance(value, list)) and array_equal( - expected_value, value, equal_nan=False - ) + return (isinstance(value, (ndarray, list))) and array_equal(expected_value, value, equal_nan=False) else: return value == expected_value diff --git a/onnxruntime/python/tools/transformers/io_binding_helper.py b/onnxruntime/python/tools/transformers/io_binding_helper.py index 3182107cd8050..0715395268ee8 100644 --- a/onnxruntime/python/tools/transformers/io_binding_helper.py +++ b/onnxruntime/python/tools/transformers/io_binding_helper.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, List, Union +from typing import Dict, List import numpy import torch @@ -12,14 +12,14 @@ class TypeHelper: @staticmethod def get_input_type(ort_session: InferenceSession, name: str) -> str: - for i, input in enumerate(ort_session.get_inputs()): + for _i, input in enumerate(ort_session.get_inputs()): if input.name == name: return input.type raise ValueError(f"input name {name} not found") @staticmethod def get_output_type(ort_session, name: str) -> str: - for i, output in enumerate(ort_session.get_outputs()): + for _i, output in enumerate(ort_session.get_outputs()): if output.name == name: return output.type diff --git a/onnxruntime/python/tools/transformers/machine_info.py b/onnxruntime/python/tools/transformers/machine_info.py index e872e2a6c00c6..6d10b855d7e7d 100644 --- a/onnxruntime/python/tools/transformers/machine_info.py +++ b/onnxruntime/python/tools/transformers/machine_info.py @@ -9,9 +9,9 @@ import json import logging import platform -import sys +import sys # noqa: F401 from os import environ -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, Union # noqa: F401 import cpuinfo import psutil @@ -102,7 +102,7 @@ def get_gpu_info_by_nvml(self) -> Dict: try: nvmlInit() driver_version = nvmlSystemGetDriverVersion() - deviceCount = nvmlDeviceGetCount() + deviceCount = nvmlDeviceGetCount() # noqa: N806 for i in range(deviceCount): handle = nvmlDeviceGetHandleByIndex(i) info = nvmlDeviceGetMemoryInfo(handle) diff --git a/onnxruntime/python/tools/transformers/models/bart/export.py b/onnxruntime/python/tools/transformers/models/bart/export.py index c1e0f3224a445..5c32f9873ee22 100644 --- a/onnxruntime/python/tools/transformers/models/bart/export.py +++ b/onnxruntime/python/tools/transformers/models/bart/export.py @@ -32,7 +32,6 @@ def print_args(args): def user_command(): - parent_parser = argparse.ArgumentParser(add_help=False) parent_parser.add_argument("--max_length", type=int, default=20, help="default to 20") parent_parser.add_argument("--min_length", type=int, default=0, help="default to 0") @@ -66,12 +65,11 @@ def user_command(): if __name__ == "__main__": - args = user_command() if args.opset_version < 14: raise ValueError(f"The minimum supported opset version is 14! 
The given one was {args.opset_version}.") - isExist = os.path.exists(args.output) + isExist = os.path.exists(args.output) # noqa: N816 if not isExist: os.makedirs(args.output) @@ -87,14 +85,14 @@ def user_command(): ) if not args.no_encoder: - logger.info(f"========== EXPORTING ENCODER ==========") + logger.info("========== EXPORTING ENCODER ==========") export_summarization_edinit.export_encoder(args) if not args.no_decoder: - logger.info(f"========== EXPORTING DECODER ==========") + logger.info("========== EXPORTING DECODER ==========") export_summarization_enc_dec_past.export_decoder(args) if not args.no_chain: - logger.info(f"========== CONVERTING MODELS ==========") + logger.info("========== CONVERTING MODELS ==========") chain_enc_dec_with_beamsearch.convert_model(args) if not args.no_inference: - logger.info(f"========== INFERENCING WITH ONNX MODEL ==========") + logger.info("========== INFERENCING WITH ONNX MODEL ==========") onnx_inference.run_inference(args) diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py b/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py index 4230684e5e7ee..e729b07013774 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/chain_enc_dec_with_beamsearch.py @@ -88,7 +88,7 @@ def convert_model(args): ] outputs = ["sequences"] - node = helper.make_node("BeamSearch", inputs=inputs, outputs=outputs, name=f"BeamSearch_zcode") + node = helper.make_node("BeamSearch", inputs=inputs, outputs=outputs, name="BeamSearch_zcode") node.domain = "com.microsoft" # NOTE: take value from args or config node.attribute.extend( diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py index f8dc0051b2b9e..111520a6e3aeb 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py @@ -89,7 +89,6 @@ def _create_encoder_export(args, config: BartConfig): def _prepare_encoder_decoder_kwargs_for_generation( self, input_ids: torch.Tensor, model_kwargs, model_input_name: Optional[str] = None ) -> Dict[str, Any]: - # retrieve encoder hidden states # 1. 
get encoder encoder = self.get_encoder() @@ -157,7 +156,7 @@ def _prepare_encoder_decoder_kwargs_for_generation( opset_version=args.opset_version, do_constant_folding=False, input_names=["encoder_input_ids", "encoder_attention_mask", "decoder_input_ids"], - output_names=["logits", "encoder_hidden_states"] + output_past_names, + output_names=["logits", "encoder_hidden_states", *output_past_names], dynamic_axes=dynamic_axes, export_params=True, verbose=True, @@ -189,7 +188,6 @@ def export_encoder(args): config, tokenizer = export_helper.initialize_config(args) with torch.no_grad(): - model, input_data = export_helper.initialize_model(config, tokenizer, args) start_time = time.time() model._prepare_encoder_decoder_kwargs_for_generation = _create_encoder_export(args, config).__get__( diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py index 7e50c1dbc2aac..29c39730c79ef 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py @@ -154,9 +154,9 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwarg else: decoder_config.is_decoder_exported = True - input_names = ["input_ids", "encoder_attention_mask", "encoder_hidden_states"] + input_past_names + input_names = ["input_ids", "encoder_attention_mask", "encoder_hidden_states", *input_past_names] output_past_names = export_helper.get_output_names(past_outputs) - output_names = ["logits"] + output_past_names + output_names = ["logits", *output_past_names] sequence_length = "1" num_heads = str(decoder_config.encoder_attention_heads) @@ -216,7 +216,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwarg sess = InferenceSession(onnx_model_path, sess_options, providers=["CPUExecutionProvider"]) out = sess.run(None, ort_inputs) - for ort_out, torch_out in zip(out, [logits] + present): + for ort_out, torch_out in zip(out, [logits, *present]): torch.testing.assert_close(ort_out, torch_out.cpu().numpy(), check_dtype=True, atol=1e-4, rtol=1e-2) print("========== [SUCCESS] ORT inference test on Decoder ==========") @@ -249,7 +249,6 @@ def export_decoder(args): config = decoder_config_update(config) with torch.no_grad(): - model, input_data = export_helper.initialize_model(config, tokenizer, args) start_time = time.time() diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py index 5375fa9aac5f1..c4c8a2dcba697 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py @@ -32,7 +32,6 @@ def run_inference(args): config, tokenizer = export_helper.initialize_config(args) with torch.no_grad(): - model, input_data = export_helper.initialize_model(config, tokenizer, args) batch_num = 3 input_data = input_data.repeat(batch_num, 1) diff --git a/onnxruntime/python/tools/transformers/models/bert/eval_squad.py b/onnxruntime/python/tools/transformers/models/bert/eval_squad.py index 495c0f017bbf7..66265d7b1ea71 100644 --- a/onnxruntime/python/tools/transformers/models/bert/eval_squad.py +++ b/onnxruntime/python/tools/transformers/models/bert/eval_squad.py @@ -138,13 +138,13 @@ def output_summary(results: 
List[Dict[str, Any]], csv_filename: str, metric_name "use_io_binding", ] - model_list = list(set([result["onnx_path"] for result in results])) + model_list = list({result["onnx_path"] for result in results}) model_list.sort() - batch_sizes = list(set([result["batch_size"] for result in results])) + batch_sizes = list({result["batch_size"] for result in results}) batch_sizes.sort() - sequence_lengths = list(set([result["sequence_length"] for result in results])) + sequence_lengths = list({result["sequence_length"] for result in results}) sequence_lengths.sort() key_names = [] diff --git a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py index 01a5e5d8883d7..e8553e2cae0f7 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/benchmark_gpt2.py @@ -21,14 +21,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import ( +from benchmark_helper import ( # noqa: E402 Precision, create_onnxruntime_session, get_ort_environment_variables, prepare_environment, setup_logger, ) -from quantize_helper import QuantizeHelper +from quantize_helper import QuantizeHelper # noqa: E402 logger = logging.getLogger("") @@ -404,8 +404,8 @@ def main(args): "onnxruntime_latency": f"{ort_latency:.2f}", } csv_writer.writerow(row) - except: - logger.error(f"Exception", exc_info=True) + except Exception: + logger.error("Exception", exc_info=True) return None logger.info(f"Results are saved to file {csv_filename}") diff --git a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py index 78e718e6e80c4..1f56ff11b35a5 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/convert_to_onnx.py @@ -30,14 +30,14 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import ( +from benchmark_helper import ( # noqa: E402 Precision, create_onnxruntime_session, get_ort_environment_variables, prepare_environment, setup_logger, ) -from quantize_helper import QuantizeHelper +from quantize_helper import QuantizeHelper # noqa: E402 logger = logging.getLogger("") @@ -348,7 +348,7 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename: ) nodes = m.nodes() - op_list = set([node.op_type for node in nodes]) + op_list = {node.op_type for node in nodes} all_ops = ",".join(op_list) # print optimized operators @@ -372,7 +372,7 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename: output_path = args.output logger.info(f"Output path: {output_path}") - model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) + model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) # noqa: N806 session = create_onnxruntime_session( output_path, args.use_gpu, args.provider, enable_all_optimization=True, verbose=args.verbose @@ -496,7 +496,7 @@ def main(argv=None, experiment_name: str = "", run_id: str = "0", csv_filename: "nan_rate": parity_result["nan_rate"], "top1_match_rate": parity_result["top1_match_rate"], "top1_match_rate_per_run": parity_result["top1_match_rate_per_run"], - "onnx_size_in_MB": "{}".format(model_size_in_MB), + "onnx_size_in_MB": f"{model_size_in_MB}", } logger.info(f"result: {row}") 
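    # About the `# noqa: N806` markers in this file: pep8-naming's N806 rule wants
    # function-local variables lowercased; the patch keeps the existing
    # `model_size_in_MB` name and suppresses the check instead. A `# noqa: <code>`
    # comment silences only the listed rule on that one line, e.g. (hypothetical):
    #   model_size_in_MB = 42  # noqa: N806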
result.update(row) diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py index 7eec8575f79a4..e5a8d4d2e274a 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_helper.py @@ -22,11 +22,11 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import Precision -from float16 import float_to_float16_max_diff -from io_binding_helper import IOBindingHelper -from onnx_model import OnnxModel -from torch_onnx_export_helper import torch_onnx_export +from benchmark_helper import Precision # noqa: E402 +from float16 import float_to_float16_max_diff # noqa: E402 +from io_binding_helper import IOBindingHelper # noqa: E402 +from onnx_model import OnnxModel # noqa: E402 +from torch_onnx_export_helper import torch_onnx_export # noqa: E402 logger = logging.getLogger(__name__) @@ -114,7 +114,7 @@ def forward(self, input_ids, position_ids, attention_mask, *past): return MyGPT2Model.post_process(result, self.config.n_layer) -class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel): +class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel): # noqa: N801 """Here we wrap a class for Onnx model conversion for GPT2LMHeadModel with past state and no padding. When you always use batch_size=1 in inference, there is no padding in inputs. In such case, position_ids and attention_mask need no be in inputs. @@ -426,7 +426,7 @@ def export_onnx( # GPT2Model outputs last_state; GPT2LMHeadModel outputs logits (prediction_scores) assert outputs[0].shape[2] == config.vocab_size or outputs[0].shape[2] == config.hidden_size - output_names = ["logits" if outputs[0].shape[2] == config.vocab_size else "last_state"] + present_names + output_names = ["logits" if outputs[0].shape[2] == config.vocab_size else "last_state", *present_names] # Shape of input tensors: # input_ids: (batch_size, seq_len) @@ -551,7 +551,7 @@ def optimize_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: List[str] = [ + op_block_list: List[str] = [ # noqa: B006 "Add", "LayerNormalization", "SkipLayerNormalization", @@ -568,7 +568,7 @@ def auto_mixed_precision( Returns: parameters(dict): a dictionary of parameters used in float16 conversion """ - op_full_set = set([node.op_type for node in onnx_model.nodes()]) + op_full_set = {node.op_type for node in onnx_model.nodes()} fp32_op_set = set(op_block_list) fp16_op_set = op_full_set.difference(fp32_op_set) logger.info(f"fp32 op: {fp32_op_set} fp16 op: {fp16_op_set}") @@ -647,7 +647,7 @@ def pytorch_inference(model, inputs: Gpt2Inputs, total_runs: int = 0): @staticmethod def onnxruntime_inference(ort_session, inputs: Gpt2Inputs, total_runs: int = 0): """Run inference of ONNX model, and returns average latency in ms when total_runs > 0 besides outputs.""" - logger.debug(f"start onnxruntime_inference") + logger.debug("start onnxruntime_inference") ort_inputs = {"input_ids": numpy.ascontiguousarray(inputs.input_ids.cpu().numpy())} @@ -715,7 +715,7 @@ def onnxruntime_inference_with_binded_io( include_copy_output_latency: bool = False, ): """Inference with IO binding. 
Returns outputs, and optional latency when total_runs > 0.""" - logger.debug(f"start onnxruntime_inference_with_binded_io") + logger.debug("start onnxruntime_inference_with_binded_io") # Bind inputs and outputs to onnxruntime session io_binding = Gpt2Helper.prepare_io_binding( @@ -888,8 +888,7 @@ def test_parity( if max_abs_diff_list: result = { - f"max_diff_percentile_{p}": "{:.5f}".format(numpy.percentile(max_abs_diff_list, p)) - for p in [50, 90, 95, 99] + f"max_diff_percentile_{p}": f"{numpy.percentile(max_abs_diff_list, p):.5f}" for p in [50, 90, 95, 99] } else: result = {f"max_diff_percentile_{p}": "nan" for p in [50, 90, 95, 99]} @@ -988,7 +987,7 @@ def get_onnx_paths( model_class: str = "GPT2LMHeadModel", has_past=True, new_folder=False, - remove_existing=["raw", "fp32", "fp16", "int8"], + remove_existing=["raw", "fp32", "fp16", "int8"], # noqa: B006 ): """Build a path name for given model based on given attributes.""" model_name = model_name_or_path diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py index e48fcc1cfc119..3bcb80478e730 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_parity.py @@ -26,7 +26,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import get_ort_environment_variables, setup_logger +from benchmark_helper import get_ort_environment_variables, setup_logger # noqa: E402 logger = logging.getLogger("") @@ -113,14 +113,14 @@ def run(self, argv, experiment_name): try: result = main( - argv + ["-t", f"{self.test_cases}", "-r", f"{self.total_runs}"], + [*argv, "-t", f"{self.test_cases}", "-r", f"{self.total_runs}"], experiment_name=experiment_name, run_id=run_id, csv_filename=self.csv_path, ) if result: self.results.append(result) - except: + except Exception: logger.exception(f"Failed to run experiment {experiment_name}") result = None @@ -150,7 +150,7 @@ def score(row): """Scoring function based on 3 metrics. 
The larger score is better.""" latency_in_ms = get_latency(row) top1_match_rate = float(row["top1_match_rate"]) - onnx_size_in_MB = float(row["onnx_size_in_MB"]) + onnx_size_in_MB = float(row["onnx_size_in_MB"]) # noqa: N806 # A simple scoring function: cost of 0.1ms latency ~ 0.1% match rate ~ 100MB size return top1_match_rate * 1000 - latency_in_ms * 10 - onnx_size_in_MB / 100 @@ -321,7 +321,7 @@ def get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list): ] if op_block_list: - parameters.extend(["--op_block_list"] + op_block_list) + parameters.extend(["--op_block_list", *op_block_list]) return parameters @@ -330,7 +330,7 @@ def run_candidate( task: ParityTask, args, last_matmul_node_name, - op_block_list=["FastGelu", "LayerNormalization"], + op_block_list=["FastGelu", "LayerNormalization"], # noqa: B006 ): parameters = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list) op_block_list_str = ",".join(sorted(op_block_list)) @@ -407,9 +407,9 @@ def run_tuning_step2(task, mixed_precision_baseline, optimized_ops): fp32_ops = [x for x in candidate_fp32_ops if x in optimized_ops] for op in optimized_ops: if op not in fp32_ops: - op_block_list = fp32_ops + [op] + op_block_list = [*fp32_ops, op] task.run( - mixed_precision_baseline + ["--op_block_list"] + op_block_list, + [*mixed_precision_baseline, "--op_block_list", *op_block_list], "Mixed precision baseline + {},{} in FP32".format(",".join(fp32_ops), op), ) @@ -450,7 +450,8 @@ def run_parity(task: ParityTask, args): # Mixed precision baseline run_candidate(task, args, last_matmul_node_name, op_block_list=[]) - get_fp32_ops = lambda x: [op for op in x if op in all_ops] + def get_fp32_ops(x): + return [op for op in x if op in all_ops] if args.all: run_tuning_step0(task, fp16_baseline, all_ops, optimized_ops) @@ -509,7 +510,7 @@ def run_parity(task: ParityTask, args): try: rows = load_results_from_csv(task.csv_path) - except: + except Exception: logger.exception(f"Failed to load csv {task.csv_path}") rows = task.results diff --git a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py index be303b4e188bf..80fbbf8b380f3 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/gpt2_tester.py @@ -17,7 +17,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import Precision +from benchmark_helper import Precision # noqa: E402 logger = logging.getLogger(__name__) @@ -61,12 +61,12 @@ def print(self): for key in sorted(self.seq_len_latency.keys()): average = statistics.mean(self.seq_len_latency[key]) * 1000.0 if key == 0: - print("\t{}: \t{:.2f} ms".format(key, average)) + print(f"\t{key}: \t{average:.2f} ms") else: - print("\t[{}, {}]:\t{:.2f} ms".format(2**key, 2 ** (key + 1) - 1, average)) + print(f"\t[{2**key}, {2 ** (key + 1) - 1}]:\t{average:.2f} ms") total += average * len(self.seq_len_latency[key]) count += len(self.seq_len_latency[key]) - print("Average Latency: {:.2f} ms".format(total / count)) + print(f"Average Latency: {total / count:.2f} ms") def diff_logits(self, baseline_logits, treatment_logits, is_empty_past: bool): diff = (baseline_logits - treatment_logits).abs().max() @@ -130,7 +130,6 @@ def __init__( top_k=20, top_k_required_order=False, ): - self.batch_size = input_ids.shape[0] self.input_length = input_ids.shape[1] self.n_layer = num_layer @@ -151,7 +150,7 @@ def __init__( 0, hidden_size // 
num_attention_heads, ] - for i in range(num_layer): + for _i in range(num_layer): empty_past = torch.empty(past_shape).type(torch.float16 if is_fp16 else torch.float32) self.past.append(empty_past.to(device)) @@ -190,15 +189,15 @@ def add_tensor(input_tensors, torch_tensor, name): add_tensor(input_tensors, self.past[i], "past_" + str(i)) for i, tensor in enumerate(input_tensors): - with open(os.path.join(path, "input_{}.pb".format(i)), "wb") as f: + with open(os.path.join(path, f"input_{i}.pb"), "wb") as f: f.write(tensor.SerializeToString()) output_names = [output.name for output in session.get_outputs()] - for i, name in enumerate(output_names): + for i, _name in enumerate(output_names): tensor = numpy_helper.from_array( output[i] if isinstance(output[i], numpy.ndarray) else output[i].clone().cpu().numpy() ) - with open(os.path.join(path, "output_{}.pb".format(i)), "wb") as f: + with open(os.path.join(path, f"output_{i}.pb"), "wb") as f: f.write(tensor.SerializeToString()) print(f"Test data saved to directory {path}") @@ -290,9 +289,9 @@ def predict_next_token(logits, top_k=1, required_order=False): # logits has shape (batch_size, seq_len, vocab_size) # last token logits has shape (batch_size, vocab_size) - lastTokenLogits = logits[:, -1] + lastTokenLogits = logits[:, -1] # noqa: N806 if top_k == 1: - generatedTokens = torch.argmax(lastTokenLogits, 1, True) + generatedTokens = torch.argmax(lastTokenLogits, 1, True) # noqa: N806 return generatedTokens else: topk = torch.argsort(lastTokenLogits, -1, descending=True)[:, :top_k] @@ -462,7 +461,10 @@ def test_generation( ) Gpt2Helper.auto_increase_buffer_size(output_buffers, output_shapes) - (onnx_io_output, avg_latency_ms,) = Gpt2Helper.onnxruntime_inference_with_binded_io( + ( + onnx_io_output, + avg_latency_ms, + ) = Gpt2Helper.onnxruntime_inference_with_binded_io( session, onnx_io_runner.get_inputs(), output_buffers, diff --git a/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py b/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py index c122e243293aa..6c7f02812fa6c 100644 --- a/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py +++ b/onnxruntime/python/tools/transformers/models/gpt2/parity_check_helper.py @@ -19,7 +19,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from benchmark_helper import create_onnxruntime_session +from benchmark_helper import create_onnxruntime_session # noqa: E402 NON_ZERO_VALUE = str(1) ZERO_VALUE = str(0) @@ -107,7 +107,7 @@ def post_processing(outputs_path, outputs_path_other): record[Path(filename).name.split(".")[0]] = diff if_close[Path(filename).name.split(".")[0]] = numpy.allclose(array, array_other, rtol=1e-04, atol=1e-04) - results = [f"Node\tDiff\tClose"] + results = ["Node\tDiff\tClose"] for k, v in sorted(record.items(), key=lambda x: x[1], reverse=True): results.append(f"{k}\t{v}\t{if_close[k]}") for line in results: diff --git a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py index 679004c6ea89c..bf6c1e60308be 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py +++ b/onnxruntime/python/tools/transformers/models/longformer/benchmark_longformer.py @@ -51,7 +51,7 @@ import onnxruntime sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -import benchmark_helper +import benchmark_helper # noqa: E402 logger = logging.getLogger("") @@ -80,7 
+80,7 @@ def test_torch_latency( input_list = inputs.to_list() _ = model(*input_list) - runtimes = timeit.repeat(lambda: model(*input_list), repeat=test_times, number=1) + runtimes = timeit.repeat(lambda: model(*input_list), repeat=test_times, number=1) # noqa: B023 result = { "engine": "torch", # TODO: test torchscript "version": torch.__version__, @@ -404,7 +404,7 @@ def test_torch(args, device) -> List[Dict[str, Any]]: def test_latency(args, device) -> List[Dict[str, Any]]: - if "onnxruntime" == args.engine: + if args.engine == "onnxruntime": return test_ort(args, device) return test_torch(args, device) @@ -647,7 +647,7 @@ def run_tests( latency_results = launch_test(args) except KeyboardInterrupt as exc: raise RuntimeError("Keyboard Interrupted") from exc - except: + except Exception: traceback.print_exc() continue @@ -675,13 +675,13 @@ def output_summary(results, csv_filename, data_field="average_latency_ms"): "description", ] - description_list = list(set([result["description"] for result in results])) + description_list = list({result["description"] for result in results}) description_list.sort() - batch_sizes = list(set([result["batch_size"] for result in results])) + batch_sizes = list({result["batch_size"] for result in results}) batch_sizes.sort() - sequence_lengths = list(set([result["sequence_length"] for result in results])) + sequence_lengths = list({result["sequence_length"] for result in results}) sequence_lengths.sort() data_names = [] diff --git a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py index 7427b65a2bf36..41d32432b9804 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/longformer/convert_to_onnx.py @@ -47,8 +47,8 @@ from transformers import LongformerModel, LongformerSelfAttention sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from onnx_model_bert import BertOnnxModel -from torch_onnx_export_helper import torch_onnx_export +from onnx_model_bert import BertOnnxModel # noqa: E402 +from torch_onnx_export_helper import torch_onnx_export # noqa: E402 # Supports format 0 or 1 weight_bias_format = 0 @@ -149,7 +149,6 @@ def parse_arguments(): # Create a dummy input for ONNX export. 
def get_dummy_inputs(config, export_padding, device): - # When sequence length is multiple of windows size, there is no padding logic in ONNX graph sequence_length = config.attention_window[0] + 1 if export_padding else config.attention_window[0] @@ -266,7 +265,7 @@ def my_longformer_self_attention_forward_4_3( is_global_attn=None, output_attentions=False, ): - assert output_attentions == False + assert output_attentions is False return my_longformer_self_attention_forward_4( self, hidden_states, @@ -288,7 +287,7 @@ def my_longformer_self_attention_forward_4_3_2( is_global_attn=None, output_attentions=False, ): - assert output_attentions == False + assert output_attentions is False assert layer_head_mask is None return my_longformer_self_attention_forward_4( self, diff --git a/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py b/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py index 379efce27b27a..735d2d4899041 100644 --- a/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py +++ b/onnxruntime/python/tools/transformers/models/longformer/generate_test_data.py @@ -12,11 +12,11 @@ from pathlib import Path import numpy as np -from onnx import ModelProto, TensorProto, numpy_helper +from onnx import ModelProto, TensorProto, numpy_helper # noqa: F401 sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from bert_test_data import fake_input_ids_data, fake_input_mask_data, output_test_data -from onnx_model import OnnxModel +from bert_test_data import fake_input_ids_data, fake_input_mask_data, output_test_data # noqa: E402 +from onnx_model import OnnxModel # noqa: E402 def parse_arguments(): @@ -131,7 +131,7 @@ def get_longformer_inputs(onnx_file, input_ids_name=None, input_mask_name=None, return input_ids, input_mask, global_mask if len(graph_inputs) != 3: - raise ValueError("Expect the graph to have 3 inputs. Got {}".format(len(graph_inputs))) + raise ValueError(f"Expect the graph to have 3 inputs. Got {len(graph_inputs)}") # Try guess the inputs based on naming. input_ids = None @@ -264,7 +264,6 @@ def create_longformer_test_data( global_mask_name, num_global_tokens, ): - input_ids, input_mask, global_mask = get_longformer_inputs(model, input_ids_name, input_mask_name, global_mask_name) all_inputs = generate_test_data( batch_size, @@ -290,7 +289,7 @@ def main(): # Default output directory is a sub-directory under the directory of model. 
output_dir = os.path.join( Path(args.model).parent, - "b{}_s{}_g{}".format(args.batch_size, args.sequence_length, args.global_tokens), + f"b{args.batch_size}_s{args.sequence_length}_g{args.global_tokens}", ) if output_dir is not None: diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py index 06b5127dc3f64..46e46accf99fe 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/optimize_pipeline.py @@ -34,11 +34,11 @@ import onnxruntime sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from fusion_options import FusionOptions -from onnx_model_clip import ClipOnnxModel -from onnx_model_unet import UnetOnnxModel -from onnx_model_vae import VaeOnnxModel -from optimizer import optimize_by_onnxruntime, optimize_model +from fusion_options import FusionOptions # noqa: E402 +from onnx_model_clip import ClipOnnxModel # noqa: E402 +from onnx_model_unet import UnetOnnxModel # noqa: E402 +from onnx_model_vae import VaeOnnxModel # noqa: E402 +from optimizer import optimize_by_onnxruntime, optimize_model # noqa: E402 logger = logging.getLogger(__name__) diff --git a/onnxruntime/python/tools/transformers/models/t5/past_helper.py b/onnxruntime/python/tools/transformers/models/t5/past_helper.py index fe113491067fd..60fe27174ccc8 100644 --- a/onnxruntime/python/tools/transformers/models/t5/past_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/past_helper.py @@ -38,7 +38,7 @@ def group_by_self_or_cross(present_key_values): """ present_self = [] present_cross = [] - for i, present_layer_i in enumerate(present_key_values): + for _i, present_layer_i in enumerate(present_key_values): assert len(present_layer_i) == 4, f"Expected to have four items. 
Got {len(present_layer_i)}" ( present_key_self, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py index bfa14d67c3d01..7bf3ec3adfdc3 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_decoder.py @@ -93,7 +93,6 @@ def __init__(self, decoder, lm_head, config): self.config = config def forward(self, decoder_input_ids, encoder_attention_mask, *past): - past_key_values = PastKeyValuesHelper.group_by_layer(past, self.config.num_layers) # This is a hack since only the third dimension of encoder_hidden_states is used here @@ -154,7 +153,6 @@ def create_dummy( Returns: T5DecoderInputs: dummy inputs for decoder """ - hidden_size: int = config.d_model num_attention_heads: int = config.num_heads num_layers: int = config.num_layers vocab_size: int = config.vocab_size @@ -263,7 +261,7 @@ def export_onnx( input_past_names = past_names if isinstance(decoder, T5Decoder) else [] output_present_names = present_self_names if isinstance(decoder, T5Decoder) else present_names - output_names = ["logits"] + output_present_names + output_names = ["logits", *output_present_names] # Shape of input tensors (sequence_length==1): # input_ids: (batch_size, sequence_length) diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py index 5f7f4339aea6f..e3d600981ef0e 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_encoder_decoder_init.py @@ -127,7 +127,7 @@ def export_onnx( present_names = PastKeyValuesHelper.get_past_names(model.config.num_layers, present=True) - output_names = ["logits", "encoder_hidden_states"] + present_names + output_names = ["logits", "encoder_hidden_states", *present_names] # Shape of input tensors (sequence_length==1): # input_ids: (batch_size, sequence_length) @@ -255,7 +255,7 @@ def verify_onnx( test_cases = [(4, 11), (1, 2), (3, 1), (8, 5)] test_cases_max_diff = [] - for (batch_size, encode_sequence_length) in test_cases[:max_cases]: + for batch_size, encode_sequence_length in test_cases[:max_cases]: inputs = T5EncoderDecoderInitInputs.create_dummy( model.config, batch_size, diff --git a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py index 17ea255386f99..4abf45ed1ea88 100644 --- a/onnxruntime/python/tools/transformers/models/t5/t5_helper.py +++ b/onnxruntime/python/tools/transformers/models/t5/t5_helper.py @@ -19,9 +19,9 @@ from onnxruntime import InferenceSession sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) -from float16 import float_to_float16_max_diff -from onnx_model import OnnxModel -from optimizer import optimize_model +from float16 import float_to_float16_max_diff # noqa: E402 +from onnx_model import OnnxModel # noqa: E402 +from optimizer import optimize_model # noqa: E402 logger = logging.getLogger(__name__) @@ -150,7 +150,7 @@ def export_onnx( @staticmethod def auto_mixed_precision( onnx_model: OnnxModel, - op_block_list: List[str] = [ + op_block_list: List[str] = [ # noqa: B006 "SimplifiedLayerNormalization", "SkipSimplifiedLayerNormalization", "Relu", @@ -165,7 +165,7 @@ def auto_mixed_precision( Returns: parameters(dict): a dictionary of parameters used in float16 conversion """ - op_full_set = set([node.op_type for 
node in onnx_model.nodes()]) + op_full_set = {node.op_type for node in onnx_model.nodes()} fp32_op_set = set(op_block_list) fp16_op_set = op_full_set.difference(fp32_op_set) logger.info(f"fp32 op: {fp32_op_set} fp16 op: {fp16_op_set}") diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index c4dda99496ebe..801fdb080e5c4 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -19,7 +19,7 @@ from transformers import AutoConfig, AutoTokenizer, LxmertConfig, TransfoXLConfig sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) -from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState +from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState # noqa: E402 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" @@ -95,7 +95,7 @@ def update_flatten_list(inputs, res_list): def build_dynamic_axes(example_inputs, outputs_flatten): sequence_length = example_inputs["input_ids"].shape[-1] - dynamic_axes = {key: {0: "batch_size", 1: "seq_len"} for key in example_inputs.keys()} + dynamic_axes = {key: {0: "batch_size", 1: "seq_len"} for key in example_inputs} output_names = ["output_" + str(i + 1) for i in range(len(outputs_flatten))] for i, output_name in enumerate(output_names): @@ -172,7 +172,7 @@ def get_onnx_file_path( filename = f"{normalized_model_name}_{input_count}_{precision}_{device}" if optimized_by_onnxruntime: - filename += f"_ort" + filename += "_ort" directory = onnx_dir # ONNXRuntime will not write external data so the raw and optimized models shall be in same directory. @@ -236,9 +236,9 @@ def optimize_onnx_model( if optimization_options is None: optimization_options = FusionOptions(model_type) optimization_options.use_raw_attention_mask(use_raw_attention_mask) - if Precision.FLOAT16 == precision: + if precision == Precision.FLOAT16: optimization_options.enable_gelu_approximation = True - if Precision.INT8 == precision: + if precision == Precision.INT8: optimization_options.enable_embed_layer_norm = False # Use script to optimize model. 
@@ -259,7 +259,7 @@ def optimize_onnx_model( model_fusion_statistics[optimized_model_path] = opt_model.get_fused_operator_statistics() - if Precision.FLOAT16 == precision: + if precision == Precision.FLOAT16: opt_model.convert_float_to_float16(keep_io_types=True) opt_model.save_model_to_file(optimized_model_path, use_external_data_format) @@ -268,7 +268,7 @@ def optimize_onnx_model( def modelclass_dispatcher(model_name, custom_model_class): - if custom_model_class != None: + if custom_model_class is not None: if custom_model_class in MODEL_CLASSES: return custom_model_class else: @@ -279,11 +279,11 @@ def modelclass_dispatcher(model_name, custom_model_class): import re - if re.search("-squad$", model_name) != None: + if re.search("-squad$", model_name) is not None: return "AutoModelForQuestionAnswering" - elif re.search("-mprc$", model_name) != None: + elif re.search("-mprc$", model_name) is not None: return "AutoModelForSequenceClassification" - elif re.search("gpt2", model_name) != None: + elif re.search("gpt2", model_name) is not None: return "AutoModelWithLMHead" return "AutoModel" @@ -461,7 +461,6 @@ def export_onnx_model_from_pt( model_fusion_statistics, fusion_options, ): - config, model = load_pt_model(model_name, model_class, cache_dir, config_modifier) # config, model = load_pt_model_from_tf(model_name) model.cpu() @@ -495,7 +494,7 @@ def export_onnx_model_from_pt( ) if overwrite or not os.path.exists(onnx_model_path): - logger.info("Exporting ONNX model to {}".format(onnx_model_path)) + logger.info(f"Exporting ONNX model to {onnx_model_path}") Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) @@ -600,7 +599,7 @@ def export_onnx_model_from_tf( # Use no past state for these models if config.use_cache: config.use_cache = False - except: + except Exception: pass example_outputs = model(example_inputs, training=False) @@ -629,7 +628,7 @@ def export_onnx_model_from_tf( tf_internal_model_path = onnx_model_path[:-5] if use_external_data_format else onnx_model_path if overwrite or not os.path.exists(tf_internal_model_path): - logger.info("Exporting ONNX model to {}".format(onnx_model_path)) + logger.info(f"Exporting ONNX model to {onnx_model_path}") if not use_external_data_format: Path(tf_internal_model_path).parent.mkdir(parents=True, exist_ok=True) diff --git a/onnxruntime/python/tools/transformers/onnx_model.py b/onnxruntime/python/tools/transformers/onnx_model.py index 7cfc6d355c7a3..aab2358e2ba68 100644 --- a/onnxruntime/python/tools/transformers/onnx_model.py +++ b/onnxruntime/python/tools/transformers/onnx_model.py @@ -31,7 +31,7 @@ def initialize(self, model): def disable_shape_inference(self): self.enable_shape_infer = False - def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): + def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): # noqa: B006 if self.enable_shape_infer: if self.shape_infer_helper is None or update: self.shape_infer_helper = SymbolicShapeInferenceHelper(self.model) @@ -39,7 +39,7 @@ def infer_runtime_shape(self, dynamic_axis_mapping={}, update=False): try: if self.shape_infer_helper.infer(dynamic_axis_mapping): return self.shape_infer_helper - except: # noqa + except Exception: self.enable_shape_infer = False # disable shape inference to suppress same error message. 
print("failed in shape inference", sys.exc_info()[0]) @@ -243,7 +243,7 @@ def get_parent(self, node, i, output_name_to_node=None): return output_name_to_node[input] - def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]): + def match_first_parent(self, node, parent_op_type, output_name_to_node, exclude=[]): # noqa: B006 """ Find parent node based on constraints on op_type. @@ -272,7 +272,7 @@ def match_parent( parent_op_type, input_index=None, output_name_to_node=None, - exclude=[], + exclude=[], # noqa: B006 return_indice=None, ): """ @@ -318,7 +318,7 @@ def match_parent( def match_parent_paths(self, node, paths, output_name_to_node): for i, path in enumerate(paths): - assert isinstance(path, List) or isinstance(path, Tuple) + assert isinstance(path, (List, Tuple)) return_indice = [] matched = self.match_parent_path(node, path[0], path[1], output_name_to_node, return_indice) if matched: diff --git a/onnxruntime/python/tools/transformers/onnx_model_bart.py b/onnxruntime/python/tools/transformers/onnx_model_bart.py index 33db231c52332..58c726cf7884d 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bart.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bart.py @@ -97,7 +97,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return other_inputs = [] - for i, input in enumerate(normalize_node.input): + for _i, input in enumerate(normalize_node.input): if input not in output_name_to_node: continue if input == qkv_nodes[0].output[0]: @@ -159,7 +159,6 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_v.input[0] == root_input: - mask_nodes = [] mask_index = None attention_last_node = reshape_qkv_2 diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert.py b/onnxruntime/python/tools/transformers/onnx_model_bert.py index ac7d5a6df0804..c8288b4b157f0 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert.py @@ -33,7 +33,7 @@ class BertOptimizationOptions(FusionOptions): """This class is deprecated""" def __init__(self, model_type): - logger.warning(f"BertOptimizationOptions is depreciated. Please use FusionOptions instead.") + logger.warning("BertOptimizationOptions is depreciated. Please use FusionOptions instead.") super().__init__(model_type) @@ -235,7 +235,6 @@ def use_dynamic_axes(self, dynamic_batch_dim="batch_size", dynamic_seq_len="max_ casted=True ) + self.get_graph_inputs_from_fused_nodes(casted=False) - dynamic_batch_inputs = {} for input in self.model.graph.input: if input.name in bert_graph_inputs: dim_proto = input.type.tensor_type.shape.dim[0] @@ -324,7 +323,7 @@ def clean_graph(self): if parent_nodes is not None: ( cast, - constantOfShape, + constantOfShape, # noqa: N806 concat, unsqueeze, gather, diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py index 33bb1d66a7528..a85e0cc2ba3f7 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py @@ -3,14 +3,14 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -import argparse +import argparse # noqa: F401 import logging -import sys -from collections import deque +import sys # noqa: F401 +from collections import deque # noqa: F401 -import numpy as np +import numpy as np # noqa: F401 import onnx -from onnx import ModelProto, TensorProto, numpy_helper +from onnx import ModelProto, TensorProto, numpy_helper # noqa: F401 from onnx_model_bert_tf import BertOnnxModelTF logger = logging.getLogger(__name__) @@ -61,7 +61,7 @@ def check_attention_input(self, matmul_q, matmul_k, matmul_v, parent, output_nam return True, reshape_nodes def fuse_attention(self): - input_name_to_nodes = self.input_name_to_nodes() + self.input_name_to_nodes() output_name_to_node = self.output_name_to_node() nodes_to_remove = [] @@ -81,14 +81,10 @@ def fuse_attention(self): "SkipLayerNormalization", "EmbedLayerNormalization", ]: - logger.debug( - "First input for skiplayernorm: {}".format(parent.op_type if parent is not None else None) - ) + logger.debug(f"First input for skiplayernorm: {parent.op_type if parent is not None else None}") continue else: - logger.debug( - "First input for skiplayernorm: {}".format(parent.op_type if parent is not None else None) - ) + logger.debug(f"First input for skiplayernorm: {parent.op_type if parent is not None else None}") continue else: # TODO: shall we add back the checking of children op types. @@ -227,11 +223,8 @@ def preprocess(self): self.skip_reshape() def skip_reshape(self): - input_name_to_nodes = self.input_name_to_nodes() - output_name_to_node = self.output_name_to_node() - - nodes_to_remove = [] - attention_count = 0 + self.input_name_to_nodes() + self.output_name_to_node() count = 0 reshape_nodes = self.get_nodes_by_op_type("Reshape") @@ -261,10 +254,10 @@ def fuse_embedding(self, node, output_name_to_node): temp = numpy_helper.to_array(word_initializer) if len(temp.shape) == 2: - logger.info("Found word embedding. name:{}, shape:{}".format(word_initializer.name, temp.shape)) + logger.info(f"Found word embedding. name:{word_initializer.name}, shape:{temp.shape}") word_embedding = word_initializer.name else: - logger.info("Failed to find word embedding. name:{}, shape:{}".format(word_initializer.name, temp.shape)) + logger.info(f"Failed to find word embedding. name:{word_initializer.name}, shape:{temp.shape}") return False pos_initializer = self.get_initializer(add_node.input[1]) @@ -273,12 +266,10 @@ def fuse_embedding(self, node, output_name_to_node): if len(temp.shape) == 3 and temp.shape[0] == 1: tensor = numpy_helper.from_array(temp.reshape((temp.shape[1], temp.shape[2])), "position_embedding") self.add_initializer(tensor) - logger.info("Found position embedding. name:{}, shape:{}".format(pos_initializer.name, temp.shape[1:])) + logger.info(f"Found position embedding. name:{pos_initializer.name}, shape:{temp.shape[1:]}") position_embedding = "position_embedding" else: - logger.info( - "Failed to find position embedding. name:{}, shape:{}".format(pos_initializer.name, temp.shape) - ) + logger.info(f"Failed to find position embedding. name:{pos_initializer.name}, shape:{temp.shape}") return False else: pos_embed_path = self.match_parent_path(add_node, ["Gather", "Slice"], [1, 1], output_name_to_node) @@ -294,12 +285,10 @@ def fuse_embedding(self, node, output_name_to_node): temp = numpy_helper.to_array(pos_initializer) if len(temp.shape) == 2: - logger.info("Found word embedding. 
name:{}, shape:{}".format(pos_initializer.name, temp.shape)) + logger.info(f"Found word embedding. name:{pos_initializer.name}, shape:{temp.shape}") position_embedding = pos_initializer.name else: - logger.info( - "Failed to find position embedding. name:{}, shape:{}".format(pos_initializer.name, temp.shape) - ) + logger.info(f"Failed to find position embedding. name:{pos_initializer.name}, shape:{temp.shape}") return False gather = self.get_parent(skip_node, 1, output_name_to_node) @@ -314,12 +303,10 @@ def fuse_embedding(self, node, output_name_to_node): temp = numpy_helper.to_array(segment_initializer) if len(temp.shape) == 2: - logger.info("Found segment embedding. name:{}, shape:{}".format(segment_initializer.name, temp.shape)) + logger.info(f"Found segment embedding. name:{segment_initializer.name}, shape:{temp.shape}") segment_embedding = segment_initializer.name else: - logger.info( - "Failed to find segment embedding. name:{}, shape:{}".format(segment_initializer.name, temp.shape) - ) + logger.info(f"Failed to find segment embedding. name:{segment_initializer.name}, shape:{temp.shape}") return False logger.info("Create Embedding node") @@ -349,7 +336,7 @@ def fuse_mask(self): mask_input_name = self.attention_mask.get_first_mask() if unsqueeze_node.input[0] != mask_input_name: - print("Cast input {} is not mask input {}".format(unsqueeze_node.input[0], mask_input_name)) + print(f"Cast input {unsqueeze_node.input[0]} is not mask input {mask_input_name}") continue unsqueeze_added_1 = onnx.helper.make_node( diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py index 7455777273846..d9538a2ec4838 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py @@ -3,14 +3,14 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import argparse +import argparse # noqa: F401 import logging -import sys -from collections import deque +import sys # noqa: F401 +from collections import deque # noqa: F401 import numpy as np import onnx -from onnx import ModelProto, TensorProto, helper, numpy_helper +from onnx import ModelProto, TensorProto, helper, numpy_helper # noqa: F401 from onnx_model_bert import BertOnnxModel logger = logging.getLogger(__name__) @@ -303,10 +303,10 @@ def process_embedding(self): temp = numpy_helper.to_array(initializer) if len(temp.shape) == 2: - logger.info("Found position embedding. name:{}, shape:{}".format(initializer.name, temp.shape)) + logger.info(f"Found position embedding. name:{initializer.name}, shape:{temp.shape}") position_embedding = initializer.name else: - logger.info("Failed to find position embedding. name:{}, shape:{}".format(initializer.name, temp.shape)) + logger.info(f"Failed to find position embedding. name:{initializer.name}, shape:{temp.shape}") return first_parent = self.get_parent(add_node, 0, output_name_to_node) @@ -314,7 +314,7 @@ def process_embedding(self): embeddings = self.get_2d_initializers_from_parent_subgraphs(first_parent) if len(embeddings) != 2: logger.warning( - "Failed to find two embeddings (word and segment) from Add node. Found {}".format(embeddings) + f"Failed to find two embeddings (word and segment) from Add node. Found {embeddings}" ) return @@ -323,10 +323,10 @@ def process_embedding(self): for name, shape in embeddings.items(): if shape[0] == 2: segment_embedding = name - logger.info("Found segment embedding. 
name:{}, shape:{}".format(name, shape)) + logger.info(f"Found segment embedding. name:{name}, shape:{shape}") else: word_embedding = name - logger.info("Found words embedding. name:{}, shape:{}".format(name, shape)) + logger.info(f"Found words embedding. name:{name}, shape:{shape}") if word_embedding is None or segment_embedding is None: logger.info("Failed to find both word and segment embedding") diff --git a/onnxruntime/python/tools/transformers/onnx_model_gpt2.py b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py index 92197e7e4f09f..263857ffbc130 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_gpt2.py +++ b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py @@ -31,7 +31,7 @@ def postprocess(self): """ Remove extra reshape nodes. """ - logger.debug(f"start postprocessing...") + logger.debug("start postprocessing...") input_name_to_nodes = self.input_name_to_nodes() output_name_to_node = self.output_name_to_node() @@ -42,7 +42,6 @@ def postprocess(self): gemm_node, "Reshape", input_name_to_nodes, recursive=False ) - return_indice = [] nodes = self.match_parent_path(gemm_node, ["Reshape", "FastGelu"], [0, 0], output_name_to_node) if nodes is None: nodes = self.match_parent_path( diff --git a/onnxruntime/python/tools/transformers/onnx_model_t5.py b/onnxruntime/python/tools/transformers/onnx_model_t5.py index 0a1c62da59522..a0819612b7df3 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_t5.py +++ b/onnxruntime/python/tools/transformers/onnx_model_t5.py @@ -54,7 +54,6 @@ def create_mha_node( num_heads: int, hidden_size: int, ) -> Union[NodeProto, None]: - assert num_heads > 0 if hidden_size > 0 and (hidden_size % num_heads) != 0: @@ -587,7 +586,7 @@ def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict): return pow_node = sim_ln_nodes[-2] - if not self.model.find_constant_input(pow_node, 2.0) == 1: + if self.model.find_constant_input(pow_node, 2.0) != 1: return root_input = pow_node.input[0] diff --git a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py index 85e510a828990..d1815394e9661 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_tnlr.py +++ b/onnxruntime/python/tools/transformers/onnx_model_tnlr.py @@ -5,6 +5,7 @@ import logging from typing import Union +import numpy as np from fusion_attention import AttentionMask, FusionAttention from fusion_utils import NumpyHelper from onnx import NodeProto, TensorProto, helper, numpy_helper @@ -40,7 +41,6 @@ def create_attention_node( output: str, add_qk_str: str, ) -> Union[NodeProto, None]: - assert num_heads > 0 if hidden_size > 0 and (hidden_size % num_heads) != 0: logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") @@ -123,7 +123,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return other_inputs = [] - for i, input in enumerate(start_node.input): + for _i, input in enumerate(start_node.input): if input not in output_name_to_node: continue diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 056f6abdf301f..8614b18ee11f0 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -64,7 +64,7 @@ def optimize_by_onnxruntime( use_gpu: bool = False, optimized_model_path: Optional[str] = None, opt_level: Optional[int] = 99, - disabled_optimizers=[], + disabled_optimizers=[], # noqa: B006 verbose=False, ) 
-> str: """ @@ -113,9 +113,7 @@ def optimize_by_onnxruntime( kwargs["disabled_optimizers"] = disabled_optimizers if not use_gpu: - session = onnxruntime.InferenceSession( - onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs - ) + onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=["CPUExecutionProvider"], **kwargs) else: gpu_ep = [] @@ -124,7 +122,7 @@ def optimize_by_onnxruntime( elif torch_version.hip: gpu_ep.append("MIGraphXExecutionProvider") gpu_ep.append("ROCMExecutionProvider") - session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=gpu_ep, **kwargs) + onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=gpu_ep, **kwargs) assert not set(onnxruntime.get_available_providers()).isdisjoint( ["CUDAExecutionProvider", "ROCMExecutionProvider", "MIGraphXExecutionProvider"] ) @@ -296,7 +294,7 @@ def optimize_model( # Remove the temporary model. if temp_model_path: os.remove(temp_model_path) - logger.debug("Remove temporary model: {}".format(temp_model_path)) + logger.debug(f"Remove temporary model: {temp_model_path}") return optimizer diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py index 9f41654af3533..fc2417ea897c3 100644 --- a/onnxruntime/python/tools/transformers/profiler.py +++ b/onnxruntime/python/tools/transformers/profiler.py @@ -179,7 +179,7 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n def load_profile_json(profile_file): print(f"loading profile output {profile_file} ...") - with open(profile_file, "r") as opened_file: + with open(profile_file) as opened_file: sess_time = json.load(opened_file) assert isinstance(sess_time, list) @@ -256,7 +256,7 @@ def parse_kernel_results(sess_time, threshold=0): else: op_time[op_name] = duration - lines.append(f"\nGroup kernel time by operator:") + lines.append("\nGroup kernel time by operator:") lines.append("-" * 64) lines.append("Total(μs)\tTime%\tOperator") for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True): diff --git a/onnxruntime/python/tools/transformers/quantize_helper.py b/onnxruntime/python/tools/transformers/quantize_helper.py index d7e9eb9718a9e..a449e881ad361 100644 --- a/onnxruntime/python/tools/transformers/quantize_helper.py +++ b/onnxruntime/python/tools/transformers/quantize_helper.py @@ -7,7 +7,7 @@ import logging import os -import onnx +import onnx # noqa: F401 import torch from transformers.modeling_utils import Conv1D diff --git a/onnxruntime/python/tools/transformers/shape_infer_helper.py b/onnxruntime/python/tools/transformers/shape_infer_helper.py index e877497ffb1cb..f8a5464d8af78 100644 --- a/onnxruntime/python/tools/transformers/shape_infer_helper.py +++ b/onnxruntime/python/tools/transformers/shape_infer_helper.py @@ -15,7 +15,7 @@ else: sys.path.append(os.path.join(file_path, "..")) -from symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy +from symbolic_shape_infer import SymbolicShapeInference, get_shape_from_type_proto, sympy # noqa: E402 logger = logging.getLogger(__name__) diff --git a/onnxruntime/python/tools/transformers/shape_optimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py index 7174af0ac9ba0..bf507a0d8a0a3 100644 --- a/onnxruntime/python/tools/transformers/shape_optimizer.py +++ b/onnxruntime/python/tools/transformers/shape_optimizer.py @@ -10,12 +10,12 @@ import argparse import logging import os -import re +import re # noqa: F401 
import sys import tempfile -from collections import deque +from collections import deque # noqa: F401 from datetime import datetime -from pathlib import Path +from pathlib import Path # noqa: F401 from typing import List import numpy as np @@ -73,7 +73,7 @@ def get_reshape_shape_inputs(self): """ Returns a list of shape input names of Reshape nodes. """ - output_name_to_node = self.output_name_to_node() + self.output_name_to_node() shape_inputs = [] for node in self.model.graph.node: @@ -107,7 +107,6 @@ def add_extra_graph_output(self, extra_outputs): names_to_evaluate = [] output_names = [output.name for output in self.model.graph.output] for name in extra_outputs: - if self.get_initializer(name) is not None: # already a constant continue names_to_evaluate.append(name) @@ -272,13 +271,13 @@ def update_target_shape(self, shapes, shape_input, input_shape, verbose): def validate_input(self, input: str): if not self.find_graph_input(input): valid_names = [input.name for input in self.model.graph.input] - raise Exception("Input {} does not exist in the graph inputs: {}".format(input, valid_names)) + raise Exception(f"Input {input} does not exist in the graph inputs: {valid_names}") def validate_outputs(self, output_names: List[str]): valid_names = [output.name for output in self.model.graph.output] for name in output_names: if name not in valid_names: - raise Exception("Output {} does not exist in the graph outputs: {}".format(name, valid_names)) + raise Exception(f"Output {name} does not exist in the graph outputs: {valid_names}") def optimize( self, diff --git a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py index 119455684cea1..f3e67930adbff 100644 --- a/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py +++ b/onnxruntime/python/tools/transformers/torch_onnx_export_helper.py @@ -6,7 +6,7 @@ import torch TrainingMode = torch.onnx.TrainingMode -from packaging.version import Version +from packaging.version import Version # noqa: E402 def torch_onnx_export( diff --git a/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py b/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py index f1b8e03ace977..7d5716b85db30 100644 --- a/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py +++ b/onnxruntime/python/torch_cpp_extensions/ort_torch_ext/__init__.py @@ -1,7 +1,7 @@ import threading from functools import wraps -import torch +import torch # noqa: F401 from onnxruntime.capi import _pybind_state as _C diff --git a/onnxruntime/test/contrib_ops/attention_lstm_data_gen.py b/onnxruntime/test/contrib_ops/attention_lstm_data_gen.py index 424fadd3ac9c6..95b82df86b0fe 100644 --- a/onnxruntime/test/contrib_ops/attention_lstm_data_gen.py +++ b/onnxruntime/test/contrib_ops/attention_lstm_data_gen.py @@ -3,12 +3,12 @@ import tensorflow as tf -batchSize = 2 +batchSize = 2 # noqa: N816 -memMaxStep = 3 -memDepth = 3 -queryMaxStep = 4 -queryDepth = 3 +memMaxStep = 3 # noqa: N816 +memDepth = 3 # noqa: N816 +queryMaxStep = 4 # noqa: N816 +queryDepth = 3 # noqa: N816 am_attn_size: int = 2 cell_hidden_size = 3 aw_attn_size: int = 2 @@ -50,7 +50,7 @@ ), ) - querySeqLen = tf.Variable( + querySeqLen = tf.Variable( # noqa: N816 tf.constant([queryMaxStep - 1, queryMaxStep - 2], tf.int32), name="query_seq_len", ) @@ -82,7 +82,7 @@ ), ) - memSeqLen = tf.Variable(tf.constant([memMaxStep, memMaxStep - 1], dtype=tf.int32), name="mem_seq_len") + memSeqLen = tf.Variable(tf.constant([memMaxStep, 
memMaxStep - 1], dtype=tf.int32), name="mem_seq_len") # noqa: N816 with tf.variable_scope("fwBahdanau"): fw_mem_layer_weights = tf.get_variable( diff --git a/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py b/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py index fcac5b3ab28eb..a99f215149f4f 100644 --- a/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py +++ b/onnxruntime/test/contrib_ops/multihead_attention_op_test_data_gen.py @@ -24,7 +24,7 @@ def __init__( v_head_size, is_decoder: bool, ): - super(Attention, self).__init__() + super().__init__() self.num_attention_heads = num_attention_heads self.qk_head_size = qk_head_size self.v_head_size = v_head_size @@ -142,7 +142,7 @@ def forward( outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) if self.is_decoder: - outputs = outputs + (past_key_value,) + outputs = (*outputs, past_key_value) return outputs diff --git a/onnxruntime/test/onnx/gen_test_models.py b/onnxruntime/test/onnx/gen_test_models.py index 509c27ec4efea..3fda6aa8417b0 100644 --- a/onnxruntime/test/onnx/gen_test_models.py +++ b/onnxruntime/test/onnx/gen_test_models.py @@ -7,7 +7,7 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper, numpy_helper, utils +from onnx import AttributeProto, GraphProto, TensorProto, helper, numpy_helper, utils # noqa: F401 def parse_arguments(): @@ -47,8 +47,8 @@ def generate_abs_op_test(type, X, top_test_folder): data_dir = os.path.join(test_folder, "test_data_0") os.makedirs(data_dir, exist_ok=True) # Create one output (ValueInfoProto) - Y = helper.make_tensor_value_info("Y", type, X.shape) - X_INFO = helper.make_tensor_value_info("X", type, X.shape) + Y = helper.make_tensor_value_info("Y", type, X.shape) # noqa: N806 + X_INFO = helper.make_tensor_value_info("X", type, X.shape) # noqa: N806 if is_raw: tensor_x = onnx.helper.make_tensor(name="X", data_type=type, dims=X.shape, vals=X.tobytes(), raw=True) else: @@ -75,8 +75,8 @@ def generate_size_op_test(type, X, test_folder): data_dir = os.path.join(test_folder, "test_data_0") os.makedirs(data_dir, exist_ok=True) # Create one output (ValueInfoProto) - Y = helper.make_tensor_value_info("Y", TensorProto.INT64, []) - X_INFO = helper.make_tensor_value_info("X", type, X.shape) + Y = helper.make_tensor_value_info("Y", TensorProto.INT64, []) # noqa: N806 + X_INFO = helper.make_tensor_value_info("X", type, X.shape) # noqa: N806 tensor_x = onnx.helper.make_tensor(name="X", data_type=type, dims=X.shape, vals=X.ravel(), raw=False) # Create a node (NodeProto) node_def = helper.make_node("Size", inputs=["X"], outputs=["Y"]) @@ -98,8 +98,8 @@ def generate_reducesum_op_test(X, test_folder): data_dir = os.path.join(test_folder, "test_data_0") os.makedirs(data_dir, exist_ok=True) # Create one output (ValueInfoProto) - Y = helper.make_tensor_value_info("Y", type, []) - X_INFO = helper.make_tensor_value_info("X", type, X.shape) + Y = helper.make_tensor_value_info("Y", type, []) # noqa: N806 + X_INFO = helper.make_tensor_value_info("X", type, X.shape) # noqa: N806 tensor_x = onnx.helper.make_tensor(name="X", data_type=type, dims=X.shape, vals=X.ravel(), raw=False) # Create a node (NodeProto) node_def = helper.make_node("ReduceSum", inputs=["X"], outputs=["Y"], keepdims=0) diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py index 235b4111bbcb0..727351cae84ac 100644 --- 
a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py @@ -1,12 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import os - import numpy as np -def TestReduction(op, data, axes, keepdims): +def TestReduction(op, data, axes, keepdims): # noqa: N802 if op == "ReduceL1": return np.sum(a=np.abs(data), axis=axes, keepdims=keepdims) elif op == "ReduceL2": @@ -41,7 +39,7 @@ def TestReduction(op, data, axes, keepdims): return res -def PrintResult(op, axes, keepdims, res): +def PrintResult(op, axes, keepdims, res): # noqa: N802 print(' {"%s",' % op) print("OpAttributesResult(") print(" // ReductionAttribute") @@ -67,7 +65,7 @@ def PrintResult(op, axes, keepdims, res): print("})},") -def PrintDisableOptimizations(): +def PrintDisableOptimizations(): # noqa: N802 print("// Optimizations are disabled in this file to improve build throughput") print("#if defined(_MSC_VER) || defined(__INTEL_COMPILER)") print('#pragma optimize ("", off)') @@ -81,7 +79,7 @@ def PrintDisableOptimizations(): print("#endif") -def PrintReenableOptimizations(): +def PrintReenableOptimizations(): # noqa: N802 print("#if defined(_MSC_VER) || defined(__INTEL_COMPILER)") print('\t#pragma optimize ("", on)') print("#elif defined(__GNUC__)") diff --git a/onnxruntime/test/providers/cpu/rnn/GRU.py b/onnxruntime/test/providers/cpu/rnn/GRU.py index 3fee29e9928f0..846fc3d06b9a9 100644 --- a/onnxruntime/test/providers/cpu/rnn/GRU.py +++ b/onnxruntime/test/providers/cpu/rnn/GRU.py @@ -18,7 +18,7 @@ def print_results(Y): print("*************************") -class GRU_Helper: +class GRU_Helper: # noqa: N801 def __init__(self, **params): # Match the ONNXRuntime/CNTK behavior # If False use the python from the ONNX spec @@ -26,28 +26,28 @@ def __init__(self, **params): required_inputs = ["X", "W", "R"] for i in required_inputs: - assert i in params, "Missing Required Input: {0}".format(i) + assert i in params, f"Missing Required Input: {i}" num_directions = params["W"].shape[0] - sequence_length = params["X"].shape[0] + params["X"].shape[0] hidden_size = params["R"].shape[-1] batch_size = params["X"].shape[1] - X = params["X"] - W = params["W"] - R = params["R"] - B = ( + X = params["X"] # noqa: N806 + W = params["W"] # noqa: N806 + R = params["R"] # noqa: N806 + B = ( # noqa: N806 params["B"] if "B" in params else np.zeros(num_directions * 6 * hidden_size).reshape(num_directions, 6 * hidden_size) ) - H_0 = ( + H_0 = ( # noqa: N806 params["initial_h"] if "initial_h" in params else np.zeros((num_directions, batch_size, hidden_size)).reshape(num_directions, batch_size, hidden_size) ) - LBR = params["linear_before_reset"] if "linear_before_reset" in params else 0 + LBR = params["linear_before_reset"] if "linear_before_reset" in params else 0 # noqa: N806 self.direction = params["direction"] if "direction" in params else "forward" if num_directions == 1: @@ -61,16 +61,15 @@ def __init__(self, **params): else: # split the inputs which have per direction rows - Wfw, Wbw = np.vsplit(W, 2) - Rfw, Rbw = np.vsplit(R, 2) - Bfw, Bbw = np.vsplit(B, 2) - H_0fw, H_0bw = np.vsplit(H_0, 2) + Wfw, Wbw = np.vsplit(W, 2) # noqa: N806 + Rfw, Rbw = np.vsplit(R, 2) # noqa: N806 + Bfw, Bbw = np.vsplit(B, 2) # noqa: N806 + H_0fw, H_0bw = np.vsplit(H_0, 2) # noqa: N806 self.one = OneDirectionGRU(X, Wfw, Rfw, Bfw, H_0fw, LBR) self.two = OneDirectionGRU(np.flip(X, 0), Wbw, Rbw, Bbw, H_0bw, LBR) def run(self): - if 
self.direction == "bidirectional": f_output = self.one.execute() r_output = self.two.execute() @@ -101,7 +100,6 @@ def run(self): class OneDirectionGRU: def __init__(self, X, W, R, B, initial_h, LBR): - self.X = X # remove num_directions axis for W, R, B, H_0 self.W = np.squeeze(W, axis=0) @@ -138,7 +136,7 @@ def execute(self): # print_with_shape("r_br", r_br) # print_with_shape("r_bh", r_bh) - seq_len = self.X.shape[0] + self.X.shape[0] num_directions = 1 hidden_size = self.R.shape[-1] batch_size = self.X.shape[1] @@ -157,7 +155,7 @@ def execute(self): print_with_shape("r", r) print_with_shape("h", h) - H = (1 - z) * h + z * self.H_0 + H = (1 - z) * h + z * self.H_0 # noqa: N806 print_with_shape("H", H) output = np.append(output, H.reshape(1, 1, batch_size, hidden_size), axis=0) @@ -169,11 +167,10 @@ def execute(self): class ONNXRuntimeTestContext: @staticmethod - def OneDirectionWeights(): - + def OneDirectionWeights(): # noqa: N802 hidden_size = 2 - W = np.array( + W = np.array( # noqa: N806 [ [ [-0.494659, 0.0453352], # Wz @@ -186,7 +183,7 @@ def OneDirectionWeights(): ] ).astype(np.float32) - R = np.array( + R = np.array( # noqa: N806 [ [ [0.146626, -0.0620289], # Rz @@ -199,34 +196,43 @@ def OneDirectionWeights(): ] ).astype(np.float32) - W_B = np.array([[0.381619, 0.0323954, -0.258721, 0.45056, -0.250755, 0.0967895,]]).astype( # Wbz # Wbr + W_B = np.array( # noqa: N806 + [ + [ + 0.381619, + 0.0323954, + -0.258721, + 0.45056, + -0.250755, + 0.0967895, + ] + ] + ).astype( # Wbz # Wbr np.float32 ) # Wbh - R_B = np.zeros((1, 3 * hidden_size)).astype(np.float32) - B = np.concatenate((W_B, R_B), axis=1) + R_B = np.zeros((1, 3 * hidden_size)).astype(np.float32) # noqa: N806 + B = np.concatenate((W_B, R_B), axis=1) # noqa: N806 return W, R, B @staticmethod - def BidirectionalWeights(): - - W1, R1, B1 = ONNXRuntimeTestContext.OneDirectionWeights() + def BidirectionalWeights(): # noqa: N802 + W1, R1, B1 = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 hidden_size = R1.shape[-1] input_size = W1.shape[-1] - W = np.tile(W1, (2, 1)).reshape(2, 3 * hidden_size, input_size) - R = np.tile(R1, (2, 1)).reshape(2, 3 * hidden_size, hidden_size) - B = np.tile(B1, (2, 1)) + W = np.tile(W1, (2, 1)).reshape(2, 3 * hidden_size, input_size) # noqa: N806 + R = np.tile(R1, (2, 1)).reshape(2, 3 * hidden_size, hidden_size) # noqa: N806 + B = np.tile(B1, (2, 1)) # noqa: N806 return W, R, B # replicate ONNXRuntime unit tests inputs to validate output -class GRU_ONNXRuntimeUnitTests: +class GRU_ONNXRuntimeUnitTests: # noqa: N801 @staticmethod - def ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows(): - + def ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows(): # noqa: N802 print(GRU_ONNXRuntimeUnitTests.ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows.__name__) seq_length = 2 @@ -235,46 +241,44 @@ def ForwardDefaultActivationsSimpleWeightsNoBiasTwoRows(): hidden_size = 3 input = np.array([1.0, 2.0, 10.0, 11.0]).astype(np.float32).reshape(seq_length, batch_size, input_size) - W = np.array([0.1, 0.2, 0.3, 1, 2, 3, 10, 11, 12]).astype(np.float32).reshape(1, 3 * hidden_size, input_size) + W = ( # noqa: N806 + np.array([0.1, 0.2, 0.3, 1, 2, 3, 10, 11, 12]).astype(np.float32).reshape(1, 3 * hidden_size, input_size) + ) weight_scale = 0.1 - R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) + R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 gru = GRU_Helper(X=input, W=W, R=R, direction="forward") fw_output = gru.run() 
print_results(fw_output) @staticmethod - def ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows(): - + def ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows(): # noqa: N802 print(GRU_ONNXRuntimeUnitTests.ReverseDefaultActivationsSimpleWeightsNoBiasTwoRows.__name__) - seq_length = 2 - batch_size = 2 input_size = 1 hidden_size = 3 input = np.array([[[1.0], [2.0]], [[10.0], [11.0]]]).astype(np.float32) - W = np.array([0.1, 0.2, 0.3, 1, 2, 3, 10, 11, 12]).astype(np.float32).reshape(1, 3 * hidden_size, input_size) + W = ( # noqa: N806 + np.array([0.1, 0.2, 0.3, 1, 2, 3, 10, 11, 12]).astype(np.float32).reshape(1, 3 * hidden_size, input_size) + ) weight_scale = 0.1 - R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) + R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 gru = GRU_Helper(X=input, W=W, R=R, direction="reverse") fw_output = gru.run() print_results(fw_output) @staticmethod - def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0): - + def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0): # noqa: N802 print( GRU_ONNXRuntimeUnitTests.BidirectionalDefaultActivationsSimpleWeightsNoBias.__name__ + ".linear_before_reset=" + str(linear_before_reset) ) - seq_length = 2 - batch_size = 3 if linear_before_reset else 2 input_size = 1 hidden_size = 3 @@ -283,10 +287,12 @@ def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0): else: input = np.array([[[1.0], [2.0]], [[10.0], [11.0]]]).astype(np.float32) - W = np.array([0.1, 0.2, 0.3, 1, 2, 3, 10, 11, 12]).astype(np.float32).reshape(1, 3 * hidden_size, input_size) + W = ( # noqa: N806 + np.array([0.1, 0.2, 0.3, 1, 2, 3, 10, 11, 12]).astype(np.float32).reshape(1, 3 * hidden_size, input_size) + ) weight_scale = 0.1 - R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) + R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 # duplicate the W and R inputs so we use the same values for both forward and reverse gru = GRU_Helper( @@ -301,8 +307,7 @@ def BidirectionalDefaultActivationsSimpleWeightsNoBias(linear_before_reset=0): print_results(fw_output) @staticmethod - def DefaultActivationsSimpleWeightsWithBias(rows=2, direction="forward", linear_before_reset=0): - + def DefaultActivationsSimpleWeightsWithBias(rows=2, direction="forward", linear_before_reset=0): # noqa: N802 print( GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias.__name__ + " batch_parallel=" @@ -325,17 +330,17 @@ def DefaultActivationsSimpleWeightsWithBias(rows=2, direction="forward", linear_ input = np.array(input).astype(np.float32).reshape(seq_length, batch_size, input_size) - W = ( + W = ( # noqa: N806 np.array([0.1, 0.2, 0.3, 0.2, 0.3, 0.1, 0.3, 0.1, 0.2]) .astype(np.float32) .reshape(1, 3 * hidden_size, input_size) ) weight_scale = 0.1 - R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) + R = weight_scale * np.ones((1, 3 * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 # Wb[zrh] Rb[zrh] - B = ( + B = ( # noqa: N806 np.array( [ -0.01, @@ -374,62 +379,56 @@ def DefaultActivationsSimpleWeightsWithBias(rows=2, direction="forward", linear_ print_results(fw_output) @staticmethod - def ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallel(): + def ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallel(): # noqa: N802 GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias() @staticmethod - def 
ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset(): - + def ForwardDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset(): # noqa: N802 GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(linear_before_reset=1) @staticmethod - def ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset(): - + def ReverseDefaultActivationsSimpleWeightsWithBiasBatchParallelLinearBeforeReset(): # noqa: N802 GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(direction="reverse", linear_before_reset=1) @staticmethod - def ForwardDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset(): - + def ForwardDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset(): # noqa: N802 GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias(rows=1, linear_before_reset=1) @staticmethod - def ReverseDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset(): - + def ReverseDefaultActivationsSimpleWeightsWithBiasLinearBeforeReset(): # noqa: N802 GRU_ONNXRuntimeUnitTests.DefaultActivationsSimpleWeightsWithBias( rows=1, direction="reverse", linear_before_reset=1 ) @staticmethod - def Legacy_TestGRUOpForwardBasic(): - + def Legacy_TestGRUOpForwardBasic(): # noqa: N802 print(GRU_ONNXRuntimeUnitTests.Legacy_TestGRUOpForwardBasic.__name__) input = np.array([[[-0.455351, -0.276391]], [[-0.185934, -0.269585]]]).astype(np.float32) - W, R, B = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 gru = GRU_Helper(X=input, W=W, R=R, B=B) output = gru.run() print_results(output) @staticmethod - def Legacy_TestGRUOpBackwardBasic(): + def Legacy_TestGRUOpBackwardBasic(): # noqa: N802 print(GRU_ONNXRuntimeUnitTests.Legacy_TestGRUOpBackwardBasic.__name__) input = np.array([[[-0.185934, -0.269585]], [[-0.455351, -0.276391]]]).astype(np.float32) - W, R, B = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 gru = GRU_Helper(X=input, W=W, R=R, B=B, direction="reverse") output = gru.run() print_results(output) @staticmethod - def Legacy_TestGRUOpBidirectionalBasic(): - + def Legacy_TestGRUOpBidirectionalBasic(): # noqa: N802 print(GRU_ONNXRuntimeUnitTests.Legacy_TestGRUOpBidirectionalBasic.__name__) input = np.array([[[-0.455351, -0.276391]], [[-0.185934, -0.269585]]]).astype(np.float32) - W, R, B = ONNXRuntimeTestContext.BidirectionalWeights() + W, R, B = ONNXRuntimeTestContext.BidirectionalWeights() # noqa: N806 gru = GRU_Helper(X=input, W=W, R=R, B=B, direction="bidirectional") output = gru.run() print_results(output) diff --git a/onnxruntime/test/providers/cpu/rnn/LSTM.py b/onnxruntime/test/providers/cpu/rnn/LSTM.py index 039a419552586..74299ea2c75a3 100644 --- a/onnxruntime/test/providers/cpu/rnn/LSTM.py +++ b/onnxruntime/test/providers/cpu/rnn/LSTM.py @@ -1,9 +1,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
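Most of the changes in these test files add targeted lint suppressions instead of renaming long-standing test helpers: N801/N802/N806 come from pep8-naming (class, function, and local-variable casing), F401 from pyflakes (unused import), and B006 from flake8-bugbear (mutable default argument). A minimal illustration of the suppression pattern, using made-up names rather than code from this patch:

    import numpy as np

    def LegacyStyleTest():  # noqa: N802  -- keep the historical CamelCase test name
        W = np.ones((2, 2))  # noqa: N806  -- uppercase local mirrors the ONNX tensor name
        return W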
-from __future__ import absolute_import, division, print_function, unicode_literals -from typing import Any, Tuple +from typing import Any, Tuple # noqa: F401 import numpy as np # type: ignore @@ -30,28 +29,27 @@ def print_results(Y, Y_h, Y_c): print("*************************") -class LSTM_Helper: +class LSTM_Helper: # noqa: N801 def __init__(self, **params): # type: (*Any) -> None - required_inputs = ["X", "W", "R"] for i in required_inputs: - assert i in params, "Missing Required Input: {0}".format(i) + assert i in params, f"Missing Required Input: {i}" - X = params["X"] - W = params["W"] - R = params["R"] + X = params["X"] # noqa: N806 + W = params["W"] # noqa: N806 + R = params["R"] # noqa: N806 num_directions = W.shape[0] - sequence_length = X.shape[0] + X.shape[0] batch_size = X.shape[1] hidden_size = R.shape[-1] - B = ( + B = ( # noqa: N806 params["B"] if "B" in params else np.zeros(num_directions * 8 * hidden_size).reshape(num_directions, 8 * hidden_size) ) - P = ( + P = ( # noqa: N806 params["P"] if "P" in params else np.zeros(num_directions * 3 * hidden_size).reshape(num_directions, 3 * hidden_size) @@ -86,10 +84,10 @@ def __init__(self, **params): # type: (*Any) -> None else: # split the inputs which have per direction rows - Wfw, Wbw = np.vsplit(W, 2) - Rfw, Rbw = np.vsplit(R, 2) - Bfw, Bbw = np.vsplit(B, 2) - Pfw, Pbw = np.vsplit(P, 2) + Wfw, Wbw = np.vsplit(W, 2) # noqa: N806 + Rfw, Rbw = np.vsplit(R, 2) # noqa: N806 + Bfw, Bbw = np.vsplit(B, 2) # noqa: N806 + Pfw, Pbw = np.vsplit(P, 2) # noqa: N806 h_0fw, h_0bw = np.vsplit(h_0, 2) c_0fw, c_0bw = np.vsplit(c_0, 2) @@ -110,10 +108,9 @@ def __init__(self, **params): # type: (*Any) -> None ) def run(self): - if self.direction == "bidirectional": - f_output, f_Y_h, f_Y_c = self.one.execute() - r_output, r_Y_h, r_Y_c = self.two.execute() + f_output, f_Y_h, f_Y_c = self.one.execute() # noqa: N806 + r_output, r_Y_h, r_Y_c = self.two.execute() # noqa: N806 # flip reverse output it matches the original input order r_output_orig_input_order = np.flip(r_output, 0) @@ -133,11 +130,11 @@ def run(self): output = output.reshape(seq_length, 2, batch_size, hidden_size) - Y_h = np.append(f_Y_h, r_Y_h) - Y_c = np.append(f_Y_c, r_Y_c) + Y_h = np.append(f_Y_h, r_Y_h) # noqa: N806 + Y_c = np.append(f_Y_c, r_Y_c) # noqa: N806 else: - output, Y_h, Y_c = self.one.execute() + output, Y_h, Y_c = self.one.execute() # noqa: N806 if self.direction == "reverse": # flip so it's back in the original order of the inputs output = np.flip(output, 0) @@ -171,7 +168,6 @@ def __init__( input_forget=False, clip=9999.0, ): - self.X = X # remove num_directions axis for W, R, B, P, H_0, C_0 self.W = np.squeeze(W, axis=0) @@ -196,12 +192,11 @@ def __init__( self.clip = clip def execute(self): # type: () -> Tuple[np.ndarray, np.ndarray] - [p_i, p_o, p_f] = np.split(self.P, 3) h_list = [] - H_t = self.h_0 - C_t = self.c_0 + H_t = self.h_0 # noqa: N806 + C_t = self.c_0 # noqa: N806 for x in np.split(self.X, self.X.shape[0], axis=0): print_with_shape("Xt1", x) @@ -210,11 +205,11 @@ def execute(self): # type: () -> Tuple[np.ndarray, np.ndarray] print_with_shape("W^T", np.transpose(self.W)) # t0 == t-1, t1 == current - Xt1_W = np.dot(x, np.transpose(self.W)) + Xt1_W = np.dot(x, np.transpose(self.W)) # noqa: N806 print_with_shape("Xt1_W^T", Xt1_W) - Ht0_R = np.dot(H_t, np.transpose(self.R)) + Ht0_R = np.dot(H_t, np.transpose(self.R)) # noqa: N806 print_with_shape("Ht-1*R", Ht0_R) - WbRb = np.add(*np.split(self.B, 2)) + WbRb = np.add(*np.split(self.B, 2)) # noqa: N806 
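The LSTM_Helper changes above keep the same gate arithmetic (gates = X_t W^T + H_{t-1} R^T + (Wb + Rb), peephole-adjusted i/f/o gates, then C_t = f * C_{t-1} + i * c and H_t = o * tanh(C_t)). A compact NumPy sketch of one such step, assuming the ONNX i, o, f, c gate stacking and placeholder names, might look like:

    import numpy as np

    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    def lstm_step(x, h_prev, c_prev, W, R, B, P):
        # W: (4*hidden, input), R: (4*hidden, hidden); ONNX stacks the gates as i, o, f, c.
        Wi, Wo, Wf, Wc = np.split(W, 4, axis=0)
        Ri, Ro, Rf, Rc = np.split(R, 4, axis=0)
        bi, bo, bf, bc = np.split(np.add(*np.split(B, 2)), 4, axis=-1)  # Wb + Rb, as in the helper
        p_i, p_o, p_f = np.split(P, 3, axis=-1)                         # peepholes in i, o, f order
        i = sigmoid(x @ Wi.T + h_prev @ Ri.T + p_i * c_prev + bi)  # input gate
        f = sigmoid(x @ Wf.T + h_prev @ Rf.T + p_f * c_prev + bf)  # forget gate
        c_tilde = np.tanh(x @ Wc.T + h_prev @ Rc.T + bc)           # candidate cell
        c = f * c_prev + i * c_tilde
        o = sigmoid(x @ Wo.T + h_prev @ Ro.T + p_o * c + bo)       # output gate peeks at new cell
        return o * np.tanh(c), c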
print_with_shape("Wb + Rb", WbRb) gates = Xt1_W + Ht0_R + WbRb @@ -231,12 +226,12 @@ def execute(self): # type: () -> Tuple[np.ndarray, np.ndarray] else: f = self.f(np.clip((ft_in + p_f * C_t), -self.clip, self.clip)) c = self.g(np.clip(ct_in, -self.clip, self.clip)) - C = f * C_t + i * c + C = f * C_t + i * c # noqa: N806 o = self.f(np.clip((ot_in + p_o * C), -self.clip, self.clip)) - H = o * self.h(C) + H = o * self.h(C) # noqa: N806 h_list.append(H) - H_t = H - C_t = C + H_t = H # noqa: N806 + C_t = C # noqa: N806 print_with_shape("i", i) print_with_shape("f", f) @@ -252,40 +247,36 @@ def execute(self): # type: () -> Tuple[np.ndarray, np.ndarray] class LSTM: # Base): @staticmethod - def SimpleWeightsNoBiasTwoRows(direction): # type: () -> None - + def SimpleWeightsNoBiasTwoRows(direction): # type: () -> None # noqa: N802 print(LSTM.SimpleWeightsNoBiasTwoRows.__name__ + " direction=" + direction) - seq_length = 2 - batch_size = 2 input_size = 1 hidden_size = 3 number_of_gates = 4 input = np.array([[[1.0], [2.0]], [[10.0], [11.0]]]).astype(np.float32) - W = ( + W = ( # noqa: N806 np.array([0.1, 0.2, 0.3, 0.4, 1, 2, 3, 4, 10, 11, 12, 13]) .astype(np.float32) .reshape(1, number_of_gates * hidden_size, input_size) ) weight_scale = 0.1 - R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) + R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 if direction == "bidirectional": - W = W = np.tile(W, (2, 1)).reshape(2, number_of_gates * hidden_size, input_size) - R = R = np.tile(R, (2, 1)).reshape(2, number_of_gates * hidden_size, hidden_size) + W = W = np.tile(W, (2, 1)).reshape(2, number_of_gates * hidden_size, input_size) # noqa: N806 + R = R = np.tile(R, (2, 1)).reshape(2, number_of_gates * hidden_size, hidden_size) # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, direction=direction) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 # expect(node, inputs=[input, W, R], outputs=[Y_h.astype(np.float32)], name='test_lstm_defaults') print_results(Y, Y_h, Y_c) @staticmethod - def LargeBatchWithClip(clip): - + def LargeBatchWithClip(clip): # noqa: N802 print(LSTM.LargeBatchWithClip.__name__ + " clip=" + str(clip)) seq_length = 2 @@ -301,22 +292,22 @@ def LargeBatchWithClip(clip): .reshape(seq_length, batch_size, input_size) ) - W = ( + W = ( # noqa: N806 np.array([0.1, 0.2, 0.3, 0.4, 1, 2, 3, 4, 10, 11, 12, 13]) .astype(np.float32) .reshape(1, number_of_gates * hidden_size, input_size) ) weight_scale = 0.1 - R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) + R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, clip=clip) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def BatchParallelFalseSeqLengthGreaterThanOne(): + def BatchParallelFalseSeqLengthGreaterThanOne(): # noqa: N802 print(LSTM.BatchParallelFalseSeqLengthGreaterThanOne.__name__) seq_length = 2 @@ -327,23 +318,22 @@ def BatchParallelFalseSeqLengthGreaterThanOne(): input = np.array([1, 2]).astype(np.float32).reshape(seq_length, batch_size, input_size) - W = ( + W = ( # noqa: N806 np.array([0.1, 0.2, 0.3, 0.4, 1, 2, 3, 4]) .astype(np.float32) .reshape(1, number_of_gates * hidden_size, input_size) ) weight_scale = 0.1 - R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) + R = 
weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod def export_initial_bias(): # type: () -> None - print(LSTM.export_initial_bias.__name__) input = np.array([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]]).astype(np.float32) @@ -361,16 +351,16 @@ def export_initial_bias(): # type: () -> None # hidden_size=hidden_size # ) - W = weight_scale * np.ones((1, number_of_gates * hidden_size, input_size)).astype(np.float32) - R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) + W = weight_scale * np.ones((1, number_of_gates * hidden_size, input_size)).astype(np.float32) # noqa: N806 + R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 # Adding custom bias - W_B = custom_bias * np.ones((1, number_of_gates * hidden_size)).astype(np.float32) - R_B = np.zeros((1, number_of_gates * hidden_size)).astype(np.float32) - B = np.concatenate((W_B, R_B), 1) + W_B = custom_bias * np.ones((1, number_of_gates * hidden_size)).astype(np.float32) # noqa: N806 + R_B = np.zeros((1, number_of_gates * hidden_size)).astype(np.float32) # noqa: N806 + B = np.concatenate((W_B, R_B), 1) # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, B=B) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) # expect(node, inputs=[input, W, R, B], outputs=[Y_h.astype(np.float32)], name='test_lstm_with_initial_bias') @@ -392,34 +382,32 @@ def export_peepholes(): # type: () -> None # ) # Initializing Inputs - W = weight_scale * np.ones((1, number_of_gates * hidden_size, input_size)).astype(np.float32) - R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) - B = np.zeros((1, 2 * number_of_gates * hidden_size)).astype(np.float32) - seq_lens = np.repeat(input.shape[0], input.shape[1]).astype(np.int32) + W = weight_scale * np.ones((1, number_of_gates * hidden_size, input_size)).astype(np.float32) # noqa: N806 + R = weight_scale * np.ones((1, number_of_gates * hidden_size, hidden_size)).astype(np.float32) # noqa: N806 + B = np.zeros((1, 2 * number_of_gates * hidden_size)).astype(np.float32) # noqa: N806 + np.repeat(input.shape[0], input.shape[1]).astype(np.int32) init_h = np.zeros((1, input.shape[1], hidden_size)).astype(np.float32) init_c = np.zeros((1, input.shape[1], hidden_size)).astype(np.float32) - P = weight_scale * np.ones((1, number_of_peepholes * hidden_size)).astype(np.float32) + P = weight_scale * np.ones((1, number_of_peepholes * hidden_size)).astype(np.float32) # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, B=B, P=P, initial_c=init_c, initial_h=init_h) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) # expect(node, inputs=[input, W, R, B, seq_lens, init_h, init_c, P], outputs=[Y_h.astype(np.float32)], # name='test_lstm_with_peepholes') class ONNXRuntimeTestContext: - hidden_size = 2 input_size = 2 @staticmethod - def OneDirectionWeights(): - + def OneDirectionWeights(): # noqa: N802 num_directions = 1 hidden_size = ONNXRuntimeTestContext.hidden_size input_size = ONNXRuntimeTestContext.input_size - W = ( + W = ( # noqa: N806 np.array( [ -0.494659, @@ -444,7 +432,7 @@ def OneDirectionWeights(): .astype(np.float32) ) - R = ( + R = ( # noqa: N806 np.array( [ 0.146626, @@ -469,14 +457,14 @@ def 
OneDirectionWeights(): .astype(np.float32) ) - P = ( + P = ( # noqa: N806 np.array([0.2345, 0.5235, 0.4378, 0.3475, 0.8927, 0.3456]) .reshape(num_directions, 3 * hidden_size) .astype(np.float32) ) # // [8*hidden] - B = ( + B = ( # noqa: N806 np.array( [ 0.381619, @@ -505,22 +493,21 @@ def OneDirectionWeights(): return W, R, B, P @staticmethod - def BidirectionalWeights(): - + def BidirectionalWeights(): # noqa: N802 hidden_size = ONNXRuntimeTestContext.hidden_size input_size = ONNXRuntimeTestContext.input_size - W1, R1, B1, P1 = ONNXRuntimeTestContext.OneDirectionWeights() + W1, R1, B1, P1 = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 - W = np.tile(W1, (2, 1)).reshape(2, 4 * hidden_size, input_size) - R = np.tile(R1, (2, 1)).reshape(2, 4 * hidden_size, hidden_size) - B = np.tile(B1, (2, 1)) - P = np.tile(P1, (2, 1)) + W = np.tile(W1, (2, 1)).reshape(2, 4 * hidden_size, input_size) # noqa: N806 + R = np.tile(R1, (2, 1)).reshape(2, 4 * hidden_size, hidden_size) # noqa: N806 + B = np.tile(B1, (2, 1)) # noqa: N806 + P = np.tile(P1, (2, 1)) # noqa: N806 return W, R, B, P @staticmethod - def DefaultInput(): + def DefaultInput(): # noqa: N802 seq_length = 2 batch_size = 1 input_size = 2 @@ -536,84 +523,83 @@ def DefaultInput(): class ONNXRuntimeUnitTests: @staticmethod - def ONNXRuntime_TestLSTMBidirectionalBasic(): + def ONNXRuntime_TestLSTMBidirectionalBasic(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMBidirectionalBasic.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.BidirectionalWeights() + W, R, B, P = ONNXRuntimeTestContext.BidirectionalWeights() # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, B=B, P=P, direction="bidirectional") - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes(): + def ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMForwardNoBiasUsePeepholes.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, P=P) # no bias - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMForwardInputForget(): + def ONNXRuntime_TestLSTMForwardInputForget(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMForwardInputForget.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, B=B, P=P, input_forget=True) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMForwardClip(): + def ONNXRuntime_TestLSTMForwardClip(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMForwardClip.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, B=B, P=P, clip=0.1) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMBackward(): + def ONNXRuntime_TestLSTMBackward(): # noqa: N802 
print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMBackward.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper(X=input, W=W, R=R, B=B, P=P, direction="reverse") - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMForwardHiddenState(): + def ONNXRuntime_TestLSTMForwardHiddenState(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMForwardHiddenState.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 initial_h = np.array([0.34, 0.72]).reshape(1, 1, 2).astype(np.float32) lstm = LSTM_Helper(X=input, W=W, R=R, B=B, initial_h=initial_h) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMForwardCellState(): + def ONNXRuntime_TestLSTMForwardCellState(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMForwardCellState.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 initial_h = np.array([0.34, 0.72]).reshape(1, 1, 2).astype(np.float32) initial_c = np.array([0.63, 0.21]).reshape(1, 1, 2).astype(np.float32) lstm = LSTM_Helper(X=input, W=W, R=R, B=B, initial_h=initial_h, initial_c=initial_c) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMActivation(): - + def ONNXRuntime_TestLSTMActivation(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMActivation.__name__) input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper( X=input, W=W, @@ -623,19 +609,18 @@ def ONNXRuntime_TestLSTMActivation(): g=ActivationFuncs.sigmoid, h=ActivationFuncs.tanh, ) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMBatchReallocation(): - + def ONNXRuntime_TestLSTMBatchReallocation(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMBatchReallocation.__name__) seq_length = 2 batch_size = 1 input_size = 2 input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper( X=input, W=W, @@ -645,7 +630,7 @@ def ONNXRuntime_TestLSTMBatchReallocation(): g=ActivationFuncs.sigmoid, h=ActivationFuncs.tanh, ) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) print("===============") @@ -671,7 +656,7 @@ def ONNXRuntime_TestLSTMBatchReallocation(): .astype(np.float32) ) - W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() + W, R, B, P = ONNXRuntimeTestContext.OneDirectionWeights() # noqa: N806 lstm = LSTM_Helper( X=input, W=W, @@ -681,19 +666,18 @@ def ONNXRuntime_TestLSTMBatchReallocation(): g=ActivationFuncs.sigmoid, h=ActivationFuncs.tanh, ) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) @staticmethod - def ONNXRuntime_TestLSTMOutputWrite(): - + def 
ONNXRuntime_TestLSTMOutputWrite(): # noqa: N802 print(ONNXRuntimeUnitTests.ONNXRuntime_TestLSTMOutputWrite.__name__) seq_length = 2 batch_size = 1 input_size = 2 input = ONNXRuntimeTestContext.DefaultInput() - W, R, B, P = ONNXRuntimeTestContext.BidirectionalWeights() + W, R, B, P = ONNXRuntimeTestContext.BidirectionalWeights() # noqa: N806 lstm = LSTM_Helper( X=input, W=W, @@ -704,7 +688,7 @@ def ONNXRuntime_TestLSTMOutputWrite(): g=ActivationFuncs.sigmoid, h=ActivationFuncs.tanh, ) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) print("===============") @@ -731,7 +715,7 @@ def ONNXRuntime_TestLSTMOutputWrite(): .astype(np.float32) ) - W, R, B, P = ONNXRuntimeTestContext.BidirectionalWeights() + W, R, B, P = ONNXRuntimeTestContext.BidirectionalWeights() # noqa: N806 lstm = LSTM_Helper( X=input, W=W, @@ -742,7 +726,7 @@ def ONNXRuntime_TestLSTMOutputWrite(): g=ActivationFuncs.sigmoid, h=ActivationFuncs.tanh, ) - Y, Y_h, Y_c = lstm.run() + Y, Y_h, Y_c = lstm.run() # noqa: N806 print_results(Y, Y_h, Y_c) diff --git a/onnxruntime/test/python/contrib_ops/aten_op_tests.py b/onnxruntime/test/python/contrib_ops/aten_op_tests.py index 0ae8fe6c280c5..925a88ac28c69 100644 --- a/onnxruntime/test/python/contrib_ops/aten_op_tests.py +++ b/onnxruntime/test/python/contrib_ops/aten_op_tests.py @@ -19,7 +19,7 @@ class OrtOpTests(unittest.TestCase): def test_aten_embedding(self): class NeuralNetEmbedding(torch.nn.Module): def __init__(self, num_embeddings, embedding_dim, hidden_size): - super(NeuralNetEmbedding, self).__init__() + super().__init__() self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim) self.linear = torch.nn.Linear(embedding_dim, hidden_size) @@ -27,7 +27,7 @@ def forward(self, input): embedding_result = self.embedding(input) return embedding_result, self.linear(embedding_result) - N, num_embeddings, embedding_dim, hidden_size = 64, 32, 128, 128 + N, num_embeddings, embedding_dim, hidden_size = 64, 32, 128, 128 # noqa: N806 model = NeuralNetEmbedding(num_embeddings, embedding_dim, hidden_size) with torch.no_grad(): @@ -56,7 +56,7 @@ def forward(self, input): attr = node.attribute.add() attr.name = "operator" attr.type = 3 - attr.s = "embedding".encode() + attr.s = b"embedding" exported_model.graph.node.append( helper.make_node( "Constant", diff --git a/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py b/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py index 521938b14b65e..0dce08aeccb2c 100644 --- a/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py +++ b/onnxruntime/test/python/contrib_ops/onnx_contrib_ops_helper.py @@ -38,11 +38,11 @@ def generate_data(graph, inputs, outputs, name): prepare_dir(data_set) for j, input_np in enumerate(inputs): tensor = numpy_helper.from_array(input_np, model.graph.input[j].name) - with open(os.path.join(data_set, "input_{}.pb".format(j)), "wb") as f: + with open(os.path.join(data_set, f"input_{j}.pb"), "wb") as f: f.write(tensor.SerializeToString()) for j, output_np in enumerate(outputs): tensor = numpy_helper.from_array(output_np, model.graph.output[j].name) - with open(os.path.join(data_set, "output_{}.pb".format(j)), "wb") as f: + with open(os.path.join(data_set, f"output_{j}.pb"), "wb") as f: f.write(tensor.SerializeToString()) @@ -57,12 +57,12 @@ def expect( present_outputs = [x for x in node.output if (x != "")] input_types = [None] * len(inputs) if "input_types" in kwargs: - input_types = kwargs[str("input_types")] - del kwargs[str("input_types")] + 
input_types = kwargs["input_types"] + del kwargs["input_types"] output_types = [None] * len(outputs) if "output_types" in kwargs: - output_types = kwargs[str("output_types")] - del kwargs[str("output_types")] + output_types = kwargs["output_types"] + del kwargs["output_types"] inputs_vi = [ _extract_value_info(arr, arr_name, input_type) for arr, arr_name, input_type in zip(inputs, present_inputs, input_types) diff --git a/onnxruntime/test/python/helper.py b/onnxruntime/test/python/helper.py index 66a4f27319c1a..2a2c3fc9b4532 100644 --- a/onnxruntime/test/python/helper.py +++ b/onnxruntime/test/python/helper.py @@ -12,4 +12,4 @@ def get_name(name): res = os.path.join(data, name) if os.path.exists(res): return res - raise FileNotFoundError("Unable to find '{0}' or '{1}' or '{2}'".format(name, rel, res)) + raise FileNotFoundError(f"Unable to find '{name}' or '{rel}' or '{res}'") diff --git a/onnxruntime/test/python/onnxruntime_test_collective.py b/onnxruntime/test/python/onnxruntime_test_collective.py index 8329ab36337c2..a6e420fe47040 100644 --- a/onnxruntime/test/python/onnxruntime_test_collective.py +++ b/onnxruntime/test/python/onnxruntime_test_collective.py @@ -3,17 +3,17 @@ import unittest import numpy as np -import onnx +import onnx # noqa: F401 from mpi4py import MPI -from onnx import AttributeProto, GraphProto, TensorProto, helper +from onnx import AttributeProto, GraphProto, TensorProto, helper # noqa: F401 import onnxruntime as ort class ORTBertPretrainTest(unittest.TestCase): def _create_allreduce_ut_model(self, shape): - X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) - Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape) + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) # noqa: N806 + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape) # noqa: N806 node_def = helper.make_node("AllReduce", ["X"], ["Y"], domain="com.microsoft") graph_def = helper.make_graph( [node_def], @@ -28,10 +28,10 @@ def _get_rank_size(self): return comm.Get_rank(), comm.Get_size() def _create_allgather_ut_model(self, shape): - X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) # noqa: N806 rank, size = self._get_rank_size() output_shape = [s * size if _ == 0 else s for _, s in enumerate(shape)] - Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, output_shape) + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, output_shape) # noqa: N806 node_def = helper.make_node("AllGather", ["X"], ["Y"], domain="com.microsoft", group_size=size) graph_def = helper.make_graph( [node_def], @@ -42,8 +42,8 @@ def _create_allgather_ut_model(self, shape): return helper.make_model(graph_def, producer_name="ort-distributed-inference-unittest") def _create_alltoall_ut_model(self, shape): - X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) - Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape) + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, shape) # noqa: N806 + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, shape) # noqa: N806 _, size = self._get_rank_size() node_def = helper.make_node("AllToAll", ["X"], ["Y"], domain="com.microsoft", group_size=size) graph_def = helper.make_graph( diff --git a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py index 30e2dc62e16da..096b3b639648d 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer.py +++ 
b/onnxruntime/test/python/onnxruntime_test_ort_trainer.py @@ -3,17 +3,15 @@ import copy import os -import sys import unittest import numpy as np import onnx -import pytest import torch import torch.nn as nn import torch.nn.functional as F from helper import get_name -from numpy.testing import assert_allclose, assert_array_equal +from numpy.testing import assert_allclose from torchvision import datasets, transforms import onnxruntime @@ -140,7 +138,7 @@ def create_ort_trainer( return model, model_desc, device -def runBertTrainingTest( +def runBertTrainingTest( # noqa: N802 gradient_accumulation_steps, use_mixed_precision, allreduce_post_accumulation, @@ -170,7 +168,7 @@ def runBertTrainingTest( next_sentence_labels_batches = [] batch_size = 16 num_batches = 8 - for batch in range(num_batches): + for _batch in range(num_batches): input_ids_batches = [ *input_ids_batches, generate_sample_batch(model_desc.inputs_[0], batch_size, device), @@ -267,7 +265,7 @@ def runBertTrainingTest( class MNISTWrapper: class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(MNISTWrapper.NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -282,7 +280,7 @@ def forward(self, x): class NeuralNetWithLoss(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(MNISTWrapper.NeuralNetWithLoss, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -293,7 +291,7 @@ def forward(self, x, target): out = self.fc2(out) return F.nll_loss(F.log_softmax(out, dim=1), target), out - def my_loss(x, target): + def my_loss(x, target): # noqa: N805 return F.nll_loss(F.log_softmax(x, dim=1), target) def train_with_trainer(self, learningRate, trainer, device, train_loader, epoch): @@ -415,7 +413,7 @@ def get_trainer( model_desc, device, onnx_opset_ver=12, - frozen_weights=[], + frozen_weights=[], # noqa: B006 internal_loss_fn=False, get_lr_this_step=None, optimizer="SGDOptimizer", @@ -442,7 +440,7 @@ def get_trainer( class TestOrtTrainer(unittest.TestCase): - def run_mnist_training_and_testing(onnx_opset_ver): + def run_mnist_training_and_testing(onnx_opset_ver): # noqa: N805 torch.manual_seed(1) device = torch.device("cuda") @@ -451,7 +449,7 @@ def run_mnist_training_and_testing(onnx_opset_ver): model, model_desc = mnist.get_model() trainer = mnist.get_trainer(model, model_desc, device, onnx_opset_ver=onnx_opset_ver) - learningRate = 0.01 + learningRate = 0.01 # noqa: N806 args_epochs = 2 expected_losses = [ 2.312044143676758, @@ -518,10 +516,10 @@ def run_mnist_training_and_testing(onnx_opset_ver): err_msg="test accuracy mismatch", ) - def testMNISTTrainingAndTestingOpset12(self): + def testMNISTTrainingAndTestingOpset12(self): # noqa: N802 TestOrtTrainer.run_mnist_training_and_testing(onnx_opset_ver=12) - def testMNISTResumeTrainingAndTesting(self): + def testMNISTResumeTrainingAndTesting(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -529,7 +527,7 @@ def testMNISTResumeTrainingAndTesting(self): train_loader, test_loader = mnist.get_loaders() model, model_desc = mnist.get_model() - learningRate = 0.01 + learningRate = 0.01 # noqa: N806 args_epochs = 2 args_checkpoint_epoch = 1 # should match those in test without checkpointing @@ -590,7 +588,7 @@ def testMNISTResumeTrainingAndTesting(self): err_msg="test accuracy mismatch", ) - def 
testMNISTStateDict(self): + def testMNISTStateDict(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -602,8 +600,7 @@ def testMNISTStateDict(self): state_dict = trainer.state_dict() assert state_dict == {} - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -620,7 +617,7 @@ def testMNISTStateDict(self): "bias_buffer", } - def testMNISTSaveAsONNX(self): + def testMNISTSaveAsONNX(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") onnx_file_name = "mnist.onnx" @@ -635,8 +632,7 @@ def testMNISTSaveAsONNX(self): trainer.save_as_onnx(onnx_file_name) assert not os.path.exists(onnx_file_name) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -647,7 +643,7 @@ def testMNISTSaveAsONNX(self): trainer.save_as_onnx(onnx_file_name) assert os.path.exists(onnx_file_name) - def testMNISTDevice(self): + def testMNISTDevice(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -658,8 +654,7 @@ def testMNISTDevice(self): for model_device in [torch.device("cpu"), torch.device("cuda")]: model.to(model_device) trainer = mnist.get_trainer(model, model_desc, device) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -667,7 +662,7 @@ def testMNISTDevice(self): loss, _ = trainer.train_step(data, target, torch.tensor([learningRate])) - def testMNISTInitializerNames(self): + def testMNISTInitializerNames(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -676,8 +671,7 @@ def testMNISTInitializerNames(self): model, model_desc = mnist.get_model() trainer = mnist.get_trainer(model, model_desc, device) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -685,11 +679,11 @@ def testMNISTInitializerNames(self): loss, _ = trainer.train_step(data, target, torch.tensor([learningRate])) - assert (set([n.name for n in trainer.onnx_model_.graph.initializer]) - set(["bias_buffer"])) == set( - [n for n, t in model.named_parameters()] - ) + assert ({n.name for n in trainer.onnx_model_.graph.initializer} - {"bias_buffer"}) == { + n for n, t in model.named_parameters() + } - def testMNISTInitializerNamesWithInternalLoss(self): + def testMNISTInitializerNamesWithInternalLoss(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -698,7 +692,7 @@ def testMNISTInitializerNamesWithInternalLoss(self): model, model_desc = mnist.get_model_with_internal_loss() def get_lr_this_step(global_step): - learningRate = 0.02 + learningRate = 0.02 # noqa: N806 return torch.tensor([learningRate]) trainer = mnist.get_trainer( @@ -708,7 +702,6 @@ def get_lr_this_step(global_step): internal_loss_fn=True, get_lr_this_step=get_lr_this_step, ) - epoch = 0 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -716,11 +709,9 @@ def get_lr_this_step(global_step): loss, _ = trainer.train_step(data, target) - assert set([n.name for n in trainer.onnx_model_.graph.initializer]) == set( - [n for n, t in model.named_parameters()] - ) + assert {n.name for n in trainer.onnx_model_.graph.initializer} == {n for n, t in model.named_parameters()} - def testMNISTFrozenWeight(self): + def 
testMNISTFrozenWeight(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -730,8 +721,7 @@ def testMNISTFrozenWeight(self): trainer = mnist.get_trainer(model, model_desc, device, frozen_weights=["fc1.weight"]) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -748,7 +738,7 @@ def testMNISTFrozenWeight(self): fc2_trainstep_2 = trainer.state_dict()["fc2.weight"] assert np.array_equal(fc1_trainstep_1, fc1_trainstep_2) and not np.array_equal(fc2_trainstep_1, fc2_trainstep_2) - def testMNISTTorchBuffer(self): + def testMNISTTorchBuffer(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -758,8 +748,7 @@ def testMNISTTorchBuffer(self): trainer = mnist.get_trainer(model, model_desc, device) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) @@ -778,7 +767,7 @@ def testMNISTTorchBuffer(self): bias_buffer_trainstep_1, bias_buffer_trainstep_2 ) - def testMNISTFrozenWeightCheckpoint(self): + def testMNISTFrozenWeightCheckpoint(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -788,8 +777,7 @@ def testMNISTFrozenWeightCheckpoint(self): trainer = mnist.get_trainer(model, model_desc, device, frozen_weights=["fc1.weight"]) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 # do one train step data, target = next(iter(train_loader)) @@ -818,7 +806,7 @@ def testMNISTFrozenWeightCheckpoint(self): loaded_state_dict = trainer.state_dict() assert state_dict.keys() == loaded_state_dict.keys() - def testMNISTTrainingCheckpoint(self): + def testMNISTTrainingCheckpoint(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -834,11 +822,10 @@ def testMNISTTrainingCheckpoint(self): frozen_weights=["fc1.weight"], ) - learningRate = 0.02 - epoch = 0 + learningRate = 0.02 # noqa: N806 # do 5 train step - for i in range(5): + for _i in range(5): data, target = next(iter(train_loader)) data, target = data.to(device), target.to(device) data = data.reshape(data.shape[0], -1) @@ -873,7 +860,7 @@ def testMNISTTrainingCheckpoint(self): for key in state_dict: assert np.array_equal(state_dict[key], loaded_state_dict[key]) - def testBertTrainingBasic(self): + def testBertTrainingBasic(self): # noqa: N802 expected_losses = [ 11.027887, 11.108191, @@ -907,7 +894,7 @@ def testBertTrainingBasic(self): err_msg="evaluation loss mismatch", ) - def testBertTrainingGradientAccumulation(self): + def testBertTrainingGradientAccumulation(self): # noqa: N802 expected_losses = [ 11.027887, 11.108191, @@ -942,7 +929,7 @@ def testBertTrainingGradientAccumulation(self): err_msg="evaluation loss mismatch", ) - def testBertCheckpointingBasic(self): + def testBertCheckpointingBasic(self): # noqa: N802 model, _, _ = create_ort_trainer( gradient_accumulation_steps=1, use_mixed_precision=False, @@ -976,7 +963,7 @@ def testBertCheckpointingBasic(self): for k, v in loaded_sd.items(): assert torch.all(torch.eq(v, sd[k])) - def testWrapModelLossFnStateDict(self): + def testWrapModelLossFnStateDict(self): # noqa: N802 torch.manual_seed(1) device = torch.device("cuda") @@ -1011,7 +998,7 @@ def loss_fn(x, label): return F.nll_loss(F.log_softmax(x, dim=1), label) def get_lr_this_step(global_step): - learningRate = 0.02 + learningRate = 0.02 # noqa: N806 return torch.tensor([learningRate]) ort_trainer = ORTTrainer( diff --git 
a/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py b/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py index 0c9e703c61fe5..53ace2a642652 100644 --- a/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py +++ b/onnxruntime/test/python/onnxruntime_test_ort_trainer_with_mixed_precision.py @@ -8,7 +8,7 @@ class TestOrtTrainer(unittest.TestCase): - def testBertTrainingMixedPrecision(self): + def testBertTrainingMixedPrecision(self): # noqa: N802 expected_losses = [ 11.034248352050781, 11.125300407409668, @@ -38,7 +38,7 @@ def testBertTrainingMixedPrecision(self): err_msg="evaluation loss mismatch", ) - def testBertTrainingMixedPrecisionInternalLossScale(self): + def testBertTrainingMixedPrecisionInternalLossScale(self): # noqa: N802 expected_losses = [ 11.034248352050781, 11.125300407409668, @@ -67,7 +67,7 @@ def testBertTrainingMixedPrecisionInternalLossScale(self): err_msg="evaluation loss mismatch", ) - def testBertTrainingGradientAccumulationMixedPrecision(self): + def testBertTrainingGradientAccumulationMixedPrecision(self): # noqa: N802 expected_losses = [ 11.034248352050781, 11.125300407409668, diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py index 9e4d1f6a80858..24f1097aef500 100644 --- a/onnxruntime/test/python/onnxruntime_test_python.py +++ b/onnxruntime/test/python/onnxruntime_test_python.py @@ -17,7 +17,7 @@ from onnxruntime.capi.onnxruntime_pybind11_state import Fail, OrtValueVector, RunOptions # handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed. -if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8: +if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8: # noqa: YTT204 os.add_dll_directory(os.getcwd()) available_providers = [provider for provider in onnxrt.get_available_providers()] @@ -52,17 +52,17 @@ def run_model(self, session_object, run_options): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testTvmImported(self): + def testTvmImported(self): # noqa: N802 if "TvmExecutionProvider" not in onnxrt.get_available_providers(): return import tvm self.assertTrue(tvm is not None) - def testGetVersionString(self): + def testGetVersionString(self): # noqa: N802 self.assertIsNot(onnxrt.get_version_string(), None) - def testModelSerialization(self): + def testModelSerialization(self): # noqa: N802 try: so = onnxrt.SessionOptions() so.log_severity_level = 1 @@ -83,22 +83,22 @@ def testModelSerialization(self): else: raise onnxruntime_error - def testGetProviders(self): + def testGetProviders(self): # noqa: N802 self.assertTrue("CPUExecutionProvider" in onnxrt.get_available_providers()) # get_all_providers() returns the default EP order from highest to lowest. # CPUExecutionProvider should always be last. 
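The provider checks in this hunk exercise the standard provider-discovery and session-creation flow. As a minimal usage sketch (mul_1.onnx stands in for any local model file):

    import onnxruntime as ort

    # Providers are reported highest priority first; CPUExecutionProvider is always available.
    print(ort.get_available_providers())

    sess = ort.InferenceSession("mul_1.onnx", providers=["CPUExecutionProvider"])
    print(sess.get_providers())  # providers actually registered for this session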
- self.assertTrue("CPUExecutionProvider" == onnxrt.get_all_providers()[-1]) + self.assertTrue(onnxrt.get_all_providers()[-1] == "CPUExecutionProvider") sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) self.assertTrue("CPUExecutionProvider" in sess.get_providers()) - def testEnablingAndDisablingTelemetry(self): + def testEnablingAndDisablingTelemetry(self): # noqa: N802 onnxrt.disable_telemetry_events() # no-op on non-Windows builds # may be no-op on certain Windows builds based on build configuration onnxrt.enable_telemetry_events() - def testSetProviders(self): + def testSetProviders(self): # noqa: N802 if "CUDAExecutionProvider" in onnxrt.get_available_providers(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CUDAExecutionProvider"]) # confirm that CUDA Provider is in list of registered providers. @@ -108,7 +108,7 @@ def testSetProviders(self): # confirm only CPU Provider is registered now. self.assertEqual(["CPUExecutionProvider"], sess.get_providers()) - def testSetProvidersWithOptions(self): + def testSetProvidersWithOptions(self): # noqa: N802 if "TensorrtExecutionProvider" in onnxrt.get_available_providers(): sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["TensorrtExecutionProvider"]) self.assertIn("TensorrtExecutionProvider", sess.get_providers()) @@ -173,11 +173,11 @@ def testSetProvidersWithOptions(self): if "CUDAExecutionProvider" in onnxrt.get_available_providers(): import ctypes - import sys + import sys # noqa: F401 - CUDA_SUCCESS = 0 + CUDA_SUCCESS = 0 # noqa: N806 - def runBaseTest1(): + def runBaseTest1(): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CUDAExecutionProvider"]) self.assertTrue("CUDAExecutionProvider" in sess.get_providers()) @@ -196,7 +196,7 @@ def runBaseTest1(): sess.get_providers(), ) - def runBaseTest2(): + def runBaseTest2(): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CUDAExecutionProvider"]) self.assertIn("CUDAExecutionProvider", sess.get_providers()) @@ -273,7 +273,7 @@ def test_get_and_set_option_with_values(option_name, option_values): with self.assertRaises(RuntimeError): sess.set_providers(["CUDAExecutionProvider"], [option]) - def getCudaDeviceCount(): + def getCudaDeviceCount(): # noqa: N802 import ctypes num_device = ctypes.c_int() @@ -289,7 +289,7 @@ def getCudaDeviceCount(): return num_device.value - def setDeviceIdTest(i): + def setDeviceIdTest(i): # noqa: N802 import ctypes import onnxruntime as onnxrt @@ -313,7 +313,7 @@ def setDeviceIdTest(i): self.assertEqual(result, CUDA_SUCCESS) self.assertEqual(i, device.value) - def runAdvancedTest(): + def runAdvancedTest(): # noqa: N802 num_device = getCudaDeviceCount() if num_device < 0: return @@ -355,7 +355,7 @@ def runAdvancedTest(): if "ROCMExecutionProvider" in onnxrt.get_available_providers(): - def runRocmOptionsTest(): + def runRocmOptionsTest(): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["ROCMExecutionProvider"]) self.assertIn("ROCMExecutionProvider", sess.get_providers()) options = sess.get_provider_options() @@ -378,20 +378,20 @@ def test_get_and_set_option_with_values(option_name, option_values): runRocmOptionsTest() - def testInvalidSetProviders(self): + def testInvalidSetProviders(self): # noqa: N802 with self.assertRaises(RuntimeError) as context: sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CPUExecutionProvider"]) sess.set_providers(["InvalidProvider"]) 
self.assertTrue("Unknown Provider Type: InvalidProvider" in str(context.exception)) - def testSessionProviders(self): + def testSessionProviders(self): # noqa: N802 if "CUDAExecutionProvider" in onnxrt.get_available_providers(): # create session from scratch, but constrain it to only use the CPU. sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=["CPUExecutionProvider"]) self.assertEqual(["CPUExecutionProvider"], sess.get_providers()) - def testGetAndSetTuningResults(self): - def getTuningResultsForEp(sess, ep): # without the outer list + def testGetAndSetTuningResults(self): # noqa: N802 + def getTuningResultsForEp(sess, ep): # without the outer list # noqa: N802 tuning_results = sess.get_tuning_results() self.assertGreaterEqual(len(tuning_results), 1) tuning_results_for_this_ep = [t for t in tuning_results if t.get("ep") == ep] @@ -402,21 +402,21 @@ def getTuningResultsForEp(sess, ep): # without the outer list probe_params_sig = "probe_but_not_an_params_signature" probe_value = 10000000 - def copyTuningResultsWithProbe(tr): + def copyTuningResultsWithProbe(tr): # noqa: N802 tr = copy.deepcopy(tr) tr["results"][probe_op_sig] = {probe_params_sig: probe_value} return tr - def assertTuningResultsLoaded(sess, ep): + def assertTuningResultsLoaded(sess, ep): # noqa: N802 tr = getTuningResultsForEp(sess, ep) self.assertIn(probe_op_sig, tr["results"]) self.assertEqual(tr["results"][probe_op_sig], {probe_params_sig: probe_value}) - def assertTuningResultsNotLoaded(sess, ep): + def assertTuningResultsNotLoaded(sess, ep): # noqa: N802 tr = getTuningResultsForEp(sess, ep) self.assertNotIn(probe_op_sig, tr["results"]) - def doTestGetAndSetTuningResults(ep): + def doTestGetAndSetTuningResults(ep): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=[ep]) tuning_results = getTuningResultsForEp(sess, ep) @@ -473,7 +473,7 @@ def doTestGetAndSetTuningResults(ep): if "ROCMExecutionProvider" in onnxrt.get_available_providers(): doTestGetAndSetTuningResults("ROCMExecutionProvider") - def testRunModel(self): + def testRunModel(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=available_providers) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -488,7 +488,7 @@ def testRunModel(self): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModelFromBytes(self): + def testRunModelFromBytes(self): # noqa: N802 with open(get_name("mul_1.onnx"), "rb") as f: content = f.read() sess = onnxrt.InferenceSession(content, providers=onnxrt.get_available_providers()) @@ -505,7 +505,7 @@ def testRunModelFromBytes(self): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModel2(self): + def testRunModel2(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -520,7 +520,7 @@ def testRunModel2(self): output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModel2Contiguous(self): + def testRunModel2Contiguous(self): # noqa: N802 sess = 
onnxrt.InferenceSession(get_name("matmul_1.onnx"), providers=onnxrt.get_available_providers()) x = np.array([[2.0, 1.0], [4.0, 3.0], [6.0, 5.0]], dtype=np.float32)[:, [1, 0]] input_name = sess.get_inputs()[0].name @@ -538,7 +538,7 @@ def testRunModel2Contiguous(self): rescontiguous = sess.run([output_name], {input_name: xcontiguous}) np.testing.assert_allclose(output_expected, rescontiguous[0], rtol=1e-05, atol=1e-08) - def testRunModelMultipleThreads(self): + def testRunModelMultipleThreads(self): # noqa: N802 # Skip this test for a "pure" DML onnxruntime python wheel. # We keep this test enabled for instances where both DML and CUDA EPs are available # (Windows GPU CI pipeline has this config) - this test will pass because CUDA has higher precedence @@ -568,7 +568,7 @@ def testRunModelMultipleThreads(self): t1.join() t2.join() - def testListAsInput(self): + def testListAsInput(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers()) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -576,18 +576,18 @@ def testListAsInput(self): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testStringListAsInput(self): + def testStringListAsInput(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) x_name = sess.get_inputs()[0].name res = sess.run([], {x_name: x.tolist()}) np.testing.assert_equal(x, res[0]) - def testRunDevice(self): + def testRunDevice(self): # noqa: N802 device = onnxrt.get_device() self.assertTrue("CPU" in device or "GPU" in device) - def testRunModelSymbolicInput(self): + def testRunModelSymbolicInput(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("matmul_2.onnx"), providers=available_providers_without_tvm) x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) input_name = sess.get_inputs()[0].name @@ -604,7 +604,7 @@ def testRunModelSymbolicInput(self): output_expected = np.array([[5.0], [11.0], [17.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testBooleanInputs(self): + def testBooleanInputs(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=available_providers) a = np.array([[True, True], [False, False]], dtype=bool) b = np.array([[True, False], [True, False]], dtype=bool) @@ -636,7 +636,7 @@ def testBooleanInputs(self): res = sess.run([output_name], {a_name: a, b_name: b}) np.testing.assert_equal(output_expected, res[0]) - def testStringInput1(self): + def testStringInput1(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], dtype=str).reshape((2, 2)) @@ -657,7 +657,7 @@ def testStringInput1(self): res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - def testStringInput2(self): + def testStringInput2(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["Olá", "你好", "여보세요", "hello"], dtype=str).reshape((2, 2)) @@ -678,7 +678,7 @@ def testStringInput2(self): res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - 
def testInputBytes(self): + def testInputBytes(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array([b"this", b"is", b"identity", b"test"]).reshape((2, 2)) @@ -699,7 +699,7 @@ def testInputBytes(self): res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0].astype("|S8")) - def testInputObject(self): + def testInputObject(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) x = np.array(["this", "is", "identity", "test"], object).reshape((2, 2)) @@ -720,7 +720,7 @@ def testInputObject(self): res = sess.run([output_name], {x_name: x}) np.testing.assert_equal(x, res[0]) - def testInputVoid(self): + def testInputVoid(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("identity_string.onnx"), providers=available_providers_without_tvm) # numpy 1.20+ doesn't automatically pad the bytes based entries in the array when dtype is np.void, # so we use inputs where that is the case @@ -745,15 +745,15 @@ def testInputVoid(self): expr = np.array([["must", "have"], ["same", "size"]], dtype=object) np.testing.assert_equal(expr, res[0]) - def testRaiseWrongNumInputs(self): + def testRaiseWrongNumInputs(self): # noqa: N802 with self.assertRaises(ValueError) as context: sess = onnxrt.InferenceSession(get_name("logicaland.onnx"), providers=onnxrt.get_available_providers()) a = np.array([[True, True], [False, False]], dtype=bool) - res = sess.run([], {"input:0": a}) + sess.run([], {"input:0": a}) self.assertTrue("Model requires 2 inputs" in str(context.exception)) - def testModelMeta(self): + def testModelMeta(self): # noqa: N802 model_path = "../models/opset8/test_squeezenet/model.onnx" if not os.path.exists(model_path): return @@ -765,7 +765,7 @@ def testModelMeta(self): self.assertEqual("", modelmeta.description) self.assertEqual("", modelmeta.graph_description) - def testProfilerWithSessionOptions(self): + def testProfilerWithSessionOptions(self): # noqa: N802 so = onnxrt.SessionOptions() so.enable_profiling = True sess = onnxrt.InferenceSession( @@ -786,8 +786,8 @@ def testProfilerWithSessionOptions(self): self.assertTrue(tag in lines[i]) self.assertTrue("]" in lines[-1]) - def testProfilerGetStartTimeNs(self): - def getSingleSessionProfilingStartTime(): + def testProfilerGetStartTimeNs(self): # noqa: N802 + def getSingleSessionProfilingStartTime(): # noqa: N802 so = onnxrt.SessionOptions() so.enable_profiling = True sess = onnxrt.InferenceSession( @@ -807,7 +807,7 @@ def getSingleSessionProfilingStartTime(): # Chronological profiling's start time self.assertTrue(start_time_1 <= start_time_2 <= start_time_3) - def testGraphOptimizationLevel(self): + def testGraphOptimizationLevel(self): # noqa: N802 opt = onnxrt.SessionOptions() # default should be all optimizations optimization self.assertEqual(opt.graph_optimization_level, onnxrt.GraphOptimizationLevel.ORT_ENABLE_ALL) @@ -820,9 +820,9 @@ def testGraphOptimizationLevel(self): a = np.array([[True, True], [False, False]], dtype=bool) b = np.array([[True, False], [True, False]], dtype=bool) - res = sess.run([], {"input1:0": a, "input:0": b}) + sess.run([], {"input1:0": a, "input:0": b}) - def testSequenceLength(self): + def testSequenceLength(self): # noqa: N802 sess = onnxrt.InferenceSession(get_name("sequence_length.onnx"), providers=available_providers_without_tvm) x = [ np.array([1.0, 0.0, 3.0, 44.0, 23.0, 11.0], dtype=np.float32).reshape((2, 3)), @@ -843,7 +843,7 
@@ def testSequenceLength(self): res = sess.run([output_name], {x_name: x}) self.assertEqual(output_expected, res[0]) - def testSequenceConstruct(self): + def testSequenceConstruct(self): # noqa: N802 sess = onnxrt.InferenceSession( get_name("sequence_construct.onnx"), providers=available_providers_without_tvm, @@ -875,7 +875,7 @@ def testSequenceConstruct(self): np.testing.assert_array_equal(output_expected, res[0]) - def testSequenceInsert(self): + def testSequenceInsert(self): # noqa: N802 opt = onnxrt.SessionOptions() opt.execution_mode = onnxrt.ExecutionMode.ORT_SEQUENTIAL sess = onnxrt.InferenceSession( @@ -905,13 +905,13 @@ def testSequenceInsert(self): ) np.testing.assert_array_equal(output_expected, res[0]) - def testOrtExecutionMode(self): + def testOrtExecutionMode(self): # noqa: N802 opt = onnxrt.SessionOptions() self.assertEqual(opt.execution_mode, onnxrt.ExecutionMode.ORT_SEQUENTIAL) opt.execution_mode = onnxrt.ExecutionMode.ORT_PARALLEL self.assertEqual(opt.execution_mode, onnxrt.ExecutionMode.ORT_PARALLEL) - def testLoadingSessionOptionsFromModel(self): + def testLoadingSessionOptionsFromModel(self): # noqa: N802 try: os.environ["ORT_LOAD_CONFIG_FROM_MODEL"] = str(1) sess = onnxrt.InferenceSession( @@ -942,7 +942,7 @@ def testLoadingSessionOptionsFromModel(self): # Make sure the usage of the feature is disabled after this test os.environ["ORT_LOAD_CONFIG_FROM_MODEL"] = str(0) - def testSessionOptionsAddFreeDimensionOverrideByDenotation(self): + def testSessionOptionsAddFreeDimensionOverrideByDenotation(self): # noqa: N802 so = onnxrt.SessionOptions() so.add_free_dimension_override_by_denotation("DATA_BATCH", 3) so.add_free_dimension_override_by_denotation("DATA_CHANNEL", 5) @@ -957,7 +957,7 @@ def testSessionOptionsAddFreeDimensionOverrideByDenotation(self): # Free dims with denotations - "DATA_BATCH" and "DATA_CHANNEL" have values assigned to them. self.assertEqual(input_shape, [3, 5, 5]) - def testSessionOptionsAddFreeDimensionOverrideByName(self): + def testSessionOptionsAddFreeDimensionOverrideByName(self): # noqa: N802 so = onnxrt.SessionOptions() so.add_free_dimension_override_by_name("Dim1", 4) so.add_free_dimension_override_by_name("Dim2", 6) @@ -972,14 +972,14 @@ def testSessionOptionsAddFreeDimensionOverrideByName(self): # "Dim1" and "Dim2" have values assigned to them. 
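The free-dimension override tests in this hunk pin symbolic input dimensions through SessionOptions before the session is created. A minimal sketch, where the model path and the dimension names are placeholders for whatever the model actually declares:

    import onnxruntime as ort

    so = ort.SessionOptions()
    so.add_free_dimension_override_by_name("Dim1", 4)               # by the symbolic dim's name
    so.add_free_dimension_override_by_denotation("DATA_BATCH", 3)   # or by its denotation
    sess = ort.InferenceSession("model_with_free_dims.onnx", sess_options=so,
                                providers=["CPUExecutionProvider"])
    print(sess.get_inputs()[0].shape)  # overridden dims now show concrete values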
self.assertEqual(input_shape, [4, 6, 5]) - def testSessionOptionsAddConfigEntry(self): + def testSessionOptionsAddConfigEntry(self): # noqa: N802 so = onnxrt.SessionOptions() key = "CONFIG_KEY" val = "CONFIG_VAL" so.add_session_config_entry(key, val) self.assertEqual(so.get_session_config_entry(key), val) - def testInvalidSessionOptionsConfigEntry(self): + def testInvalidSessionOptionsConfigEntry(self): # noqa: N802 so = onnxrt.SessionOptions() invalide_key = "INVALID_KEY" with self.assertRaises(RuntimeError) as context: @@ -988,7 +988,7 @@ def testInvalidSessionOptionsConfigEntry(self): "SessionOptions does not have configuration with key: " + invalide_key in str(context.exception) ) - def testSessionOptionsAddInitializer(self): + def testSessionOptionsAddInitializer(self): # noqa: N802 # Create an initializer and add it to a SessionOptions instance so = onnxrt.SessionOptions() # This initializer is different from the actual initializer in the model for "W" @@ -1014,7 +1014,7 @@ def testSessionOptionsAddInitializer(self): ) ) - def testSessionOptionsAddExternalInitializers(self): + def testSessionOptionsAddExternalInitializers(self): # noqa: N802 # Create an external initializer data in OrtValue # This initializer will replace the initializer with external data reference in the graph ortvalue_initializer = onnxrt.OrtValue.ortvalue_from_numpy(np.array([0, 0, 1, 1]).astype(np.int64)) @@ -1027,26 +1027,26 @@ def testSessionOptionsAddExternalInitializers(self): providers=["CPUExecutionProvider"], ) - def testRegisterCustomOpsLibrary(self): + def testRegisterCustomOpsLibrary(self): # noqa: N802 if sys.platform.startswith("win"): shared_library = "custom_op_library.dll" if not os.path.exists(shared_library): - raise FileNotFoundError("Unable to find '{0}'".format(shared_library)) + raise FileNotFoundError(f"Unable to find '{shared_library}'") elif sys.platform.startswith("darwin"): shared_library = "libcustom_op_library.dylib" if not os.path.exists(shared_library): - raise FileNotFoundError("Unable to find '{0}'".format(shared_library)) + raise FileNotFoundError(f"Unable to find '{shared_library}'") else: shared_library = "./libcustom_op_library.so" if not os.path.exists(shared_library): - raise FileNotFoundError("Unable to find '{0}'".format(shared_library)) + raise FileNotFoundError(f"Unable to find '{shared_library}'") this = os.path.dirname(__file__) custom_op_model = os.path.join(this, "testdata", "custom_op_library", "custom_op_test.onnx") if not os.path.exists(custom_op_model): - raise FileNotFoundError("Unable to find '{0}'".format(custom_op_model)) + raise FileNotFoundError(f"Unable to find '{custom_op_model}'") so1 = onnxrt.SessionOptions() so1.register_custom_ops_library(shared_library) @@ -1070,19 +1070,18 @@ def testRegisterCustomOpsLibrary(self): so2 = so1 # Model loading successfully indicates that the custom op node could be resolved successfully - sess2 = onnxrt.InferenceSession( + onnxrt.InferenceSession( custom_op_model, sess_options=so2, providers=available_providers_without_tvm_and_tensorrt ) # Create another SessionOptions instance with the same shared library referenced so3 = onnxrt.SessionOptions() so3.register_custom_ops_library(shared_library) - sess3 = onnxrt.InferenceSession( + onnxrt.InferenceSession( custom_op_model, sess_options=so3, providers=available_providers_without_tvm_and_tensorrt ) - def testOrtValue(self): - + def testOrtValue(self): # noqa: N802 numpy_arr_input = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) numpy_arr_output = 
np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) @@ -1120,7 +1119,7 @@ def test_session_with_ortvalue_input(ortvalue): # The constructed OrtValue should still be valid after being used in a session self.assertTrue(np.array_equal(ortvalue2.numpy(), numpy_arr_input)) - def testOrtValue_ghIssue9799(self): + def testOrtValue_ghIssue9799(self): # noqa: N802 if "CUDAExecutionProvider" in onnxrt.get_available_providers(): session = onnxrt.InferenceSession( get_name("identity_9799.onnx"), @@ -1134,7 +1133,7 @@ def testOrtValue_ghIssue9799(self): outs = session.run(output_names=["output"], input_feed=upstreams_onnxrt)[0] self.assertTrue(np.allclose(inps, outs)) - def testSparseTensorCooFormat(self): + def testSparseTensorCooFormat(self): # noqa: N802 cpu_device = onnxrt.OrtDevice.make("cpu", 0) shape = [9, 9] values = np.array([1.0, 2.0, 3.0], dtype=np.float32) @@ -1201,7 +1200,7 @@ def testSparseTensorCooFormat(self): with self.assertRaises(RuntimeError): sparse_tensor.to_cuda(cuda_device) - def testSparseTensorCsrFormat(self): + def testSparseTensorCsrFormat(self): # noqa: N802 cpu_device = onnxrt.OrtDevice.make("cpu", 0) shape = [9, 9] values = np.array([1.0, 2.0, 3.0], dtype=np.float32) @@ -1242,10 +1241,10 @@ def testSparseTensorCsrFormat(self): self.assertEqual(cuda_sparse_tensor.dense_shape(), shape) self.assertEqual(cuda_sparse_tensor.data_type(), "sparse_tensor(float)") - def testRunModelWithCudaCopyStream(self): + def testRunModelWithCudaCopyStream(self): # noqa: N802 available_providers = onnxrt.get_available_providers() - if not "CUDAExecutionProvider" in available_providers: + if "CUDAExecutionProvider" not in available_providers: print("Skipping testRunModelWithCudaCopyStream when CUDA is not available") else: # adapted from issue #4829 for a race condition when copy is not on default stream @@ -1261,10 +1260,10 @@ def testRunModelWithCudaCopyStream(self): session = onnxrt.InferenceSession(get_name("issue4829.onnx"), providers=providers) shape = np.array([2, 2], dtype=np.int64) - for iteration in range(100000): - result = session.run(output_names=["output"], input_feed={"shape": shape}) + for _iteration in range(100000): + session.run(output_names=["output"], input_feed={"shape": shape}) - def testSharedAllocatorUsingCreateAndRegisterAllocator(self): + def testSharedAllocatorUsingCreateAndRegisterAllocator(self): # noqa: N802 # Create and register an arena based allocator # To create an OrtArenaCfg using non-default parameters, use one of below templates: @@ -1298,7 +1297,7 @@ def testSharedAllocatorUsingCreateAndRegisterAllocator(self): providers=onnxrt.get_available_providers(), ) - def testMemoryArenaShrinkage(self): + def testMemoryArenaShrinkage(self): # noqa: N802 if platform.architecture()[0] == "32bit" or "ppc" in platform.machine() or "powerpc" in platform.machine(): # on x86 or ppc builds, the CPU allocator does not use an arena print("Skipping testMemoryArenaShrinkage in 32bit or powerpc platform.") @@ -1331,7 +1330,7 @@ def testMemoryArenaShrinkage(self): ) sess2.run([], {input_name: x}, ro2) - def testCheckAndNormalizeProviderArgs(self): + def testCheckAndNormalizeProviderArgs(self): # noqa: N802 from onnxruntime.capi.onnxruntime_inference_collection import check_and_normalize_provider_args valid_providers = ["a", "b", "c"] @@ -1383,7 +1382,7 @@ def check_failure(providers, provider_options): # provider options unsupported mixed specification check_failure([("a", {1: 2})], [{3: 4}]) - def testRegisterCustomEPsLibrary(self): + def 
testRegisterCustomEPsLibrary(self): # noqa: N802 from onnxruntime.capi import _pybind_state as C available_eps = C.get_available_providers() @@ -1401,12 +1400,12 @@ def testRegisterCustomEPsLibrary(self): shared_library = "./libtest_execution_provider.so" if not os.path.exists(shared_library): - raise FileNotFoundError("Unable to find '{0}'".format(shared_library)) + raise FileNotFoundError(f"Unable to find '{shared_library}'") this = os.path.dirname(__file__) custom_op_model = os.path.join(this, "testdata", "custom_execution_provider_library", "test_model.onnx") if not os.path.exists(custom_op_model): - raise FileNotFoundError("Unable to find '{0}'".format(custom_op_model)) + raise FileNotFoundError(f"Unable to find '{custom_op_model}'") session_options = C.get_default_session_options() sess = C.InferenceSession(session_options, custom_op_model, True, True) @@ -1423,7 +1422,7 @@ def testRegisterCustomEPsLibrary(self): ) print("Create session with customize execution provider successfully!") - def testCreateAllocator(self): + def testCreateAllocator(self): # noqa: N802 def verify_allocator(allocator, expected_config): for key, val in expected_config.items(): if key == "max_mem": diff --git a/onnxruntime/test/python/onnxruntime_test_python_azure.py b/onnxruntime/test/python/onnxruntime_test_python_azure.py index f10b4ec3e3e78..005d09c9d6cf6 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_azure.py +++ b/onnxruntime/test/python/onnxruntime_test_python_azure.py @@ -7,10 +7,8 @@ class TestAmlEndpoint(unittest.TestCase): - # test an endpoint of adding floats - def testAddf(self): - + def testAddf(self): # noqa: N802 sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-2930.westus2.inference.ml.azure.com") @@ -34,8 +32,7 @@ def testAddf(self): np.testing.assert_allclose(z, expected_z, rtol=1e-05, atol=1e-08) # test an endpoint of adding doubles - def testAddf8(self): - + def testAddf8(self): # noqa: N802 sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-1364.westus2.inference.ml.azure.com") @@ -59,8 +56,7 @@ def testAddf8(self): np.testing.assert_allclose(z, expected_z, rtol=1e-05, atol=1e-08) # test an endpoint of adding int - def testAddi4(self): - + def testAddi4(self): # noqa: N802 sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-9879.westus2.inference.ml.azure.com") @@ -84,8 +80,7 @@ def testAddi4(self): np.testing.assert_allclose(z, expected_z, rtol=1e-05, atol=1e-08) # test an endpoint of "And" - def testAnd(self): - + def testAnd(self): # noqa: N802 sess_opt = ort.SessionOptions() sess_opt.add_session_config_entry("azure.endpoint_type", "triton") sess_opt.add_session_config_entry("azure.uri", "https://endpoint-6811.westus2.inference.ml.azure.com") diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend.py b/onnxruntime/test/python/onnxruntime_test_python_backend.py index 26752d687f97c..b7fb95f834455 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend.py @@ -13,7 +13,7 @@ class TestBackend(unittest.TestCase): - def testRunModel(self): + def testRunModel(self): # noqa: N802 name = get_name("mul_1.onnx") rep = backend.prepare(name) x = np.array([[1.0, 
2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) @@ -21,11 +21,11 @@ def testRunModel(self): output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testAllocationPlanWorksWithOnlyExecutePathToFetchesOption(self): + def testAllocationPlanWorksWithOnlyExecutePathToFetchesOption(self): # noqa: N802 """ (inp0) (inp1) - | \/ | - | /\ | + | \\/ | + | /\\ | Add Sub | | (tsor0) (tsor1) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index b93cf865d4aa0..42103fbbe3bc7 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -10,7 +10,7 @@ import onnxruntime.backend as backend from onnxruntime import datasets -from onnxruntime.backend.backend import OnnxRuntimeBackend as ort_backend +from onnxruntime.backend.backend import OnnxRuntimeBackend as ort_backend # noqa: N813 def check_list_of_map_to_float(testcase, expected_rows, actual_rows): @@ -32,7 +32,7 @@ def check_list_of_map_to_float(testcase, expected_rows, actual_rows): class TestBackend(unittest.TestCase): - def testRunModelNonTensor(self): + def testRunModelNonTensor(self): # noqa: N802 name = get_name("pipeline_vectorize.onnx") rep = backend.prepare(name) x = {0: 25.0, 1: 5.13, 2: 0.0, 3: 0.453, 4: 5.966} @@ -40,7 +40,7 @@ def testRunModelNonTensor(self): output_expected = np.array([[49.752754]], dtype=np.float32) np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) - def testRunModelProto(self): + def testRunModelProto(self): # noqa: N802 name = datasets.get_example("logreg_iris.onnx") model = load(name) @@ -65,7 +65,7 @@ def testRunModelProto(self): check_list_of_map_to_float(self, output_expected, res[1]) - def testRunModelProtoApi(self): + def testRunModelProtoApi(self): # noqa: N802 name = datasets.get_example("logreg_iris.onnx") model = load(name) diff --git a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py index b71b3a07cd41f..5dd927a566e81 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py +++ b/onnxruntime/test/python/onnxruntime_test_python_cudagraph.py @@ -1,11 +1,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import gc
-import os
-import sys
-import threading
-import time
+import gc  # noqa: F401
+import os  # noqa: F401
+import sys  # noqa: F401
+import threading  # noqa: F401
+import time  # noqa: F401

 # -*- coding: UTF-8 -*-
 import unittest
@@ -14,11 +14,11 @@
 from helper import get_name

 import onnxruntime as onnxrt
-from onnxruntime.capi.onnxruntime_pybind11_state import Fail
+from onnxruntime.capi.onnxruntime_pybind11_state import Fail  # noqa: F401


 class TestInferenceSessionWithCudaGraph(unittest.TestCase):
-    def testOrtValueUpdateInPlace(self):
+    def testOrtValueUpdateInPlace(self):  # noqa: N802
         x0 = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
         ortvalue_cpu = onnxrt.OrtValue.ortvalue_from_numpy(x0)
         np.testing.assert_allclose(x0, ortvalue_cpu.numpy())
@@ -34,10 +34,10 @@ def testOrtValueUpdateInPlace(self):
             ortvalue_gpu.update_inplace(x1)
             np.testing.assert_allclose(x1, ortvalue_gpu.numpy())

-    def testRunModelWithCudaGraph(self):
+    def testRunModelWithCudaGraph(self):  # noqa: N802
         if "CUDAExecutionProvider" in onnxrt.get_available_providers():
             providers = [("CUDAExecutionProvider", {"enable_cuda_graph": True})]
-            INPUT_SIZE = 1280
+            INPUT_SIZE = 1280  # noqa: N806
             x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]] * INPUT_SIZE, dtype=np.float32)
             y = np.array([[0.0], [0.0], [0.0]] * INPUT_SIZE, dtype=np.float32)
             x_ortvalue = onnxrt.OrtValue.ortvalue_from_numpy(x, "cuda", 0)
diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
index ff1c0d17fd3ec..dece25cc8b8b9 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
@@ -43,7 +43,7 @@ def create_expected_output_alternate(self):
         return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32)

     def test_bind_input_to_cpu_arr(self):
-        input = self.create_numpy_input()
+        self.create_numpy_input()

         session = onnxrt.InferenceSession(get_name("mul_1.onnx"), providers=onnxrt.get_available_providers())
         io_binding = session.io_binding()
@@ -67,7 +67,6 @@ def test_bind_input_to_cpu_arr(self):
         self.assertTrue(np.array_equal(self.create_expected_output(), ort_output))

     def test_bind_input_types(self):
-
         opset = onnx_opset_version()
         devices = [
             (
@@ -99,12 +98,11 @@ def test_bind_input_types(self):
             np.bool_,
         ]:
             with self.subTest(dtype=dtype, device=str(device)):
-
                 x = np.arange(8).reshape((-1, 2)).astype(dtype)
                 proto_dtype = NP_TYPE_TO_TENSOR_TYPE[x.dtype]

-                X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]])
-                Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]])
+                X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]])  # noqa: N806
+                Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]])  # noqa: N806

                 # inference
                 node_add = helper.make_node("Identity", ["X"], ["Y"])
diff --git a/onnxruntime/test/python/onnxruntime_test_python_keras.py b/onnxruntime/test/python/onnxruntime_test_python_keras.py
index fb94f67757844..c24cb6954df98 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_keras.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_keras.py
@@ -16,12 +16,12 @@

 class ScaledTanh(Layer):
     def __init__(self, alpha=1.0, beta=1.0, **kwargs):
-        super(ScaledTanh, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.alpha = alpha
         self.beta = beta

     def build(self, input_shape):
-        super(ScaledTanh, self).build(input_shape)
+        super().build(input_shape)

     def call(self, x):
         return self.alpha * K.tanh(self.beta * x)
@@ -43,10 +43,9 @@ def custom_activation(scope, operator, container):


 class TestInferenceSessionKeras(unittest.TestCase):
-    def testRunModelConv(self):
-
+    def testRunModelConv(self):  # noqa: N802
         # keras model
-        N, C, H, W = 2, 3, 5, 5
+        N, C, H, W = 2, 3, 5, 5  # noqa: N806
         x = np.random.rand(N, H, W, C).astype(np.float32, copy=False)

         model = Sequential()
diff --git a/onnxruntime/test/python/onnxruntime_test_python_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_mlops.py
index b6604a6d51e8a..c9cb9bfbf58aa 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_mlops.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_mlops.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.

-import os
+import os  # noqa: F401

 # -*- coding: UTF-8 -*-
 import unittest
@@ -13,7 +13,7 @@


 class TestInferenceSession(unittest.TestCase):
-    def testZipMapStringFloat(self):
+    def testZipMapStringFloat(self):  # noqa: N802
         sess = onnxrt.InferenceSession(
             get_name("zipmap_stringfloat.onnx"),
             providers=onnxrt.get_available_providers(),
@@ -37,7 +37,7 @@ def testZipMapStringFloat(self):
         res = sess.run([output_name], {x_name: x})
         self.assertEqual(output_expected, res[0])

-    def testZipMapInt64Float(self):
+    def testZipMapInt64Float(self):  # noqa: N802
         sess = onnxrt.InferenceSession(
             get_name("zipmap_int64float.onnx"),
             providers=onnxrt.get_available_providers(),
@@ -58,7 +58,7 @@ def testZipMapInt64Float(self):
         res = sess.run([output_name], {x_name: x})
         self.assertEqual(output_expected, res[0])

-    def testDictVectorizer(self):
+    def testDictVectorizer(self):  # noqa: N802
         sess = onnxrt.InferenceSession(
             get_name("pipeline_vectorize.onnx"),
             providers=onnxrt.get_available_providers(),
@@ -108,7 +108,7 @@ def testDictVectorizer(self):
         output_expected = np.array([[49.752754]], dtype=np.float32)
         np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08)

-    def testLabelEncoder(self):
+    def testLabelEncoder(self):  # noqa: N802
         sess = onnxrt.InferenceSession(get_name("LabelEncoder.onnx"), providers=onnxrt.get_available_providers())
         input_name = sess.get_inputs()[0].name
         self.assertEqual(input_name, "input")
diff --git a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py
index d084902bbc2ac..4abe799ac89d7 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_sparse_matmul.py
@@ -14,7 +14,7 @@


 class TestSparseToDenseMatmul(unittest.TestCase):
-    def testRunSparseOutputOrtValueVector(self):
+    def testRunSparseOutputOrtValueVector(self):  # noqa: N802
         """
         Try running models using the new run_with_ort_values
         sparse_initializer_as_output.onnx - requires no inputs, but only one output
@@ -28,7 +28,7 @@ def testRunSparseOutputOrtValueVector(self):
         res = sess._sess.run_with_ort_values({}, ["values"], RunOptions())
         self.assertIsInstance(res, OrtValueVector)

-    def testRunSparseOutputOnly(self):
+    def testRunSparseOutputOnly(self):  # noqa: N802
         """
         Try running models using the new run_with_ort_values
         sparse_initializer_as_output.onnx - requires no inputs, but only one output
@@ -52,12 +52,12 @@ def testRunSparseOutputOnly(self):
         self.assertTrue(np.array_equal(values, sparse_output.values()))
         self.assertTrue(np.array_equal(indices, sparse_output.as_coo_view().indices()))

-    def testRunContribSparseMatMul(self):
+    def testRunContribSparseMatMul(self):  # noqa: N802
         """
         Mutliple
sparse COO tensor to dense """ common_shape = [9, 9] # inputs and oputputs same shape - A_values = np.array( + A_values = np.array( # noqa: N806 [ 1.0, 2.0, @@ -116,7 +116,7 @@ def testRunContribSparseMatMul(self): np.float32, ) # 2-D index - A_indices = np.array( + A_indices = np.array( # noqa: N806 [ 0, 1, @@ -230,9 +230,9 @@ def testRunContribSparseMatMul(self): cpu_device = onnxrt.OrtDevice.make("cpu", 0) sparse_tensor = onnxrt.SparseTensor.sparse_coo_from_numpy(common_shape, A_values, A_indices, cpu_device) - A_ort_value = onnxrt.OrtValue.ort_value_from_sparse_tensor(sparse_tensor) + A_ort_value = onnxrt.OrtValue.ort_value_from_sparse_tensor(sparse_tensor) # noqa: N806 - B_data = np.array( + B_data = np.array( # noqa: N806 [ 0, 1, @@ -318,9 +318,9 @@ def testRunContribSparseMatMul(self): ], np.float32, ).reshape(common_shape) - B_ort_value = onnxrt.OrtValue.ortvalue_from_numpy(B_data) + B_ort_value = onnxrt.OrtValue.ortvalue_from_numpy(B_data) # noqa: N806 - Y_result = np.array( + Y_result = np.array( # noqa: N806 [ 546, 561, diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index be63b1ba578c9..18fe6517daa80 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -5,7 +5,7 @@ # -*- coding: UTF-8 -*- import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper +from onnx import AttributeProto, GraphProto, TensorProto, helper # noqa: F401 if os.path.exists( os.path.join( @@ -106,7 +106,7 @@ def test_mismatched_types(self): class TestSymbolicShapeInferenceForOperators(unittest.TestCase): def _check_shapes(self, graph, inferred_graph, vis): # type: (GraphProto, GraphProto, List[ValueInfoProto]) -> None - names_in_vis = set(x.name for x in vis) + names_in_vis = {x.name for x in vis} vis = list(x for x in graph.value_info if x.name not in names_in_vis) + vis inferred_vis = list(inferred_graph.value_info) vis = list(sorted(vis, key=lambda x: x.name)) @@ -114,12 +114,12 @@ def _check_shapes(self, graph, inferred_graph, vis): # type: (GraphProto, Graph if vis == inferred_vis: return # otherwise some custom logic to give a nicer diff - vis_names = set(x.name for x in vis) - inferred_vis_names = set(x.name for x in inferred_vis) + vis_names = {x.name for x in vis} + inferred_vis_names = {x.name for x in inferred_vis} assert vis_names == inferred_vis_names, (vis_names, inferred_vis_names) for vi, inferred_vi in zip(vis, inferred_vis): - assert vi == inferred_vi, "\n%s\n%s\n" % (vi, inferred_vi) - assert False + assert vi == inferred_vi, f"\n{vi}\n{inferred_vi}\n" + raise AssertionError() def test_unsqueeze_opset_11(self): graph = helper.make_graph( diff --git a/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py b/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py index f89459cfd9750..4f322478673ae 100644 --- a/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py +++ b/onnxruntime/test/python/onnxruntime_test_training_unit_tests.py @@ -17,7 +17,7 @@ class TestTrainingDropout(unittest.TestCase): - def testTrainingAndEvalDropout(self): + def testTrainingAndEvalDropout(self): # noqa: N802 # Temporarily disable this test. # The graph below will trigger ORT # to sort backward graph before forward graph which gives incorrect result. 
@@ -26,7 +26,7 @@ def testTrainingAndEvalDropout(self): class TwoDropoutNet(nn.Module): def __init__(self, drop_prb_1, drop_prb_2, dim_size): - super(TwoDropoutNet, self).__init__() + super().__init__() self.drop_1 = nn.Dropout(drop_prb_1) self.drop_2 = nn.Dropout(drop_prb_2) self.weight_1 = torch.nn.Parameter(torch.zeros(dim_size, dtype=torch.float32)) diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py index 3681c5f492288..81506caab1b19 100644 --- a/onnxruntime/test/python/quantization/op_test_utils.py +++ b/onnxruntime/test/python/quantization/op_test_utils.py @@ -23,12 +23,12 @@ def rewind(self): self.iter_next = iter(self.data_feeds) -def InputFeedsNegOneZeroOne(n, name2shape): +def InputFeedsNegOneZeroOne(n, name2shape): # noqa: N802 """ randomize n feed according to shape, its values are from -1, 0, and 1 """ input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -48,7 +48,7 @@ def check_op_type_order(testcase, model_to_check, ops): testcase.assertEqual( ops[node_idx], node.op_type, - "op {} is not in order. Expected: {}, Actual: {}".format(node_idx, ops[node_idx], node.op_type), + f"op {node_idx} is not in order. Expected: {ops[node_idx]}, Actual: {node.op_type}", ) @@ -64,7 +64,7 @@ def check_op_type_count(testcase, model_path, **kwargs): testcase.assertEqual( kwargs[op_type], optype2count[op_type], - "op_type {} count not same".format(op_type), + f"op_type {op_type} count not same", ) diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py index e88863a5418a8..14be6fa45c8c1 100644 --- a/onnxruntime/test/python/quantization/test_calibration.py +++ b/onnxruntime/test/python/quantization/test_calibration.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -355,7 +354,7 @@ def test_compute_range(self): min_max_pairs = list(zip(rmin, rmax)) output_names = [infer_session.get_outputs()[i].name for i in range(len(infer_session.get_outputs()))] output_min_max_dict = dict(zip(output_names, min_max_pairs)) - for output_name in output_min_max_dict.keys(): + for output_name in output_min_max_dict: self.assertEqual(output_min_max_dict[output_name], tensors_range[output_name]) def test_augment_graph_with_zero_value_dimension(self): diff --git a/onnxruntime/test/python/quantization/test_conv_dynamic.py b/onnxruntime/test/python/quantization/test_conv_dynamic.py index 045bddccbfbb2..9578c9fe708aa 100644 --- a/onnxruntime/test/python/quantization/test_conv_dynamic.py +++ b/onnxruntime/test/python/quantization/test_conv_dynamic.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -11,13 +10,9 @@ import numpy as np import onnx from onnx import TensorProto, helper, numpy_helper -from op_test_utils import ( - TestDataFeeds, - check_model_correctness, - check_op_type_count, - check_op_type_order, - check_qtype_by_node_type, -) +from op_test_utils import TestDataFeeds # noqa: F401 +from op_test_utils import check_op_type_order # noqa: F401 +from op_test_utils import check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import DynamicQuantConfig, QuantType, quantize, quantize_dynamic @@ -65,7 +60,7 @@ def construct_model(self, model_path): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) onnx.save(model, model_path) - def dynamic_quant_conv_test(self, weight_type, extra_options={}, use_quant_config=False): + def dynamic_quant_conv_test(self, weight_type, extra_options={}, use_quant_config=False): # noqa: B006 np.random.seed(1) model_fp32_path = "conv_bias.fp32.onnx" self.construct_model(model_fp32_path) @@ -73,7 +68,7 @@ def dynamic_quant_conv_test(self, weight_type, extra_options={}, use_quant_confi activation_proto_qtype = TensorProto.UINT8 activation_type_str = "u8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_int8_path = "conv_bias.quant.{}{}.onnx".format(activation_type_str, weight_type_str) + model_int8_path = f"conv_bias.quant.{activation_type_str}{weight_type_str}.onnx" if use_quant_config: quant_config = DynamicQuantConfig(weight_type=weight_type, extra_options=extra_options) diff --git a/onnxruntime/test/python/quantization/test_onnx_model.py b/onnxruntime/test/python/quantization/test_onnx_model.py index fc29810e9b97d..59f408b1eef5c 100644 --- a/onnxruntime/test/python/quantization/test_onnx_model.py +++ b/onnxruntime/test/python/quantization/test_onnx_model.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -11,9 +10,9 @@ import numpy as np import onnx from onnx import TensorProto, helper, numpy_helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order # noqa: F401 -import onnxruntime +import onnxruntime # noqa: F401 from onnxruntime.quantization.onnx_model import ONNXModel @@ -74,7 +73,7 @@ def construct_model(self, model_path): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) onnx.save(model, model_path) - def construct_model_Constant(self, model_path): + def construct_model_Constant(self, model_path): # noqa: N802 # (input) Constant # \ / # \ / diff --git a/onnxruntime/test/python/quantization/test_op_argmax.py b/onnxruntime/test/python/quantization/test_op_argmax.py index e73bb9093d99c..eaaff90fbe4ce 100644 --- a/onnxruntime/test/python/quantization/test_op_argmax.py +++ b/onnxruntime/test/python/quantization/test_op_argmax.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -25,7 +24,7 @@ class TestOpArgMax(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -95,7 +94,7 @@ def construct_model_argmax(self, output_model_path, input_shape, output_shape): onnx.save(model, output_model_path) - def quantize_argmax_test(self, activation_type, weight_type, extra_options={}): + def quantize_argmax_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "argmax_fp32.onnx" @@ -104,9 +103,9 @@ def quantize_argmax_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_uint8_path = "argmax_{}{}.onnx".format(activation_type_str, weight_type_str) - model_uint8_qdq_path = "argmax_{}{}_qdq.onnx".format(activation_type_str, weight_type_str) - model_uint8_qdq_trt_path = "argmax_{}{}_qdq_trt.onnx".format(activation_type_str, weight_type_str) + model_uint8_path = f"argmax_{activation_type_str}{weight_type_str}.onnx" + model_uint8_qdq_path = f"argmax_{activation_type_str}{weight_type_str}_qdq.onnx" + model_uint8_qdq_trt_path = f"argmax_{activation_type_str}{weight_type_str}_qdq_trt.onnx" # Verify QOperator mode data_reader = self.input_feeds(1, {"input": [1, 256, 128, 128]}) diff --git a/onnxruntime/test/python/quantization/test_op_attention.py b/onnxruntime/test/python/quantization/test_op_attention.py index 47bc342aad58f..2ed1d2a678adf 100644 --- a/onnxruntime/test/python/quantization/test_op_attention.py +++ b/onnxruntime/test/python/quantization/test_op_attention.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -19,7 +18,7 @@ class TestOpAttention(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) diff --git a/onnxruntime/test/python/quantization/test_op_concat.py b/onnxruntime/test/python/quantization/test_op_concat.py index 9c292cd667497..1c2a1fa44defc 100644 --- a/onnxruntime/test/python/quantization/test_op_concat.py +++ b/onnxruntime/test/python/quantization/test_op_concat.py @@ -87,7 +87,7 @@ def construct_model(self, model_path): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) save(model, model_path) - def quantize_concat_test(self, activation_type, weight_type, extra_options={}): + def quantize_concat_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "concat_fp32.onnx" self.construct_model(model_fp32_path) @@ -96,8 +96,8 @@ def quantize_concat_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_q8_path = "concat_{}{}.onnx".format(activation_type_str, weight_type_str) - model_q8_qdq_path = "concat_{}{}_qdq.onnx".format(activation_type_str, weight_type_str) + model_q8_path = f"concat_{activation_type_str}{weight_type_str}.onnx" + model_q8_qdq_path = f"concat_{activation_type_str}{weight_type_str}_qdq.onnx" # Verify QOperator mode data_reader.rewind() diff --git a/onnxruntime/test/python/quantization/test_op_embed_layernorm.py b/onnxruntime/test/python/quantization/test_op_embed_layernorm.py index 297d1c1af6c06..c68dac39b013a 100644 --- a/onnxruntime/test/python/quantization/test_op_embed_layernorm.py +++ b/onnxruntime/test/python/quantization/test_op_embed_layernorm.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -19,7 +18,7 @@ class TestOpEmbedLayerNormalization(unittest.TestCase): def input_feeds_int32(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.ones(shape).astype(np.int32)}) diff --git a/onnxruntime/test/python/quantization/test_op_gavgpool.py b/onnxruntime/test/python/quantization/test_op_gavgpool.py index a34c52f912ced..aa7a1833dd5c2 100644 --- a/onnxruntime/test/python/quantization/test_op_gavgpool.py +++ b/onnxruntime/test/python/quantization/test_op_gavgpool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -13,13 +12,13 @@ from onnx import TensorProto, helper from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type -from onnxruntime.quantization import QuantFormat, QuantType, quantize_dynamic, quantize_static +from onnxruntime.quantization import QuantFormat, QuantType, quantize_dynamic, quantize_static # noqa: F401 class TestOpGlobalAveragePool(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -80,7 +79,7 @@ def construct_model_gavgpool(self, output_model_path, input_shape, weight_shape, onnx.save(model, output_model_path) - def quantize_gavgpool_test(self, activation_type, weight_type, extra_options={}): + def quantize_gavgpool_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "gavg_pool_fp32.onnx" data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]}) @@ -89,7 +88,7 @@ def quantize_gavgpool_test(self, activation_type, weight_type, extra_options={}) activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_q8_path = "gavg_pool_{}{}.onnx".format(activation_type_str, weight_type_str) + model_q8_path = f"gavg_pool_{activation_type_str}{weight_type_str}.onnx" data_reader.rewind() quantize_static( diff --git a/onnxruntime/test/python/quantization/test_op_gemm.py b/onnxruntime/test/python/quantization/test_op_gemm.py index 7475ffb47003e..89bc0fe19747d 100644 --- a/onnxruntime/test/python/quantization/test_op_gemm.py +++ b/onnxruntime/test/python/quantization/test_op_gemm.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -19,7 +18,7 @@ class TestOpGemm(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -110,12 +109,12 @@ def static_quant_test( data_reader, activation_type, weight_type, - extra_options={}, + extra_options={}, # noqa: B006 ): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_int8_path = "gemm_fp32.quant_{}{}.onnx".format(activation_type_str, weight_type_str) + model_int8_path = f"gemm_fp32.quant_{activation_type_str}{weight_type_str}.onnx" data_reader.rewind() quantize_static( @@ -149,12 +148,12 @@ def static_quant_test_qdq( data_reader, activation_type, weight_type, - extra_options={}, + extra_options={}, # noqa: B006 ): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_int8_path = "gemm_fp32.quant_dqd_{}{}.onnx".format(activation_type_str, weight_type_str) + model_int8_path = f"gemm_fp32.quant_dqd_{activation_type_str}{weight_type_str}.onnx" data_reader.rewind() quantize_static( @@ -188,12 +187,12 @@ def dynamic_quant_test( data_reader, activation_type, weight_type, - extra_options={}, + extra_options={}, # noqa: B006 ): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_int8_path = "gemm_fp32.quant_dynamic_{}{}.onnx".format(activation_type_str, weight_type_str) + model_int8_path = f"gemm_fp32.quant_dynamic_{activation_type_str}{weight_type_str}.onnx" quantize_dynamic( model_fp32_path, diff --git a/onnxruntime/test/python/quantization/test_op_instance_normalization.py b/onnxruntime/test/python/quantization/test_op_instance_normalization.py index 194516eb564af..53a9bc093f246 100644 --- a/onnxruntime/test/python/quantization/test_op_instance_normalization.py +++ b/onnxruntime/test/python/quantization/test_op_instance_normalization.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for diff --git a/onnxruntime/test/python/quantization/test_op_maxpool.py b/onnxruntime/test/python/quantization/test_op_maxpool.py index 043aa2bca48bf..aeeb862bc08d2 100644 --- a/onnxruntime/test/python/quantization/test_op_maxpool.py +++ b/onnxruntime/test/python/quantization/test_op_maxpool.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -25,7 +24,7 @@ class TestOpMaxPool(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -76,7 +75,7 @@ def construct_model_conv_maxpool( model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def quantize_maxpool_test(self, activation_type, weight_type, extra_options={}): + def quantize_maxpool_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "maxpool_fp32.onnx" self.construct_model_conv_maxpool( @@ -92,8 +91,8 @@ def quantize_maxpool_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_q8_path = "maxpool_{}{}.onnx".format(activation_type_str, weight_type_str) - model_q8_qdq_path = "maxpool_dqd_{}{}.onnx".format(activation_type_str, weight_type_str) + model_q8_path = f"maxpool_{activation_type_str}{weight_type_str}.onnx" + model_q8_qdq_path = f"maxpool_dqd_{activation_type_str}{weight_type_str}.onnx" # Verify QOperator mode data_reader.rewind() diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py index d3680ea9f273e..c413dedbef051 100644 --- a/onnxruntime/test/python/quantization/test_op_pad.py +++ b/onnxruntime/test/python/quantization/test_op_pad.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -19,7 +18,7 @@ class TestOpQuatizerPad(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -128,7 +127,7 @@ def quantize_model( data_reader=None, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, - extra_options={}, + extra_options={}, # noqa: B006 ): if data_reader is not None: quantize_static( @@ -152,8 +151,8 @@ def quantize_model( def verify_should_not_trigger(self, quantize_mode="static"): np.random.seed(108) - model_fp32_path = "qop_pad_notrigger_fp32_{}.onnx".format(quantize_mode) - model_i8_path = "qop_pad_notrigger_i8_{}.onnx".format(quantize_mode) + model_fp32_path = f"qop_pad_notrigger_fp32_{quantize_mode}.onnx" + model_i8_path = f"qop_pad_notrigger_i8_{quantize_mode}.onnx" data_reader = self.input_feeds(1, {"input": [1, 16, 31, 31]}) self.construct_model_pad(model_fp32_path, "constant", [1, 16, 31, 31], [0, 0, 1, 2, 0, 0, 3, 4]) self.quantize_model( @@ -194,7 +193,7 @@ def verify_quantize_with_pad_mode( np.random.seed(108) tag_pad_mode = pad_mode if pad_mode is not None else "none" tag_constant_value = "" if constant_value is None else "_value" - model_fp32_path = "qop_pad_{}_fp32_{}{}.onnx".format(quantize_mode, tag_pad_mode, tag_constant_value) + model_fp32_path = f"qop_pad_{quantize_mode}_fp32_{tag_pad_mode}{tag_constant_value}.onnx" edge_case = "dual_feed" in extra_options and extra_options["dual_feed"] and constant_value is not None if edge_case: data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33], "padding_value": [1]}) diff --git a/onnxruntime/test/python/quantization/test_op_pooling.py b/onnxruntime/test/python/quantization/test_op_pooling.py index b0561bd79f8e1..539affc314ce9 100644 --- a/onnxruntime/test/python/quantization/test_op_pooling.py +++ b/onnxruntime/test/python/quantization/test_op_pooling.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -11,13 +10,8 @@ import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import ( - TestDataFeeds, - check_model_correctness, - check_op_nodes, - check_op_type_count, - check_qtype_by_node_type, -) +from op_test_utils import check_op_nodes # noqa: F401 +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import QuantFormat, QuantType, quantize_static @@ -25,7 +19,7 @@ class TestOpAveragePool(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -76,7 +70,7 @@ def construct_model_conv_avgpool( model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def quantize_avgpool_test(self, activation_type, weight_type, extra_options={}): + def quantize_avgpool_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "avgpool_fp32.onnx" self.construct_model_conv_avgpool( @@ -92,8 +86,8 @@ def quantize_avgpool_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_q8_path = "avgpool_{}{}.onnx".format(activation_type_str, weight_type_str) - model_q8_qdq_path = "avgpool_qdq_{}{}.onnx".format(activation_type_str, weight_type_str) + model_q8_path = f"avgpool_{activation_type_str}{weight_type_str}.onnx" + model_q8_qdq_path = f"avgpool_qdq_{activation_type_str}{weight_type_str}.onnx" # Verify QOperator mode data_reader.rewind() diff --git a/onnxruntime/test/python/quantization/test_op_relu.py b/onnxruntime/test/python/quantization/test_op_relu.py index bf72b96508592..36a6a3dc7bc53 100644 --- a/onnxruntime/test/python/quantization/test_op_relu.py +++ b/onnxruntime/test/python/quantization/test_op_relu.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -19,7 +18,7 @@ class TestOpRelu(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -104,12 +103,12 @@ def static_quant_test( data_reader, activation_type, weight_type, - extra_options={}, + extra_options={}, # noqa: B006 ): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_int8_path = "relu_fp32.quant_{}{}.onnx".format(activation_type_str, weight_type_str) + model_int8_path = f"relu_fp32.quant_{activation_type_str}{weight_type_str}.onnx" data_reader.rewind() quantize_static( @@ -143,12 +142,12 @@ def static_quant_test_qdq( data_reader, activation_type, weight_type, - extra_options={}, + extra_options={}, # noqa: B006 ): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_int8_path = "relu_fp32.quant_dqd_{}{}.onnx".format(activation_type_str, weight_type_str) + model_int8_path = f"relu_fp32.quant_dqd_{activation_type_str}{weight_type_str}.onnx" data_reader.rewind() quantize_static( diff --git a/onnxruntime/test/python/quantization/test_op_reshape.py b/onnxruntime/test/python/quantization/test_op_reshape.py index a415de92e14b5..d843b1e205c69 100644 --- a/onnxruntime/test/python/quantization/test_op_reshape.py +++ b/onnxruntime/test/python/quantization/test_op_reshape.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -25,7 +24,7 @@ class TestOpReshape(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -80,7 +79,7 @@ def construct_model_matmul_reshape(self, output_model_path, input_shape, weight_ onnx.save(model, output_model_path) - def quantize_reshape_test(self, activation_type, weight_type, extra_options={}): + def quantize_reshape_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "reshape_fp32.onnx" @@ -89,8 +88,8 @@ def quantize_reshape_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_uint8_path = "reshape_{}{}.onnx".format(activation_type_str, weight_type_str) - model_uint8_qdq_path = "reshape_{}{}_qdq.onnx".format(activation_type_str, weight_type_str) + model_uint8_path = f"reshape_{activation_type_str}{weight_type_str}.onnx" + model_uint8_qdq_path = f"reshape_{activation_type_str}{weight_type_str}_qdq.onnx" # Verify QOperator mode data_reader = self.input_feeds(1, {"input": [3, 7]}) diff --git a/onnxruntime/test/python/quantization/test_op_resize.py b/onnxruntime/test/python/quantization/test_op_resize.py index 1f9703b26f0fe..dbf64bd36ccb4 100644 --- a/onnxruntime/test/python/quantization/test_op_resize.py +++ b/onnxruntime/test/python/quantization/test_op_resize.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -25,7 +24,7 @@ class TestOpResize(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -109,7 +108,7 @@ def construct_model_conv_resize( model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def quantize_resize_test(self, activation_type, weight_type, extra_options={}): + def quantize_resize_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "resize_fp32.onnx" @@ -133,8 +132,8 @@ def quantize_resize_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_uint8_path = "resize_{}{}.onnx".format(activation_type_str, weight_type_str) - model_uint8_qdq_path = "resize_{}{}_qdq.onnx".format(activation_type_str, weight_type_str) + model_uint8_path = f"resize_{activation_type_str}{weight_type_str}.onnx" + model_uint8_qdq_path = f"resize_{activation_type_str}{weight_type_str}_qdq.onnx" # Verify QOperator mode data_reader = self.input_feeds(1, {"input": [1, 2, 26, 42]}) diff --git a/onnxruntime/test/python/quantization/test_op_softmax.py b/onnxruntime/test/python/quantization/test_op_softmax.py index e8fb00a312084..8e6e4d4100348 100644 --- a/onnxruntime/test/python/quantization/test_op_softmax.py +++ b/onnxruntime/test/python/quantization/test_op_softmax.py @@ -78,7 +78,7 @@ def construct_model_conv_softmax( model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def quantize_softmax_test(self, activation_type, weight_type, extra_options={}): + def quantize_softmax_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "softmax_fp32.onnx" self.construct_model_conv_softmax( @@ -163,9 +163,9 @@ def quantize_softmax_test(self, activation_type, weight_type, extra_options={}): dqnode_cnt += 1 elif node.op_type == "Softmax": softmax_cnt += 1 - self.assertEqual(3, qnode_cnt, "Expected 3 QuantizeLinear nodes, found {}".format(qnode_cnt)) - self.assertEqual(4, dqnode_cnt, "Expected 4 DequantizeLinear nodes, found {}".format(dqnode_cnt)) - self.assertEqual(1, softmax_cnt, "Expected 1 Softmax node, found {}".format(softmax_cnt)) + self.assertEqual(3, qnode_cnt, f"Expected 3 QuantizeLinear nodes, found {qnode_cnt}") + self.assertEqual(4, dqnode_cnt, f"Expected 4 DequantizeLinear nodes, found {dqnode_cnt}") + self.assertEqual(1, softmax_cnt, f"Expected 1 Softmax node, found {softmax_cnt}") if extra_options.get("ActivationSymmetric", False): for tensor in result_model.graph.initializer: if tensor.name in qnode_zeropoints: @@ -173,7 +173,7 @@ def quantize_softmax_test(self, activation_type, weight_type, extra_options={}): self.assertEqual( 0, np_value, - "QuantizeLinear node zero point value must be 0, found {} instead!".format(np_value), + f"QuantizeLinear node zero point value must be 0, found {np_value} instead!", ) qnode_io_qtypes = { diff --git a/onnxruntime/test/python/quantization/test_op_split.py b/onnxruntime/test/python/quantization/test_op_split.py index e6b446fbef3ec..4a81f134f235d 100644 --- 
a/onnxruntime/test/python/quantization/test_op_split.py +++ b/onnxruntime/test/python/quantization/test_op_split.py @@ -74,7 +74,7 @@ def construct_model(self, model_path): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) save(model, model_path) - def quantize_split_test(self, activation_type, weight_type, extra_options={}): + def quantize_split_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "split_fp32.onnx" self.construct_model(model_fp32_path) @@ -83,8 +83,8 @@ def quantize_split_test(self, activation_type, weight_type, extra_options={}): activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_uint8_path = "split_{}{}.onnx".format(activation_type_str, weight_type_str) - model_uint8_qdq_path = "split_{}{}_qdq.onnx".format(activation_type_str, weight_type_str) + model_uint8_path = f"split_{activation_type_str}{weight_type_str}.onnx" + model_uint8_qdq_path = f"split_{activation_type_str}{weight_type_str}_qdq.onnx" # Verify QOperator mode data_reader.rewind() diff --git a/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py b/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py index 6794ef4acc787..9358e3755cb2e 100644 --- a/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py +++ b/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -19,7 +18,7 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -149,18 +148,18 @@ def run_quantize_squeezes_of_opset( opset=13, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, - extra_options={}, + extra_options={}, # noqa: B006 ): np.random.seed(1) - model_fp32_path = "squeezes_opset{}_fp32.onnx".format(opset) + model_fp32_path = f"squeezes_opset{opset}_fp32.onnx" self.construct_model_conv_squeezes(model_fp32_path, [1, 2, 26, 42], [3, 2, 3, 3], [1, 3, 24, 40], opset=opset) activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_uint8_path = "squeezes_opset{}_{}{}.onnx".format(opset, activation_type_str, weight_type_str) - model_uint8_qdq_path = "squeezes_opset{}_{}{}_qdq.onnx".format(opset, activation_type_str, weight_type_str) + model_uint8_path = f"squeezes_opset{opset}_{activation_type_str}{weight_type_str}.onnx" + model_uint8_qdq_path = f"squeezes_opset{opset}_{activation_type_str}{weight_type_str}_qdq.onnx" # Verify QOperator mode data_reader = self.input_feeds(1, {"input": [1, 2, 26, 42]}) diff --git a/onnxruntime/test/python/quantization/test_op_transpose.py b/onnxruntime/test/python/quantization/test_op_transpose.py index e34df1072976d..392451cf64611 100644 --- a/onnxruntime/test/python/quantization/test_op_transpose.py +++ b/onnxruntime/test/python/quantization/test_op_transpose.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -25,7 +24,7 @@ class TestOpTranspose(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -77,7 +76,7 @@ def construct_model_matmul_transpose(self, output_model_path, input_shape, weigh onnx.save(model, output_model_path) - def quantize_transpose_test(self, activation_type, weight_type, extra_options={}): + def quantize_transpose_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 np.random.seed(1) model_fp32_path = "transpose_fp32.onnx" self.construct_model_matmul_transpose(model_fp32_path, [3, 7], [7, 5], [5, 3]) @@ -85,8 +84,8 @@ def quantize_transpose_test(self, activation_type, weight_type, extra_options={} activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 activation_type_str = "u8" if (activation_type == QuantType.QUInt8) else "s8" weight_type_str = "u8" if (weight_type == QuantType.QUInt8) else "s8" - model_uint8_path = "transpose_{}{}.onnx".format(activation_type_str, weight_type_str) - model_uint8_qdq_path = "transpose_{}{}_qdq.onnx".format(activation_type_str, weight_type_str) + model_uint8_path = f"transpose_{activation_type_str}{weight_type_str}.onnx" + model_uint8_qdq_path = f"transpose_{activation_type_str}{weight_type_str}_qdq.onnx" # Verify QOperator model data_reader = self.input_feeds(1, {"input": [3, 7]}) diff --git a/onnxruntime/test/python/quantization/test_op_where.py b/onnxruntime/test/python/quantization/test_op_where.py index 43d6fe4fd442a..400594c780193 100644 --- a/onnxruntime/test/python/quantization/test_op_where.py +++ b/onnxruntime/test/python/quantization/test_op_where.py @@ -17,7 +17,7 @@ class TestWhereModel(unittest.TestCase): @staticmethod def input_feeds_for_where(n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): if name == "condition": @@ -53,8 +53,7 @@ def construct_model(model_path, input_shape): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 16)]) save(model, model_path) - def quantize_where_test(self, activation_type, weight_type, extra_options={}): - + def quantize_where_test(self, activation_type, weight_type, extra_options={}): # noqa: B006 model_fp32_path = "where_fp32.onnx" input_shape = [2, 2] self.construct_model(model_fp32_path, input_shape) @@ -146,7 +145,7 @@ def test_quantize_where_u8u8(self): self.quantize_where_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={"ForceQuantizeNoInputCheck": True}) print(__name__) - def test_quantize_where_u8u8_no_ForceQuantizeNoInputCheck(self): + def test_quantize_where_u8u8_no_ForceQuantizeNoInputCheck(self): # noqa: N802 self.quantize_where_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={"ForceQuantizeNoInputCheck": False}) print(__name__) diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index c50fb9870b12d..f9f90998c8dc0 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -27,7 +26,7 @@ class TestQDQFormat(unittest.TestCase): def input_feeds(self, n, name2shape): input_data_list = [] - for i in range(n): + for _i in range(n): inputs = {} for name, shape in name2shape.items(): inputs.update({name: np.random.randint(-1, 2, shape).astype(np.float32)}) @@ -287,9 +286,9 @@ def construct_model_conv(self, output_model_path, input_shape, weight_shape, out def verify_quantize_conv(self, has_bias, per_channel, is_quant_type_int8=False): np.random.seed(1) - model_fp32_path = "conv_fp32.{}.{}.onnx".format(has_bias, per_channel) - model_int8_qdq_path = "conv_quant_qdq.{}.{}.onnx".format(has_bias, per_channel) - model_int8_qop_path = "conv_quant_qop.{}.{}.onnx".format(has_bias, per_channel) + model_fp32_path = f"conv_fp32.{has_bias}.{per_channel}.onnx" + model_int8_qdq_path = f"conv_quant_qdq.{has_bias}.{per_channel}.onnx" + model_int8_qop_path = f"conv_quant_qop.{has_bias}.{per_channel}.onnx" channel_count = 16 data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]}) self.construct_model_conv(model_fp32_path, [1, 8, 33, 33], [channel_count, 8, 3, 3], [1, 16, 31, 31], has_bias) @@ -396,9 +395,9 @@ def construct_model_conv_clip(self, output_model_path, input_shape, weight_shape def verify(self, per_channel, is_quant_type_int8): np.random.seed(1) - model_fp32_path = "conv_clip_fp32.{}.onnx".format(per_channel) - model_int8_qdq_path = "conv_clip_quant_qdq.{}.onnx".format(per_channel) - model_int8_qop_path = "conv_clip_quant_qop.{}.onnx".format(per_channel) + model_fp32_path = f"conv_clip_fp32.{per_channel}.onnx" + model_int8_qdq_path = f"conv_clip_quant_qdq.{per_channel}.onnx" + model_int8_qop_path = f"conv_clip_quant_qop.{per_channel}.onnx" data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]}) self.construct_model_conv_clip(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [15376]) quantize_static( @@ -477,7 +476,7 @@ def construct_relu_conv_model(test_model_path: str) -> None: Relu2 Conv3 | | Conv2 | - \ / + \\ / Add | (AddOut) @@ -566,9 +565,9 @@ def construct_model_conv_relu(self, output_model_path, input_shape, weight_shape def verify(self, per_channel, is_quant_type_int8): np.random.seed(1) - model_fp32_path = str(Path(self._tmp_model_dir.name) / "conv_relu_fp32.{}.onnx".format(per_channel)) - model_int8_qdq_path = str(Path(self._tmp_model_dir.name) / "conv_relu_quant_qdq.{}.onnx".format(per_channel)) - model_int8_qop_path = str(Path(self._tmp_model_dir.name) / "conv_relu_quant_qop.{}.onnx".format(per_channel)) + model_fp32_path = str(Path(self._tmp_model_dir.name) / f"conv_relu_fp32.{per_channel}.onnx") + model_int8_qdq_path = str(Path(self._tmp_model_dir.name) / f"conv_relu_quant_qdq.{per_channel}.onnx") + model_int8_qop_path = str(Path(self._tmp_model_dir.name) / f"conv_relu_quant_qop.{per_channel}.onnx") data_reader = self.input_feeds(1, {"input": [1, 8, 33, 33]}) self.construct_model_conv_relu(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31]) quantize_static( diff --git a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py index 5a26cd36115f9..a3f7d29002648 100644 --- a/onnxruntime/test/python/quantization/test_qdq_loss_debug.py +++ b/onnxruntime/test/python/quantization/test_qdq_loss_debug.py @@ -42,7 +42,7 @@ def construct_test_model1(test_model_path: str, activations_as_outputs=False): Relu2 Conv3 | | Conv2 | - \ / + \\ / Add | (AddOut) @@ -87,7 +87,7 @@ def construct_test_model1(test_model_path: str, 
activations_as_outputs=False): class TestDataReader(CalibrationDataReader): """Random Data Input Generator""" - def __init__(self, input_shape=[1, 3, 1, 3]): + def __init__(self, input_shape=[1, 3, 1, 3]): # noqa: B006 self.preprocess_flag = True self.enum_data_dicts = [] self.count = 2 @@ -258,7 +258,6 @@ def test_create_weight_matching(self): ) def test_create_weight_matching_per_channel(self): - # float model # (input) # | diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 5a9a5a5029658..9823f8f3d2770 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for diff --git a/onnxruntime/test/python/quantization/test_quantize_static.py b/onnxruntime/test/python/quantization/test_quantize_static.py index 68dc2b6049536..1fb7ad2e9efa4 100644 --- a/onnxruntime/test/python/quantization/test_quantize_static.py +++ b/onnxruntime/test/python/quantization/test_quantize_static.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -29,7 +28,7 @@ def construct_test_model(test_model_path, channel_size): Relu Conv3 | | Conv2 | - \ / + \\ / Add | output diff --git a/onnxruntime/test/python/quantization/test_symmetric_flag.py b/onnxruntime/test/python/quantization/test_symmetric_flag.py index 26f7ba6ce59b3..f24daddbbcf83 100644 --- a/onnxruntime/test/python/quantization/test_symmetric_flag.py +++ b/onnxruntime/test/python/quantization/test_symmetric_flag.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for @@ -17,7 +16,6 @@ class TestSymmetricFlag(unittest.TestCase): def setUp(self): - # Set up symmetrically and asymmetrically disributed values for activations self.symmetric_activations = [ -1 * np.ones([1, 2, 32, 32], dtype="float32"), @@ -45,10 +43,9 @@ def setUp(self): ) def perform_quantization(self, activations, weight, act_sym, wgt_sym): - # One-layer convolution model act = helper.make_tensor_value_info("ACT", TensorProto.FLOAT, activations[0].shape) - wgt = helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape) + helper.make_tensor_value_info("WGT", TensorProto.FLOAT, weight.shape) res = helper.make_tensor_value_info("RES", TensorProto.FLOAT, [None, None, None, None]) wgt_init = numpy_helper.from_array(weight, "WGT") conv_node = onnx.helper.make_node("Conv", ["ACT", "WGT"], ["RES"]) @@ -86,7 +83,6 @@ def get_next(self): return act_zp, act_sc, wgt_zp, wgt_sc def test_0(self): - act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization( self.asymmetric_activations, self.asymmetric_weights, @@ -104,7 +100,6 @@ def test_0(self): self.assertEqual(wgt_zp, 0) def test_1(self): - act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization( self.asymmetric_activations, self.asymmetric_weights, @@ -121,7 +116,6 @@ def test_1(self): self.assertNotEqual(wgt_zp, 0) def test_2(self): - act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization( self.symmetric_activations, self.symmetric_weights, @@ -138,7 +132,6 @@ def test_2(self): self.assertEqual(wgt_zp, 0) def test_3(self): - act_zp, act_sc, wgt_zp, wgt_sc = self.perform_quantization( self.symmetric_activations, self.symmetric_weights, @@ -156,5 +149,4 @@ def test_3(self): if __name__ == "__main__": - unittest.main() diff --git a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py index bc46f3b57cb56..6cf8426da521d 100644 --- a/onnxruntime/test/python/test_pytorch_export_contrib_ops.py +++ b/onnxruntime/test/python/test_pytorch_export_contrib_ops.py @@ -36,7 +36,7 @@ def to_numpy(tensor): inputs = list(map(to_numpy, input)) outputs = list(map(to_numpy, output)) - ort_inputs = dict((ort_sess.get_inputs()[i].name, input) for i, input in enumerate(inputs)) + ort_inputs = {ort_sess.get_inputs()[i].name: input for i, input in enumerate(inputs)} ort_outs = ort_sess.run(None, ort_inputs) # compare onnxruntime and PyTorch results @@ -163,7 +163,7 @@ def test_triu(self): class Module(torch.nn.Module): def forward(self, input): - return input.triu(diagonal=i) + return input.triu(diagonal=i) # noqa: B023 model = Module() x = torch.randn(5, 4, 7, dtype=torch.float32) @@ -179,7 +179,7 @@ def forward(self, input): class Module2D(torch.nn.Module): def forward(self, input): - return input.triu(diagonal=i) + return input.triu(diagonal=i) # noqa: B023 model = Module2D() x = torch.randn(4, 7, dtype=torch.float32) @@ -196,7 +196,7 @@ def test_tril(self): class Module(torch.nn.Module): def forward(self, input): - return input.tril(diagonal=i) + return input.tril(diagonal=i) # noqa: B023 model = Module() x = torch.randn(5, 4, 7, dtype=torch.float32) @@ -212,7 +212,7 @@ def forward(self, input): class Module2D(torch.nn.Module): def forward(self, input): - return input.tril(diagonal=i) + return input.tril(diagonal=i) # noqa: B023 model = Module2D() x = torch.randn(4, 7, dtype=torch.float32) @@ -228,7 +228,7 @@ def forward(self, input): # opset 9 tests, with keep_initializers_as_inputs=False for # IR version 4 style export. 
ONNXExporterTest_opset9_IRv4 = type( - str("TestONNXRuntime_opset9_IRv4"), + "TestONNXRuntime_opset9_IRv4", (unittest.TestCase,), dict(ONNXExporterTest.__dict__, keep_initializers_as_inputs=False), ) diff --git a/onnxruntime/test/python/transformers/gpt2_model_generator.py b/onnxruntime/test/python/transformers/gpt2_model_generator.py index 8d2317d2d25d9..e5e46be80d01a 100644 --- a/onnxruntime/test/python/transformers/gpt2_model_generator.py +++ b/onnxruntime/test/python/transformers/gpt2_model_generator.py @@ -5,7 +5,7 @@ # -------------------------------------------------------------------------- import math -from typing import List +from typing import List # noqa: F401 import numpy import onnx @@ -494,9 +494,6 @@ def create_gpt2_attention(hidden_size=64, num_heads=4, max_seq_len=32, switch_ad ) initializers.append(helper.make_tensor("axes_1", TensorProto.INT64, [1], [1])) - batch_size = 1 - sequence_length = 3 - past_sequence_length = 2 graph = helper.make_graph( [node for node in nodes if node], "GPT2", # name diff --git a/onnxruntime/test/python/transformers/model_loader.py b/onnxruntime/test/python/transformers/model_loader.py index 126df89240c70..fc66fdd92fe8d 100644 --- a/onnxruntime/test/python/transformers/model_loader.py +++ b/onnxruntime/test/python/transformers/model_loader.py @@ -5,7 +5,7 @@ # -------------------------------------------------------------------------- import os -import unittest +import unittest # noqa: F401 from onnx import ModelProto, TensorProto, external_data_helper, load_model, numpy_helper from parity_utilities import find_transformers_source diff --git a/onnxruntime/test/python/transformers/parity_utilities.py b/onnxruntime/test/python/transformers/parity_utilities.py index 61f71e9dc0dfc..472b768e4aa94 100644 --- a/onnxruntime/test/python/transformers/parity_utilities.py +++ b/onnxruntime/test/python/transformers/parity_utilities.py @@ -12,7 +12,6 @@ def parse_arguments(namespace_filter=None): - parser = argparse.ArgumentParser() # useful EPs that don't require the use of optmizer.py @@ -41,7 +40,7 @@ def parse_arguments(namespace_filter=None): return args, sys.argv[:1] + remaining_args -def find_transformers_source(sub_dir_paths=[]): +def find_transformers_source(sub_dir_paths=[]): # noqa: B006 source_dir = os.path.join( os.path.dirname(__file__), "..", @@ -64,7 +63,7 @@ def create_inputs( sequence_length=1, hidden_size=768, float16=False, - device=torch.device("cuda"), + device=torch.device("cuda"), # noqa: B008 ): float_type = torch.float16 if float16 else torch.float32 input = torch.normal(mean=0.0, std=10.0, size=(batch_size, sequence_length, hidden_size)).to(float_type).to(device) @@ -78,7 +77,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, device): input_hidden_states = create_inputs(hidden_size=hidden_size, float16=float16, device=device) with torch.no_grad(): - outputs = model(input_hidden_states) + model(input_hidden_states) dynamic_axes = { "input": {0: "batch_size", 1: "seq_len"}, @@ -213,7 +212,7 @@ def run_parity( max_diffs = [] printed = False # print only one sample ort_session = create_ort_session(onnx_model_path, device.type == "cuda", optimized=optimized, verbose=verbose) - for i in range(test_cases): + for _i in range(test_cases): input_hidden_states = create_inputs(batch_size, sequence_length, hidden_size, float16, device) with torch.no_grad(): diff --git a/onnxruntime/test/python/transformers/test_attention_fusion.py b/onnxruntime/test/python/transformers/test_attention_fusion.py index d3de8a50c85ea..2edc2ec06d631 
100644 --- a/onnxruntime/test/python/transformers/test_attention_fusion.py +++ b/onnxruntime/test/python/transformers/test_attention_fusion.py @@ -178,7 +178,7 @@ def test_gpt2_attention_fusion(self): else: model_suffix = "opt_no_skiplayernorm" - model_name = "gpt2_attention_{}.onnx".format(model_suffix) + model_name = f"gpt2_attention_{model_suffix}.onnx" self.verify_fusion(optimized_model, model_name) def test_megatron_gpt2_attention_fusion(self): @@ -203,7 +203,7 @@ def test_megatron_gpt2_attention_fusion(self): else: model_suffix = "opt_no_skiplayernorm" - model_name = "gpt2_megatron_{}.onnx".format(model_suffix) + model_name = f"gpt2_megatron_{model_suffix}.onnx" self.verify_fusion(optimized_model, model_name) diff --git a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py index 47145fc213a0d..b3973a9337382 100644 --- a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py +++ b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py @@ -26,7 +26,7 @@ import argparse import os import random -import sys +import sys # noqa: F401 import timeit from pathlib import Path @@ -45,7 +45,7 @@ class TinyBertOnnxModel(OnnxModel): def __init__(self, model, verbose): - super(TinyBertOnnxModel, self).__init__(model, verbose) + super().__init__(model, verbose) self.resize_model() def resize_weight(self, initializer_name, target_shape): @@ -108,7 +108,7 @@ def resize_model(self): if len(tensor.shape) == 1 and tensor.shape[0] == 1: if tensor == old_parameters["num_heads"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["num_heads"], "=>[", @@ -123,7 +123,7 @@ def resize_model(self): ) elif tensor == old_parameters["seq_len"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["seq_len"], "=>[", @@ -138,7 +138,7 @@ def resize_model(self): ) elif tensor == old_parameters["size_per_head"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["size_per_head"], "=>[", @@ -153,7 +153,7 @@ def resize_model(self): ) elif tensor == old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["hidden_size"], "=>[", @@ -168,7 +168,7 @@ def resize_model(self): ) elif tensor == 4 * old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 4 * old_parameters["hidden_size"], "=>[", @@ -184,7 +184,7 @@ def resize_model(self): elif len(tensor.shape) == 0: if tensor == old_parameters["num_heads"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["num_heads"], "=>", @@ -198,7 +198,7 @@ def resize_model(self): ) elif tensor == old_parameters["seq_len"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 
old_parameters["seq_len"], "=>", @@ -212,7 +212,7 @@ def resize_model(self): ) elif tensor == old_parameters["size_per_head"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["size_per_head"], "=>", @@ -226,7 +226,7 @@ def resize_model(self): ) elif tensor == old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["hidden_size"], "=>", @@ -240,7 +240,7 @@ def resize_model(self): ) elif tensor == 4 * old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 4 * old_parameters["hidden_size"], "=>", @@ -254,7 +254,7 @@ def resize_model(self): ) elif tensor == 1.0 / np.sqrt(old_parameters["size_per_head"]): print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 1.0 / np.sqrt(old_parameters["size_per_head"]), "=>", @@ -302,7 +302,6 @@ def use_dynamic_axes(self, dynamic_batch_dim="batch_size", seq_len=7): """ Update input and output shape to use dynamic axes. """ - dynamic_batch_inputs = {} for input in self.model.graph.input: dim_proto = input.type.tensor_type.shape.dim[0] dim_proto.dim_param = dynamic_batch_dim @@ -355,7 +354,7 @@ def generate_test_data( sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL sess = onnxruntime.InferenceSession(onnx_file, sess_options, providers=["CPUExecutionProvider"]) - input1_name = sess.get_inputs()[0].name + sess.get_inputs()[0].name output_names = [output.name for output in sess.get_outputs()] inputs = { "input_ids": input_1, @@ -365,19 +364,19 @@ def generate_test_data( print("inputs", inputs) result = sess.run(output_names, inputs) - with open(os.path.join(path, "input_{}.pb".format(0)), "wb") as f: + with open(os.path.join(path, f"input_{0}.pb"), "wb") as f: f.write(tensor_1.SerializeToString()) - with open(os.path.join(path, "input_{}.pb".format(1)), "wb") as f: + with open(os.path.join(path, f"input_{1}.pb"), "wb") as f: f.write(tensor_2.SerializeToString()) - with open(os.path.join(path, "input_{}.pb".format(2)), "wb") as f: + with open(os.path.join(path, f"input_{2}.pb"), "wb") as f: f.write(tensor_3.SerializeToString()) - for i, output_name in enumerate(output_names): + for i, _output_name in enumerate(output_names): tensor_result = numpy_helper.from_array( np.asarray(result[i]).reshape((batch_size, sequence_length)), output_names[i], ) - with open(os.path.join(path, "output_{}.pb".format(i)), "wb") as f: + with open(os.path.join(path, f"output_{i}.pb"), "wb") as f: f.write(tensor_result.SerializeToString()) start_time = timeit.default_timer() @@ -402,14 +401,14 @@ def generate_test_data( print("Warning: GPU not found") continue outputs = session.run(None, inputs) - evalTime = timeit.default_timer() - start_time + evalTime = timeit.default_timer() - start_time # noqa: N806 if outputs[0].tolist() != result[0].tolist(): print( "Error: not same result after optimization. 
use_cpu={}, no_opt_output={}, opt_output={}".format( use_cpu, result[0].tolist(), outputs[1].tolist() ) ) - print("** Evaluation done in total {} secs".format(evalTime)) + print(f"** Evaluation done in total {evalTime} secs") def main(): diff --git a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py index 7f613a8674989..065783d5812a8 100644 --- a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py +++ b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py @@ -7,9 +7,6 @@ import argparse import os -import random -import sys -import timeit from pathlib import Path import numpy as np @@ -47,7 +44,7 @@ class TinyGpt2Model(OnnxModel): def __init__(self, model): - super(TinyGpt2Model, self).__init__(model) + super().__init__(model) self.resize_model() def resize_weight(self, initializer_name, target_shape): @@ -105,7 +102,7 @@ def resize_model(self): if len(tensor.shape) == 1 and tensor.shape[0] == 1: if tensor == old_parameters["num_heads"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["num_heads"], "=>[", @@ -120,7 +117,7 @@ def resize_model(self): ) elif tensor == old_parameters["seq_len"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["seq_len"], "=>[", @@ -135,7 +132,7 @@ def resize_model(self): ) elif tensor == old_parameters["size_per_head"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["size_per_head"], "=>[", @@ -150,7 +147,7 @@ def resize_model(self): ) elif tensor == old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["hidden_size"], "=>[", @@ -165,7 +162,7 @@ def resize_model(self): ) elif tensor == 4 * old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 4 * old_parameters["hidden_size"], "=>[", @@ -180,7 +177,7 @@ def resize_model(self): ) elif tensor == 3 * old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 3 * old_parameters["hidden_size"], "=>[", @@ -196,7 +193,7 @@ def resize_model(self): elif len(tensor.shape) == 0: if tensor == old_parameters["num_heads"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["num_heads"], "=>", @@ -210,7 +207,7 @@ def resize_model(self): ) elif tensor == old_parameters["seq_len"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["seq_len"], "=>", @@ -224,7 +221,7 @@ def resize_model(self): ) elif tensor == old_parameters["size_per_head"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["size_per_head"], "=>", @@ -238,7 +235,7 @@ def resize_model(self): ) elif tensor == old_parameters["hidden_size"]: print( - "initializer 
type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, old_parameters["hidden_size"], "=>", @@ -252,7 +249,7 @@ def resize_model(self): ) elif tensor == 4 * old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 4 * old_parameters["hidden_size"], "=>", @@ -266,7 +263,7 @@ def resize_model(self): ) elif tensor == 3 * old_parameters["hidden_size"]: print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 3 * old_parameters["hidden_size"], "=>", @@ -280,7 +277,7 @@ def resize_model(self): ) elif tensor == 1.0 / np.sqrt(old_parameters["size_per_head"]): print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, 1.0 / np.sqrt(old_parameters["size_per_head"]), "=>", @@ -297,7 +294,7 @@ def resize_model(self): ) elif tensor == np.sqrt(old_parameters["size_per_head"]): print( - "initializer type={}".format(initializer.data_type), + f"initializer type={initializer.data_type}", initializer.name, np.sqrt(old_parameters["size_per_head"]), "=>", @@ -349,7 +346,7 @@ def resize_model(self): "Split", node.input, node.output, - name="Split_{}".format(i), + name=f"Split_{i}", axis=2, split=[ new_parameters["hidden_size"], @@ -446,7 +443,6 @@ def generate_test_data( test_cases=1, output_optimized_model=False, ): - for test_case in range(test_cases): sequence_length = 3 input_1 = np.random.randint(dictionary_size, size=(batch_size, 1), dtype=np.int64) @@ -468,7 +464,7 @@ def generate_test_data( output_names = [output.name for output in sess.get_outputs()] inputs = {input1_name: input_1} - with open(os.path.join(path, "input_{}.pb".format(0)), "wb") as f: + with open(os.path.join(path, f"input_{0}.pb"), "wb") as f: f.write(tensor_1.SerializeToString()) for i in range(12): @@ -483,7 +479,7 @@ def generate_test_data( tensor = numpy_helper.from_array(input, input_name) inputs.update({input_name: input}) - with open(os.path.join(path, "input_{}.pb".format(1 + i)), "wb") as f: + with open(os.path.join(path, f"input_{1 + i}.pb"), "wb") as f: f.write(tensor.SerializeToString()) if input_tensor_only: diff --git a/onnxruntime/test/python/transformers/test_gelu_fusions.py b/onnxruntime/test/python/transformers/test_gelu_fusions.py index feaba40ad88b1..77a6491d4bd3c 100644 --- a/onnxruntime/test/python/transformers/test_gelu_fusions.py +++ b/onnxruntime/test/python/transformers/test_gelu_fusions.py @@ -47,7 +47,7 @@ def verify_node_count(self, bert_model, expected_node_count, test_name): if len(bert_model.get_nodes_by_op_type(op_type)) != count: print(f"Counters is not expected in test: {test_name}") for op, counter in expected_node_count.items(): - print("{}: {} expected={}".format(op, len(bert_model.get_nodes_by_op_type(op)), counter)) + print(f"{op}: {len(bert_model.get_nodes_by_op_type(op))} expected={counter}") self.assertEqual(len(bert_model.get_nodes_by_op_type(op_type)), count) def test_fusions(self): diff --git a/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py b/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py index 0c5d12c905cf0..3a948705770ee 100644 --- a/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py +++ b/onnxruntime/test/python/transformers/test_gemmfastgelu_fusion.py @@ -43,7 +43,7 @@ def float_tensor(name: str, shape: List[int], random=False): return 
helper.make_tensor(name, TensorProto.FLOAT, shape, weights) -def create_MatMul_FastGelu_withoutBias(batch_size, m, n, k): +def create_MatMul_FastGelu_withoutBias(batch_size, m, n, k): # noqa: N802 # MatMul + FastGelu nodes = [ helper.make_node("MatMul", ["input", "matmul_weight"], ["fastgelu_input"], "matmul"), @@ -77,7 +77,7 @@ def create_MatMul_FastGelu_withoutBias(batch_size, m, n, k): return helper.make_model(graph) -def create_MatMul_FastGelu_withBias(batch_size, m, n, k): +def create_MatMul_FastGelu_withBias(batch_size, m, n, k): # noqa: N802 # MatMul + FastGelu nodes = [ helper.make_node("MatMul", ["input", "matmul_weight"], ["fastgelu_input"], "matmul"), diff --git a/onnxruntime/test/python/transformers/test_generation.py b/onnxruntime/test/python/transformers/test_generation.py index f044f98e1da05..7d8c35b47dc57 100644 --- a/onnxruntime/test/python/transformers/test_generation.py +++ b/onnxruntime/test/python/transformers/test_generation.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -74,9 +73,8 @@ def check_for_init_decoder_attr(self, model_path: str): self.assertTrue(init_decoder_found) def run_beam_search(self, extra_arguments: str, sentences=None, append_arguments=True, is_greedy=False): - if append_arguments: - arguments = " ".join(self.default_arguments + [extra_arguments]).split() + arguments = " ".join([*self.default_arguments, extra_arguments]).split() else: arguments = extra_arguments.split() @@ -159,12 +157,12 @@ def test_greedy_search_float16(self): @pytest.mark.slow def test_beam_search_use_decoder_masked_self_attention(self): if self.enable_cuda: - self.run_beam_search(f"--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu") + self.run_beam_search("--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu") @pytest.mark.slow def test_beam_search_use_decoder_masked_self_attention_fp16(self): if self.enable_cuda: - self.run_beam_search(f"--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu -p fp16") + self.run_beam_search("--past_present_share_buffer --use_decoder_masked_self_attention --use_gpu -p fp16") @pytest.mark.slow def test_external_data(self): @@ -235,7 +233,7 @@ def remove_onnx_files(self): def run_beam_search(self, extra_arguments: str, sentences=None, append_arguments=True): if append_arguments: - arguments = " ".join(self.default_arguments + [extra_arguments]).split() + arguments = " ".join([*self.default_arguments, extra_arguments]).split() else: arguments = extra_arguments.split() diff --git a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py index c507423a79c13..8ab622b63be34 100644 --- a/onnxruntime/test/python/transformers/test_gpt2_benchmark.py +++ b/onnxruntime/test/python/transformers/test_gpt2_benchmark.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
See License.txt in the project root for diff --git a/onnxruntime/test/python/transformers/test_optimizer.py b/onnxruntime/test/python/transformers/test_optimizer.py index 2a02ce277fb67..270c015c83f27 100644 --- a/onnxruntime/test/python/transformers/test_optimizer.py +++ b/onnxruntime/test/python/transformers/test_optimizer.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for @@ -63,7 +62,7 @@ def verify_node_count(self, onnx_model, expected_node_count, test_name): if len(onnx_model.get_nodes_by_op_type(op_type)) != count: print(f"Counters is not expected in test: {test_name}") for op, counter in expected_node_count.items(): - print("{}: {} expected={}".format(op, len(onnx_model.get_nodes_by_op_type(op)), counter)) + print(f"{op}: {len(onnx_model.get_nodes_by_op_type(op))} expected={counter}") self.assertEqual(len(onnx_model.get_nodes_by_op_type(op_type)), count) @@ -308,9 +307,9 @@ def test_huggingface_bart_fusion(self): @unittest.skipUnless(is_tf_available(), "skip TestBertOptimizationTF since tensorflow is not available") class TestTensorflowModelOptimization(unittest.TestCase): - def Setup(self): + def Setup(self): # noqa: N802 try: - import tf2onnx + import tf2onnx # noqa: F401 except ImportError: self.skipTest("skip TestBertOptimizationTF since tf2onnx not installed") diff --git a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py index 6c05e321f7618..992a805eb2eb3 100644 --- a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py @@ -10,9 +10,9 @@ # license information. 
# ------------------------------------------------------------------------- -import math -import os -from typing import Dict, List, Optional, Tuple +import math # noqa: F401 +import os # noqa: F401 +from typing import Dict, List, Optional, Tuple # noqa: F401 import numpy import torch @@ -122,7 +122,7 @@ def forward( key, layer_state: Optional[List[Tensor]], encoder_decoder_attention: bool, - use_past=torch.tensor(False), + use_past=torch.tensor(False), # noqa: B008 ): bsz = torch._shape_as_tensor(query)[1] if layer_state is None or not use_past: @@ -188,7 +188,7 @@ def forward( layer_state: Optional[List[Tensor]] = None, attn_mask: Optional[Tensor] = None, output_attentions: bool = False, - use_past=torch.tensor(False), + use_past=torch.tensor(False), # noqa: B008 has_key_padding_mask: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time(SeqLen) x Batch x Channel""" @@ -239,7 +239,7 @@ def forward( return attn_output, new_key_cache, new_value_cache - def ORT_forward( + def ORT_forward( # noqa: N802 self, query, key: Tensor, @@ -247,14 +247,14 @@ def ORT_forward( layer_state: Optional[List[Tensor]] = None, attn_mask: Optional[Tensor] = None, output_attentions: bool = False, - use_past=torch.tensor(False), + use_past=torch.tensor(False), # noqa: B008 has_key_padding_mask: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time(SeqLen) x Batch x Channel""" # For readability - static_kv = True if self.encoder_decoder_attention else False - has_layer_state = True if layer_state is not None else False - use_past_cache = True if use_past else False + static_kv = bool(self.encoder_decoder_attention) + has_layer_state = layer_state is not None + use_past_cache = bool(use_past) q_weight = self.q_proj.weight.transpose(0, 1) q_weight = q_weight.reshape(self.embed_dim, self.embed_dim) @@ -328,10 +328,10 @@ def create_decoder_attention_graph( ): from onnx import TensorProto, helper - S, B, NH = query.size() - S2 = key.size()[0] - N = num_heads_ - H = int(NH / N) + S, B, NH = query.size() # noqa: N806 + S2 = key.size()[0] # noqa: N806 + N = num_heads_ # noqa: N806 + H = int(NH / N) # noqa: N806 nodes = [ helper.make_node( diff --git a/onnxruntime/test/python/transformers/test_parity_gelu.py b/onnxruntime/test/python/transformers/test_parity_gelu.py index 4da7c2f36a999..dfafb9b7e7c5c 100644 --- a/onnxruntime/test/python/transformers/test_parity_gelu.py +++ b/onnxruntime/test/python/transformers/test_parity_gelu.py @@ -28,7 +28,7 @@ import unittest import torch -from parity_utilities import * +from parity_utilities import * # noqa: F403 from torch import nn @@ -98,12 +98,12 @@ def run( # Do not re-use onnx file from previous test since weights of model are random. 
onnx_model_path = "./temp/gelu_{}_{}.onnx".format(formula, "fp16" if float16 else "fp32") - export_onnx(model, onnx_model_path, float16, hidden_size, device) + export_onnx(model, onnx_model_path, float16, hidden_size, device) # noqa: F405 if optimized: optimized_onnx_path = "./temp/gelu_{}_opt_{}.onnx".format(formula, "fp16" if float16 else "fp32") use_gpu = float16 and not fp32_gelu_op - optimize_onnx( + optimize_onnx( # noqa: F405 onnx_model_path, optimized_onnx_path, Gelu.get_fused_op(formula), @@ -115,7 +115,7 @@ def run( else: onnx_path = onnx_model_path - num_failure = run_parity( + num_failure = run_parity( # noqa: F405 model, onnx_path, batch_size, @@ -236,7 +236,7 @@ def test_cuda(self): if __name__ == "__main__": - args, remaining_args = parse_arguments(namespace_filter=unittest) + args, remaining_args = parse_arguments(namespace_filter=unittest) # noqa: F405 TestGeluParity.verbose = args.log_verbose TestGeluParity.optimized = args.optimize diff --git a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py index 33ff8079a1af3..1158f38e2887a 100644 --- a/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_huggingface_gpt_attention.py @@ -166,7 +166,7 @@ def create_inputs( sequence_length=1, past_sequence_length=5, float16=False, - device=torch.device("cuda"), + device=torch.device("cuda"), # noqa: B008 padding_length=0, ): float_type = torch.float16 if float16 else torch.float32 @@ -218,7 +218,7 @@ def export_onnx(model, onnx_model_path, float16, hidden_size, num_attention_head ) with torch.no_grad(): - outputs = model(input_hidden_states, attention_mask=attention_mask, layer_past=layer_past) + model(input_hidden_states, attention_mask=attention_mask, layer_past=layer_past) dynamic_axes = { "input_hidden_states": {0: "batch_size", 1: "seq_len"}, @@ -260,7 +260,7 @@ def optimize_onnx(input_onnx_path, optimized_onnx_path, num_heads, debug): onnx_model = OnnxModel(m) nodes_to_remove = onnx_model.nodes() - output_names = ["attn_output", "present"] + DEBUG_OUTPUTS if debug else ["attn_output", "present"] + output_names = ["attn_output", "present", *DEBUG_OUTPUTS] if debug else ["attn_output", "present"] node_to_add = helper.make_node( "Attention", [ @@ -316,7 +316,7 @@ def verify_attention( max_diffs = [] ort_session = create_ort_session(onnx_model_path, device.type == "cuda", verbose=verbose) - for i in range(test_cases): + for _i in range(test_cases): input_hidden_states, attention_mask, layer_past = create_inputs( batch_size, hidden_size, diff --git a/onnxruntime/test/python/transformers/test_parity_layernorm.py b/onnxruntime/test/python/transformers/test_parity_layernorm.py index 48190fee150a9..648bfde7a8342 100644 --- a/onnxruntime/test/python/transformers/test_parity_layernorm.py +++ b/onnxruntime/test/python/transformers/test_parity_layernorm.py @@ -9,10 +9,10 @@ import onnx import torch -from parity_utilities import * +from parity_utilities import * # noqa: F403 from torch import nn -if find_transformers_source(): +if find_transformers_source(): # noqa: F405 from onnx_model import OnnxModel else: from onnxruntime.transformers.onnx_model import OnnxModel @@ -150,12 +150,14 @@ def run( # Do not re-use onnx file from previous test since weights of model are random. 
onnx_model_path = "./temp/layer_norm_{}_formula{}.onnx".format("fp16" if float16 else "fp32", formula) - export_onnx(model, onnx_model_path, float16, hidden_size, device) + export_onnx(model, onnx_model_path, float16, hidden_size, device) # noqa: F405 if optimized: optimized_onnx_path = "./temp/layer_norm_{}_formula{}_opt.onnx".format("fp16" if float16 else "fp32", formula) if (not float16) or cast_fp16: - optimize_onnx(onnx_model_path, optimized_onnx_path, expected_op=LayerNorm.get_fused_op(), verbose=verbose) + optimize_onnx( # noqa: F405 + onnx_model_path, optimized_onnx_path, expected_op=LayerNorm.get_fused_op(), verbose=verbose + ) else: if cast_onnx_only: optimize_fp16_onnx_with_cast(onnx_model_path, optimized_onnx_path, epsilon=epsilon) @@ -166,7 +168,7 @@ def run( else: onnx_path = onnx_model_path - num_failure = run_parity( + num_failure = run_parity( # noqa: F405 model, onnx_path, batch_size, @@ -306,7 +308,7 @@ def test_cuda(self): if __name__ == "__main__": - args, remaining_args = parse_arguments(namespace_filter=unittest) + args, remaining_args = parse_arguments(namespace_filter=unittest) # noqa: F405 TestLayerNormParity.verbose = args.log_verbose TestLayerNormParity.optimized = args.optimize diff --git a/onnxruntime/test/python/transformers/test_parity_neox_attention.py b/onnxruntime/test/python/transformers/test_parity_neox_attention.py index 63ed49b9013b1..0c119da69c812 100644 --- a/onnxruntime/test/python/transformers/test_parity_neox_attention.py +++ b/onnxruntime/test/python/transformers/test_parity_neox_attention.py @@ -275,7 +275,6 @@ def torch_forward( past_value = layer_past[1] key = torch.cat((past_key, key), dim=-2) value = torch.cat((past_value, value), dim=-2) - present = (key, value) if use_cache else None # Compute attention attn_output, _ = self._attn(query, key, value, attention_mask, head_mask) diff --git a/onnxruntime/test/python/transformers/test_parity_t5_mha.py b/onnxruntime/test/python/transformers/test_parity_t5_mha.py index 22522c91835a7..51d5ba7838d92 100644 --- a/onnxruntime/test/python/transformers/test_parity_t5_mha.py +++ b/onnxruntime/test/python/transformers/test_parity_t5_mha.py @@ -455,7 +455,7 @@ def compare_t5_cross_attention_decoder(batch_size, seq_len, num_heads, head_size head_size=head_size, use_past=True, ) - T5CrossAttention = T5Attention(config, is_static_kv=True) + T5CrossAttention = T5Attention(config, is_static_kv=True) # noqa: N806 hidden_states, key_value_states, past_key_value, attention_mask, _ = T5CrossAttention.create_inputs() torch_output = T5CrossAttention.torch_forward( @@ -479,7 +479,7 @@ def compare_t5_cross_attention_decoder_init(batch_size, seq_len, num_heads, head head_size=head_size, use_past=False, ) - T5CrossAttention = T5Attention(config, is_static_kv=True) + T5CrossAttention = T5Attention(config, is_static_kv=True) # noqa: N806 hidden_states, key_value_states, _, attention_mask, _ = T5CrossAttention.create_inputs() torch_output = T5CrossAttention.torch_forward( @@ -505,7 +505,7 @@ def compare_t5_self_attention_decoder_init(batch_size, seq_len, num_heads, head_ head_size=head_size, use_past=False, ) - T5CrossAttention = T5Attention(config, is_static_kv=False) + T5CrossAttention = T5Attention(config, is_static_kv=False) # noqa: N806 hidden_states, _, _, _, position_bias = T5CrossAttention.create_inputs() torch_output = T5CrossAttention.torch_forward( @@ -531,7 +531,7 @@ def compare_t5_self_attention_decoder(batch_size, seq_len, num_heads, head_size, head_size=head_size, use_past=True, ) - T5CrossAttention = 
T5Attention(config, is_static_kv=False) + T5CrossAttention = T5Attention(config, is_static_kv=False) # noqa: N806 hidden_states, _, past_key_value, _, position_bias = T5CrossAttention.create_inputs() torch_output = T5CrossAttention.torch_forward( diff --git a/onnxruntime/test/python/transformers/test_profiler.py b/onnxruntime/test/python/transformers/test_profiler.py index c8397bc3f7bb8..5d260e716d0aa 100644 --- a/onnxruntime/test/python/transformers/test_profiler.py +++ b/onnxruntime/test/python/transformers/test_profiler.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding: utf-8 # ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. See License.txt in the project root for diff --git a/onnxruntime/test/testdata/CNTK/gen.py b/onnxruntime/test/testdata/CNTK/gen.py index db9022d3d50f8..51ad5e781c243 100644 --- a/onnxruntime/test/testdata/CNTK/gen.py +++ b/onnxruntime/test/testdata/CNTK/gen.py @@ -12,33 +12,27 @@ data_dir = "test_data_set_0" -def SaveTensorProto(file_path, variable, data, name): +def SaveTensorProto(file_path, variable, data, name): # noqa: N802 # ONNX input shape always has sequence axis as the first dimension, if sequence axis exists if len(variable.dynamic_axes) == 2: - data = data.transpose( - ( - 1, - 0, - ) - + tuple(range(2, len(data.shape))) - ) + data = data.transpose((1, 0, *tuple(range(2, len(data.shape))))) tp = numpy_helper.from_array(data, name if name else variable.uid) onnx.save_tensor(tp, file_path) -def SaveData(test_data_dir, prefix, variables, data_list, name_replacements=None): +def SaveData(test_data_dir, prefix, variables, data_list, name_replacements=None): # noqa: N802 if isinstance(data_list, np.ndarray): data_list = [data_list] for (i, d), v in zip(enumerate(data_list), variables): SaveTensorProto( - os.path.join(test_data_dir, "{0}_{1}.pb".format(prefix, i)), + os.path.join(test_data_dir, f"{prefix}_{i}.pb"), v, d, name_replacements[v.uid] if name_replacements else None, ) -def Save(dir, func, feed, outputs): +def Save(dir, func, feed, outputs): # noqa: N802 if not os.path.exists(dir): os.makedirs(dir) onnx_file = os.path.join(dir, model_file) @@ -74,7 +68,7 @@ def Save(dir, func, feed, outputs): SaveData(test_data_dir, "output", func.outputs, [outputs[var] for var in func.outputs]) -def GenSimple(): +def GenSimple(): # noqa: N802 x = C.input_variable( ( 1, @@ -87,7 +81,7 @@ def GenSimple(): Save("test_simple", y, data_x, data_y) -def GenSharedWeights(): +def GenSharedWeights(): # noqa: N802 x = C.input_variable( ( 1, @@ -101,7 +95,7 @@ def GenSharedWeights(): Save("test_shared_weights", y, data_x, data_y) -def GenSimpleMNIST(): +def GenSimpleMNIST(): # noqa: N802 input_dim = 784 num_output_classes = 10 num_hidden_layers = 1 @@ -128,7 +122,7 @@ def GenSimpleMNIST(): Save("test_simpleMNIST", model, data_feature, data_output) -def GenMatMul_1k(): +def GenMatMul_1k(): # noqa: N802 feature = C.input_variable( ( 1024, @@ -143,23 +137,25 @@ def GenMatMul_1k(): Save("test_MatMul_1k", model, data_feature, data_output) -def LSTM(cell_dim, use_scan=True): +def LSTM(cell_dim, use_scan=True): # noqa: N802 # we now create an LSTM_cell function and call it with the input and placeholders - LSTM_cell = C.layers.LSTM(cell_dim) + LSTM_cell = C.layers.LSTM(cell_dim) # noqa: N806 @C.Function def func(dh, dc, input): - LSTM_func = LSTM_cell(dh, dc, input) + LSTM_func = LSTM_cell(dh, dc, input) # noqa: N806 if use_scan: - LSTM_func_root = 
C.as_composite(LSTM_func.outputs[0].owner.block_root) + LSTM_func_root = C.as_composite(LSTM_func.outputs[0].owner.block_root) # noqa: N806 args = LSTM_func_root.arguments - LSTM_func = LSTM_func_root.clone(C.CloneMethod.share, {args[0]: input, args[1]: dh, args[2]: dc}) + LSTM_func = LSTM_func_root.clone( # noqa: N806 + C.CloneMethod.share, {args[0]: input, args[1]: dh, args[2]: dc} + ) return LSTM_func return func -def GenLSTMx4(use_scan): +def GenLSTMx4(use_scan): # noqa: N802 feature = C.sequence.input_variable((128,), np.float32) lstm1 = C.layers.Recurrence(LSTM(512, use_scan))(feature) lstm2_fw = C.layers.Recurrence(LSTM(512, use_scan))(lstm1) @@ -178,7 +174,7 @@ def GenLSTMx4(use_scan): Save("test_LSTMx4_" + postfix, model, data_feature, data_output) -def GenScan(): +def GenScan(): # noqa: N802 np.random.seed(0) feature = C.sequence.input_variable((3,), np.float32) model = C.layers.For(range(4), lambda: C.layers.Recurrence(LSTM(2, use_scan=True)))(feature) @@ -225,7 +221,7 @@ def GenScan(): onnx.save(out_mp, "test_Scan/model.onnx", "wb") -def GenSimpleScan(): +def GenSimpleScan(): # noqa: N802 feature = C.sequence.input_variable((128,), np.float32) param = C.parameter(shape=(1,), dtype=np.float32) scan = C.layers.Recurrence(lambda h, x: x + h + param)(feature) @@ -235,7 +231,7 @@ def GenSimpleScan(): Save("test_SimpleScan", model, data_feature, data_output) -def GenGRU(): +def GenGRU(): # noqa: N802 feature = C.sequence.input_variable((64,), np.float32) gru_fw = C.layers.Recurrence(C.layers.GRU(128))(feature) gru_bw = C.layers.Recurrence(C.layers.GRU(128), go_backwards=True)(feature) @@ -245,7 +241,7 @@ def GenGRU(): Save("test_GRU", model, data_feature, data_output) -def GenRNN(): +def GenRNN(): # noqa: N802 feature = C.sequence.input_variable((64,), np.float32) model = C.optimized_rnnstack( feature, diff --git a/onnxruntime/test/testdata/capi_symbolic_dims.py b/onnxruntime/test/testdata/capi_symbolic_dims.py index 69024717cf1ba..0bb487d1e91f0 100644 --- a/onnxruntime/test/testdata/capi_symbolic_dims.py +++ b/onnxruntime/test/testdata/capi_symbolic_dims.py @@ -29,7 +29,7 @@ onnx.save_model(model, "capi_symbolic_dims.onnx") -import onnxruntime as rt +import onnxruntime as rt # noqa: E402 sess = rt.InferenceSession("capi_symbolic_dims.onnx") print([i.shape for i in sess.get_inputs()]) diff --git a/onnxruntime/test/testdata/coreml_argmax_cast_test.py b/onnxruntime/test/testdata/coreml_argmax_cast_test.py index 37e99d9b88ef6..acf24ac379065 100644 --- a/onnxruntime/test/testdata/coreml_argmax_cast_test.py +++ b/onnxruntime/test/testdata/coreml_argmax_cast_test.py @@ -7,7 +7,7 @@ # We have this separated test script to generate graph for the case: An ArgMax followed by a Cast to int32 type -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node("ArgMax", ["X"], ["argmax_output_int64"], "argmax", axis=1, keepdims=1), helper.make_node("Cast", ["argmax_output_int64"], ["Y"], "cast", to=6), # cast to int32 type diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py index d681723810e65..8e6dbe5ea581d 100644 --- a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py @@ -1,10 +1,10 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper -def GenerateModel(model_name, sign, b_zp=True, bias=False): +def GenerateModel(model_name, sign, b_zp=True, 
bias=False): # noqa: N802 nodes = [ # DynamicQuantizeMatMul subgraph helper.make_node( "DynamicQuantizeLinear", diff --git a/onnxruntime/test/testdata/ep_dynamic_graph_input_test.py b/onnxruntime/test/testdata/ep_dynamic_graph_input_test.py index b6ef5f78330f1..23d790ba7bca4 100644 --- a/onnxruntime/test/testdata/ep_dynamic_graph_input_test.py +++ b/onnxruntime/test/testdata/ep_dynamic_graph_input_test.py @@ -6,7 +6,7 @@ # the whole graph in NNAPI EP if it has a dynamic input to checking the dynamic shape at individual operator support check level, # We have a separated test here using a graph with dynamic input that becomes fixed after a Resize # Please see BaseOpBuilder::HasSupportedInputs in /onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_support_checker.cc -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node( "Resize", diff --git a/onnxruntime/test/testdata/ep_partitioning_tests.py b/onnxruntime/test/testdata/ep_partitioning_tests.py index a85b9bda6c187..6c8322bb9bd62 100644 --- a/onnxruntime/test/testdata/ep_partitioning_tests.py +++ b/onnxruntime/test/testdata/ep_partitioning_tests.py @@ -1,4 +1,4 @@ -import numpy as np +import numpy as np # noqa: F401 import onnx from onnx import TensorProto, helper diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py index 6b126fb3a2a1f..b898390044cf4 100644 --- a/onnxruntime/test/testdata/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -1,10 +1,10 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper -def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): +def GenerateModel(model_name, sign_i, sign_w, has_zp=True, bias=False): # noqa: N802 nodes = [ # subgraph helper.make_node( "MatMulInteger", diff --git a/onnxruntime/test/testdata/model_with_external_initializer_come_from_user.py b/onnxruntime/test/testdata/model_with_external_initializer_come_from_user.py index b6b622e20e248..ce96bb3f5783a 100644 --- a/onnxruntime/test/testdata/model_with_external_initializer_come_from_user.py +++ b/onnxruntime/test/testdata/model_with_external_initializer_come_from_user.py @@ -11,22 +11,22 @@ def create_external_data_tensor(value, tensor_name): # type: (List[Any], Text) -> TensorProto tensor = from_array(value) tensor.name = tensor_name - tensor_filename = "{}.bin".format(tensor_name) + tensor_filename = f"{tensor_name}.bin" set_external_data(tensor, location=tensor_filename) tensor.ClearField("raw_data") tensor.data_location = onnx.TensorProto.EXTERNAL return tensor -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 # Create one input (ValueInfoProto) - X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 2]) + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 2]) # noqa: N806 # Create second input (ValueInfoProto) - Pads = helper.make_tensor_value_info("Pads_not_on_disk", TensorProto.INT64, [4]) + Pads = helper.make_tensor_value_info("Pads_not_on_disk", TensorProto.INT64, [4]) # noqa: N806 # Create one output (ValueInfoProto) - Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4]) + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4]) # noqa: N806 # Create a node (NodeProto) node_def = helper.make_node( @@ -49,9 +49,9 @@ def GenerateModel(model_name): # Create the model (ModelProto) model_def = helper.make_model(graph_def, 
producer_name="onnx-example") - print("The ir_version in model: {}\n".format(model_def.ir_version)) - print("The producer_name in model: {}\n".format(model_def.producer_name)) - print("The graph in model:\n{}".format(model_def.graph)) + print(f"The ir_version in model: {model_def.ir_version}\n") + print(f"The producer_name in model: {model_def.producer_name}\n") + print(f"The graph in model:\n{model_def.graph}") with open(model_name, "wb") as model_file: model_file.write(model_def.SerializeToString()) diff --git a/onnxruntime/test/testdata/model_with_external_initializers.py b/onnxruntime/test/testdata/model_with_external_initializers.py index 8b591549963fd..9b987f08b6663 100644 --- a/onnxruntime/test/testdata/model_with_external_initializers.py +++ b/onnxruntime/test/testdata/model_with_external_initializers.py @@ -11,25 +11,25 @@ def create_external_data_tensor(value, tensor_name): # type: (List[Any], Text) -> TensorProto tensor = from_array(np.array(value)) tensor.name = tensor_name - tensor_filename = "{}.bin".format(tensor_name) + tensor_filename = f"{tensor_name}.bin" set_external_data(tensor, location=tensor_filename) - with open(os.path.join(tensor_filename), "wb") as data_file: + with open(os.path.join(tensor_filename), "wb") as data_file: # noqa: F821 data_file.write(tensor.raw_data) tensor.ClearField("raw_data") tensor.data_location = onnx.TensorProto.EXTERNAL return tensor -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 # Create one input (ValueInfoProto) - X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 2]) + X = helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 2]) # noqa: N806 # Create second input (ValueInfoProto) - Pads = helper.make_tensor_value_info("Pads", TensorProto.INT64, [4]) + Pads = helper.make_tensor_value_info("Pads", TensorProto.INT64, [4]) # noqa: N806 # Create one output (ValueInfoProto) - Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4]) + Y = helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4]) # noqa: N806 # Create a node (NodeProto) node_def = helper.make_node( @@ -61,9 +61,9 @@ def GenerateModel(model_name): # Create the model (ModelProto) model_def = helper.make_model(graph_def, producer_name="onnx-example") - print("The ir_version in model: {}\n".format(model_def.ir_version)) - print("The producer_name in model: {}\n".format(model_def.producer_name)) - print("The graph in model:\n{}".format(model_def.graph)) + print(f"The ir_version in model: {model_def.ir_version}\n") + print(f"The producer_name in model: {model_def.producer_name}\n") + print(f"The graph in model:\n{model_def.graph}") onnx.checker.check_model(model_def) print("The model is checked!") with open(model_name, "wb") as model_file: diff --git a/onnxruntime/test/testdata/model_with_metadata.py b/onnxruntime/test/testdata/model_with_metadata.py index 7e01b6a38f54c..645c529a1441c 100644 --- a/onnxruntime/test/testdata/model_with_metadata.py +++ b/onnxruntime/test/testdata/model_with_metadata.py @@ -3,7 +3,7 @@ # Create a model with metadata to test ORT conversion -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node("Sigmoid", ["X"], ["Y"], "sigmoid"), ] diff --git a/onnxruntime/test/testdata/nnapi_internal_uint8_support.py b/onnxruntime/test/testdata/nnapi_internal_uint8_support.py index 0956ba3bd13fa..574d19cf9a1d8 100644 --- a/onnxruntime/test/testdata/nnapi_internal_uint8_support.py +++ b/onnxruntime/test/testdata/nnapi_internal_uint8_support.py @@ -5,7 +5,7 @@ # 
This is to test the operators without "Qlinear" support but still support uint8 input # These operators need to be internal to a graph/partition # def GenerateModel(model_name): -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node( "QuantizeLinear", diff --git a/onnxruntime/test/testdata/nnapi_reshape_flatten_test.py b/onnxruntime/test/testdata/nnapi_reshape_flatten_test.py index 27cfd1304f392..7670ba4ed30bc 100644 --- a/onnxruntime/test/testdata/nnapi_reshape_flatten_test.py +++ b/onnxruntime/test/testdata/nnapi_reshape_flatten_test.py @@ -5,7 +5,7 @@ # Since NNAPI EP handles Reshape and Flatten differently, # Please see ReshapeOpBuilder::CanSkipReshape in /onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc # We have a separated test for these skip reshape scenarios -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node("Flatten", ["X"], ["Flatten_1_Y"], "flatten_1"), helper.make_node("MatMul", ["Flatten_1_Y", "MatMul_B"], ["MatMul_Y"], "matmul"), diff --git a/onnxruntime/test/testdata/nnapi_sigmoid_input_rank_test.py b/onnxruntime/test/testdata/nnapi_sigmoid_input_rank_test.py index 160e8ba73a90b..7a3c0d4e6515f 100644 --- a/onnxruntime/test/testdata/nnapi_sigmoid_input_rank_test.py +++ b/onnxruntime/test/testdata/nnapi_sigmoid_input_rank_test.py @@ -5,7 +5,7 @@ # Sigmoid op support checker in NNAPI EP, so we don't fail hard. Added test case here. -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 node = [ helper.make_node("Sigmoid", ["X"], ["Y"], "sigmoid"), ] diff --git a/onnxruntime/test/testdata/sparse_initializer_as_output.py b/onnxruntime/test/testdata/sparse_initializer_as_output.py index 741ed6439e815..e3609a080c740 100644 --- a/onnxruntime/test/testdata/sparse_initializer_as_output.py +++ b/onnxruntime/test/testdata/sparse_initializer_as_output.py @@ -1,22 +1,18 @@ import argparse -import os +import os # noqa: F401 import sys import traceback -from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast +from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast # noqa: F401 import numpy as np import onnx -from onnx import ( - AttributeProto, - GraphProto, - SparseTensorProto, - TensorProto, - ValueInfoProto, - helper, - mapping, - numpy_helper, - utils, -) +from onnx import AttributeProto # noqa: F401 +from onnx import GraphProto # noqa: F401 +from onnx import SparseTensorProto # noqa: F401 +from onnx import mapping # noqa: F401 +from onnx import numpy_helper # noqa: F401 +from onnx import utils # noqa: F401 +from onnx import TensorProto, ValueInfoProto, helper from onnx.helper import make_opsetid @@ -112,7 +108,7 @@ def create_model(constant_node_name, output_file_name): ) # Outputs, a square matrix - Values_info = make_sparse_tensor_value_info("values", TensorProto.FLOAT, dense_shape) + Values_info = make_sparse_tensor_value_info("values", TensorProto.FLOAT, dense_shape) # noqa: N806 graph_def = helper.make_graph( nodes=[constant_node], diff --git a/onnxruntime/test/testdata/sparse_to_dense_matmul.py b/onnxruntime/test/testdata/sparse_to_dense_matmul.py index 26fb426968c39..3ed71927c1e64 100644 --- a/onnxruntime/test/testdata/sparse_to_dense_matmul.py +++ b/onnxruntime/test/testdata/sparse_to_dense_matmul.py @@ -1,22 +1,18 @@ import argparse -import os +import os # noqa: F401 import sys import traceback -from typing import Any, Callable, Dict, List, 
Optional, Sequence, Text, Tuple, TypeVar, Union, cast +from typing import Any, Callable, Dict, List, Optional, Sequence, Text, Tuple, TypeVar, Union, cast # noqa: F401 -import numpy as np +import numpy as np # noqa: F401 import onnx -from onnx import ( - AttributeProto, - GraphProto, - SparseTensorProto, - TensorProto, - ValueInfoProto, - helper, - mapping, - numpy_helper, - utils, -) +from onnx import AttributeProto # noqa: F401 +from onnx import GraphProto # noqa: F401 +from onnx import SparseTensorProto # noqa: F401 +from onnx import mapping # noqa: F401 +from onnx import numpy_helper # noqa: F401 +from onnx import utils # noqa: F401 +from onnx import TensorProto, ValueInfoProto, helper from onnx.helper import make_opsetid @@ -85,9 +81,9 @@ def create_model(output_file_name): domain="com.microsoft", ) - value_info_A = make_sparse_tensor_value_info("sparse_A", TensorProto.FLOAT, [9, 9]) - value_info_B = helper.make_tensor_value_info("dense_B", TensorProto.FLOAT, [9, 9]) - value_info_Y = helper.make_tensor_value_info("dense_Y", TensorProto.FLOAT, [9, 9]) + value_info_A = make_sparse_tensor_value_info("sparse_A", TensorProto.FLOAT, [9, 9]) # noqa: N806 + value_info_B = helper.make_tensor_value_info("dense_B", TensorProto.FLOAT, [9, 9]) # noqa: N806 + value_info_Y = helper.make_tensor_value_info("dense_Y", TensorProto.FLOAT, [9, 9]) # noqa: N806 graph_def = helper.make_graph( nodes=[matmul_node], diff --git a/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py b/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py index d54b15c276e28..95dfd7fa36bd6 100644 --- a/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py +++ b/onnxruntime/test/testdata/test_data_generation/lr_scheduler/lr_scheduler_test_data_generator.py @@ -30,7 +30,7 @@ class WarmupLinearSchedule(LambdaLR): def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): self.warmup_steps = warmup_steps self.t_total = t_total - super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) def lr_lambda(self, step): print(f"warmup_step_count_: {self.warmup_steps }, step: {step}, total_step_count_: {self.t_total}") @@ -42,7 +42,6 @@ def lr_lambda(self, step): def main(): """Main entry.""" num_training_steps = 100 - seed = 8888 device = "cuda" batch_size, dimension_in, dimension_hidden = 2, 2, 3 @@ -97,7 +96,7 @@ def main(): new_scheduler = WarmupLinearSchedule(new_adamw_optimizer, num_warmup_steps, num_training_steps) new_scheduler.load_state_dict(state_dict["lr_scheduler"]) - for i in range(save_ckpt_step + 1, num_training_steps): + for i in range(save_ckpt_step + 1, num_training_steps): # noqa: B007 data.append([new_scheduler.last_epoch, new_scheduler.get_last_lr()]) prediction = pt_model(input) loss = prediction.sum() diff --git a/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py b/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py index ac064963b5e43..a3d7946d63214 100644 --- a/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py +++ b/onnxruntime/test/testdata/test_data_generation/sgd_test/sgd_test_data_generator.py @@ -136,7 +136,7 @@ def _data_func(): target = torch.randn(batch_size, dimension_hidden, device=device, dtype=torch.float32) return input, target - json_file_name = 
f"sgd_test_single_weight.json" + json_file_name = "sgd_test_single_weight.json" generate_sgd_test_data(seed, _model_setup_func, _data_func, run_step_count, json_file_name, device) @@ -154,7 +154,7 @@ def data_func(): target = torch.randn(batch_size, dim_out, device=device, dtype=torch.float32) return input, target - json_file_name = f"sgd_test_multiple_weights.json" + json_file_name = "sgd_test_multiple_weights.json" generate_sgd_test_data(seed, _model_setup_func, data_func, run_step_count, json_file_name, device) diff --git a/onnxruntime/test/testdata/test_kernel_info_get_const_input.py b/onnxruntime/test/testdata/test_kernel_info_get_const_input.py index d2f4b6ab8c9d6..c9326eba2e245 100644 --- a/onnxruntime/test/testdata/test_kernel_info_get_const_input.py +++ b/onnxruntime/test/testdata/test_kernel_info_get_const_input.py @@ -2,7 +2,7 @@ from onnx import TensorProto, helper -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 initializers = [ helper.make_tensor( "weight", diff --git a/onnxruntime/test/testdata/transform/cast_elimination.py b/onnxruntime/test/testdata/transform/cast_elimination.py index fbf0932dcaa0d..466221bcf7aac 100644 --- a/onnxruntime/test/testdata/transform/cast_elimination.py +++ b/onnxruntime/test/testdata/transform/cast_elimination.py @@ -1,6 +1,5 @@ -import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [4, 4]) X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [4, 1]) diff --git a/onnxruntime/test/testdata/transform/computation_reduction.py b/onnxruntime/test/testdata/transform/computation_reduction.py index 7d33c9cc66c89..6f726a54261ed 100644 --- a/onnxruntime/test/testdata/transform/computation_reduction.py +++ b/onnxruntime/test/testdata/transform/computation_reduction.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 vocab_size = 256 # 30258 @@ -13,12 +13,12 @@ Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["batch", "dynamic_prediction_count", vocab_size]) Gather_Y = helper.make_tensor_value_info("gather_output", TensorProto.FLOAT, ["batch", 128]) -layer_norm1_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm1_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm1_weight_initializer = numpy_helper.from_array( layer_norm1_weight_np_vals, "bert.encoder.layer.2.output.LayerNorm.weight" ) -layer_norm1_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm1_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm1_bias_initializer = numpy_helper.from_array( layer_norm1_bias_np_vals, "bert.encoder.layer.2.output.LayerNorm.bias" ) @@ -26,15 +26,15 @@ matmul1_np_vals = np.random.uniform(0.0, 1.0, (128, 128)).astype(np.float32).reshape((128, 128)) matmul1_initializer = numpy_helper.from_array(matmul1_np_vals, "matmul1_initializer") -add1_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +add1_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) add1_initializer = numpy_helper.from_array(add1_np_vals, 
"add1_initializerr") -layer_norm2_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm2_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm2_weight_initializer = numpy_helper.from_array( layer_norm2_weight_np_vals, "cls.predictions.transform.LayerNorm.weight" ) -layer_norm2_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm2_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm2_bias_initializer = numpy_helper.from_array( layer_norm2_bias_np_vals, "cls.predictions.transform.LayerNorm.bias" ) @@ -42,7 +42,7 @@ matmul2_np_vals = np.random.uniform(0.0, 1.0, (128, vocab_size)).astype(np.float32).reshape((128, vocab_size)) matmul2_initializer = numpy_helper.from_array(matmul2_np_vals, "bert.embeddings.word_embeddings.weight_transposed") -add2_np_vals = np.random.uniform(0.0, 1.0, (vocab_size)).astype(np.float32).reshape((vocab_size)) +add2_np_vals = np.random.uniform(0.0, 1.0, (vocab_size)).astype(np.float32).reshape(vocab_size) add2_initializer = numpy_helper.from_array(add2_np_vals, "cls.predictions.bias") gather_indice_np_vals = np.asarray([0]).astype(np.int64).reshape(()) diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/e2e.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/e2e.py index c0e42e8a3709d..9e3e990632307 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/e2e.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/e2e.py @@ -13,12 +13,12 @@ Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["batch", "dynamic_prediction_count", vocab_size]) Gather_Y = helper.make_tensor_value_info("gather_output", TensorProto.FLOAT, ["batch", 128]) -layer_norm1_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm1_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm1_weight_initializer = numpy_helper.from_array( layer_norm1_weight_np_vals, "bert.encoder.layer.2.output.LayerNorm.weight" ) -layer_norm1_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm1_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm1_bias_initializer = numpy_helper.from_array( layer_norm1_bias_np_vals, "bert.encoder.layer.2.output.LayerNorm.bias" ) @@ -26,15 +26,15 @@ matmul1_np_vals = np.random.uniform(0.0, 1.0, (128, 128)).astype(np.float32).reshape((128, 128)) matmul1_initializer = numpy_helper.from_array(matmul1_np_vals, "matmul1_initializer") -add1_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +add1_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) add1_initializer = numpy_helper.from_array(add1_np_vals, "add1_initializerr") -layer_norm2_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm2_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm2_weight_initializer = numpy_helper.from_array( layer_norm2_weight_np_vals, "cls.predictions.transform.LayerNorm.weight" ) -layer_norm2_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm2_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm2_bias_initializer = numpy_helper.from_array( 
layer_norm2_bias_np_vals, "cls.predictions.transform.LayerNorm.bias" ) @@ -42,7 +42,7 @@ matmul2_np_vals = np.random.uniform(0.0, 1.0, (128, vocab_size)).astype(np.float32).reshape((128, vocab_size)) matmul2_initializer = numpy_helper.from_array(matmul2_np_vals, "bert.embeddings.word_embeddings.weight_transposed") -add2_np_vals = np.random.uniform(0.0, 1.0, (vocab_size)).astype(np.float32).reshape((vocab_size)) +add2_np_vals = np.random.uniform(0.0, 1.0, (vocab_size)).astype(np.float32).reshape(vocab_size) add2_initializer = numpy_helper.from_array(add2_np_vals, "cls.predictions.bias") gather_indice_np_vals = np.asarray([0]).astype(np.int64).reshape(()) diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py index ec0fdc888bed8..cd823ce8391c2 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_add.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( @@ -13,7 +13,7 @@ nodes = [] # case 1 -bias_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +bias_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) bias_initializer = numpy_helper.from_array(bias_np_val, "bias") add1 = helper.make_node("Add", ["input", "bias"], ["add_1"], name="add_1") nodes.append(add1) @@ -28,7 +28,7 @@ nodes.append(gathernd1) # case 2 -bias2_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +bias2_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) bias2_initializer = numpy_helper.from_array(bias2_np_val, "bias2") add2 = helper.make_node("Add", ["bias2", "input"], ["add_2"], name="add_2") nodes.append(add2) diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py index d14f8a71adfc5..ee25bef5c1161 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_div.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( @@ -13,7 +13,7 @@ nodes = [] # case 1 -divisor_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +divisor_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) divisor_initializer = numpy_helper.from_array(divisor_np_val, "divisor") div1 = helper.make_node("Div", ["input", "divisor"], ["div_1"], name="div_1") nodes.append(div1) @@ -28,7 +28,7 @@ nodes.append(gathernd1) # case 2 -divisor2_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) 
+divisor2_np_val = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) divisor2_initializer = numpy_helper.from_array(divisor2_np_val, "divisor2") div2 = helper.make_node("Div", ["divisor2", "input"], ["div_2"], name="div_2") nodes.append(div2) diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py index eade1b868ba84..b8b81a747b118 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_gelu.py @@ -1,6 +1,5 @@ -import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import OperatorSetIdProto, TensorProto, helper X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py index 9473d05010129..dc2abf1dda586 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_layernormalization.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( @@ -10,12 +10,12 @@ ) Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["batch", "dynamic_prediction_count", 128]) -layer_norm1_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm1_weight_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm1_weight_initializer = numpy_helper.from_array( layer_norm1_weight_np_vals, "bert.encoder.layer.2.output.LayerNorm.weight" ) -layer_norm1_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape((128)) +layer_norm1_bias_np_vals = np.random.uniform(0.0, 1.0, (128)).astype(np.float32).reshape(128) layer_norm1_bias_initializer = numpy_helper.from_array( layer_norm1_bias_np_vals, "bert.encoder.layer.2.output.LayerNorm.bias" ) diff --git a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py index 50167bbd0a3a3..bc850c4031741 100755 --- a/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py +++ b/onnxruntime/test/testdata/transform/computation_reduction/gathernd/gathernd_matmul.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 128]) unsqueezed_masked_lm_positions = helper.make_tensor_value_info( diff --git 
a/onnxruntime/test/testdata/transform/concat_graph_gen.py b/onnxruntime/test/testdata/transform/concat_graph_gen.py index 85817154e9014..f380d04bc9678 100644 --- a/onnxruntime/test/testdata/transform/concat_graph_gen.py +++ b/onnxruntime/test/testdata/transform/concat_graph_gen.py @@ -3,7 +3,7 @@ from onnx import TensorProto, helper -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node("Gather", ["embed_weights", "input_1"], ["gather_out"], "gather"), helper.make_node("Add", ["gather_out", "add_q_weight"], ["add_q_out"], "add_q"), diff --git a/onnxruntime/test/testdata/transform/concat_slice_elimination.py b/onnxruntime/test/testdata/transform/concat_slice_elimination.py index 88a1236922a19..9eade63328aec 100644 --- a/onnxruntime/test/testdata/transform/concat_slice_elimination.py +++ b/onnxruntime/test/testdata/transform/concat_slice_elimination.py @@ -1,8 +1,8 @@ -import random +import random # noqa: F401 import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 batch = 3 hidden_size = 4 diff --git a/onnxruntime/test/testdata/transform/cse/generate.py b/onnxruntime/test/testdata/transform/cse/generate.py index 1cd1b54b09a53..ecca4f586f400 100644 --- a/onnxruntime/test/testdata/transform/cse/generate.py +++ b/onnxruntime/test/testdata/transform/cse/generate.py @@ -1,7 +1,7 @@ import os import onnx -from onnx import AttributeProto, GraphProto, TensorProto, helper, shape_inference +from onnx import AttributeProto, GraphProto, TensorProto, helper, shape_inference # noqa: F401 _this_dir = os.path.abspath(os.path.dirname(__file__)) diff --git a/onnxruntime/test/testdata/transform/expand_elimination.py b/onnxruntime/test/testdata/transform/expand_elimination.py index da1530876348e..86340c9e2553c 100644 --- a/onnxruntime/test/testdata/transform/expand_elimination.py +++ b/onnxruntime/test/testdata/transform/expand_elimination.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X1 = helper.make_tensor_value_info("input1", TensorProto.FLOAT, [2, 1]) X2 = helper.make_tensor_value_info("input2", TensorProto.FLOAT, ["dynamic", 4]) diff --git a/onnxruntime/test/testdata/transform/fusion/attention_gen.py b/onnxruntime/test/testdata/transform/fusion/attention_gen.py index cd1569ae5cd2a..19f46ab9f358a 100644 --- a/onnxruntime/test/testdata/transform/fusion/attention_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/attention_gen.py @@ -1,5 +1,5 @@ import sys -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper @@ -317,7 +317,7 @@ ] -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # Attention subgraph helper.make_node( "LayerNormalization", @@ -454,7 +454,7 @@ def GenerateModel(model_name): onnx.save(model, model_name) -def GenerateModel2(model_name): +def GenerateModel2(model_name): # noqa: N802 nodes = [ # Attention subgraph helper.make_node( "LayerNormalization", diff --git a/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py b/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py index 6cc5cdeb79f4a..c49ae8b0a422c 100644 --- 
a/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py +++ b/onnxruntime/test/testdata/transform/fusion/constant_folding_with_shape_to_initializer.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X = helper.make_tensor_value_info("input", TensorProto.FLOAT, [2, 4, 8]) Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, [2, 4, 16]) diff --git a/onnxruntime/test/testdata/transform/fusion/div_mul.py b/onnxruntime/test/testdata/transform/fusion/div_mul.py index 7263a986d40ca..8cd34a6b53fcf 100644 --- a/onnxruntime/test/testdata/transform/fusion/div_mul.py +++ b/onnxruntime/test/testdata/transform/fusion/div_mul.py @@ -1,4 +1,4 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import OperatorSetIdProto, TensorProto, helper @@ -18,7 +18,7 @@ kwargs["opset_imports"] = opsets -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # subgraph # float helper.make_node("Div", ["float_1", "A"], ["div_1"], "div_1"), diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py index 6eff2e01ec8bf..3ec3cabbc8b77 100644 --- a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py +++ b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py @@ -1,10 +1,10 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper -def GenerateModel(model_name, b_has_zp=True, has_bias=False, bias_ND=False): +def GenerateModel(model_name, b_has_zp=True, has_bias=False, bias_ND=False): # noqa: N802 mul_output = "Mul_output" if has_bias else "output" nodes = [ # construct graph helper.make_node( diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py index cc1058c37e31f..ed06495b42beb 100644 --- a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_gen.py @@ -1,4 +1,4 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper @@ -12,7 +12,7 @@ raise RuntimeError("Please pip install onnx==1.8.0 or 1.6.0 before running this script") -def GenerateNodes(model_name, has_cast, suffix=""): +def GenerateNodes(model_name, has_cast, suffix=""): # noqa: N802 nodes = [ # LayerNorm subgraph helper.make_node("Shape", ["input_ids" + suffix], ["shape1_out" + suffix], "shape1" + suffix), helper.make_node( @@ -199,7 +199,7 @@ def GenerateNodes(model_name, has_cast, suffix=""): return nodes -def GenerateInitializers(): +def GenerateInitializers(): # noqa: N802 # hidden_size=4, num_heads=2 initializers = [ # initializers helper.make_tensor("indices_0", TensorProto.INT64, [], [0]), @@ -281,7 +281,7 @@ def GenerateInitializers(): return initializers -def GenerateMultipleEmbedModel(model_name): +def GenerateMultipleEmbedModel(model_name): # noqa: N802 nodes_1 = GenerateNodes(model_name, False, "_1") nodes_2 = GenerateNodes(model_name, False, "_2") nodes = nodes_1 + nodes_2 @@ -311,7 +311,7 @@ def GenerateMultipleEmbedModel(model_name): onnx.save(model, model_name) -def GenerateModel3(model_name, has_cast): +def GenerateModel3(model_name, has_cast): # noqa: N802 nodes 
= GenerateNodes(model_name, has_cast) # hidden_size=4, num_heads=2, max_seq_length=3 @@ -335,7 +335,7 @@ def GenerateModel3(model_name, has_cast): onnx.save(model, model_name) -def GenerateModel5(model_name): +def GenerateModel5(model_name): # noqa: N802 batch_size = 2 hidden_size = 4 attention_heads = 2 @@ -510,7 +510,7 @@ def GenerateModel5(model_name): onnx.save(model, model_name) -def GenerateModel6(model_name): +def GenerateModel6(model_name): # noqa: N802 nodes = [ # LayerNorm subgraph helper.make_node("Shape", ["input_ids"], ["shape1_out"], "shape1"), helper.make_node("Gather", ["shape1_out", "indices_0"], ["gather0_out"], "gather0"), @@ -679,7 +679,7 @@ def GenerateModel6(model_name): onnx.save(model, model_name) -def GenerateInitializers2(hidden_size): +def GenerateInitializers2(hidden_size): # noqa: N802 qkv_weights = [1.0] * hidden_size * (3 * hidden_size) initializers = [ # initializers @@ -744,7 +744,7 @@ def GenerateInitializers2(hidden_size): return initializers -def GenerateNodes2(attention_heads): +def GenerateNodes2(attention_heads): # noqa: N802 nodes = [ helper.make_node( "Gather", @@ -810,7 +810,7 @@ def GenerateNodes2(attention_heads): return nodes -def GenerateModel7(model_name): +def GenerateModel7(model_name): # noqa: N802 batch_size = 2 hidden_size = 4 attention_heads = 2 @@ -841,7 +841,7 @@ def GenerateModel7(model_name): onnx.save(model, model_name) -def GenerateModel8(model_name): +def GenerateModel8(model_name): # noqa: N802 batch_size = -1 hidden_size = 4 attention_heads = 2 @@ -881,7 +881,7 @@ def GenerateModel8(model_name): onnx.save(model, model_name) -def GenerateModel9(model_name): +def GenerateModel9(model_name): # noqa: N802 batch_size = -1 hidden_size = 4 attention_heads = 2 diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu.py index aaaffa4ab398a..20d78b6684609 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 # Gelu formula: x * 0.5 * (1.0 + tanh(0.7978845608028654 * x * (1.0 + 0.044715 * x * x))) @@ -10,7 +10,7 @@ X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 64]) Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["batch", "seqlen", 64]) -bias_np_vals = (0.01 * np.arange(64)).astype(np.float32).reshape((64)) +bias_np_vals = (0.01 * np.arange(64)).astype(np.float32).reshape(64) bias_initializer = numpy_helper.from_array(bias_np_vals, "input_bias") a_weight_np_vals = np.asarray([0.044714998453855515]).astype(np.float32).reshape(()) diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py index 5ff752afa7e6a..718f924ae5902 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu2.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 # Gelu formula: x * 0.5 * (1.0 + tanh((sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))) has_bias = False # change it to True 
to generate fast_gelu_openai_with_bias.onnx @@ -9,7 +9,7 @@ X = helper.make_tensor_value_info("input", TensorProto.FLOAT, ["batch", "seqlen", 64]) Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["batch", "seqlen", 64]) -bias_np_vals = (0.01 * np.arange(64)).astype(np.float32).reshape((64)) +bias_np_vals = (0.01 * np.arange(64)).astype(np.float32).reshape(64) bias_initializer = numpy_helper.from_array(bias_np_vals, "input_bias") pow_np_vals = np.asarray([3]).astype(np.float32).reshape(()) diff --git a/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py b/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py index 5220751a3e364..d7cfc351b8e97 100644 --- a/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py +++ b/onnxruntime/test/testdata/transform/fusion/fast_gelu3_with_casts.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 # Gelu formula: x * 0.5 * (1.0 + tanh((sqrt(2 / pi) * (x + 0.044715 * pow(x, 3))))) diff --git a/onnxruntime/test/testdata/transform/fusion/gelu_gen.py b/onnxruntime/test/testdata/transform/fusion/gelu_gen.py index 45f546a04635e..428bb0ce00df0 100644 --- a/onnxruntime/test/testdata/transform/fusion/gelu_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/gelu_gen.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 """ Generate test model for Gelu subgraph pattern 2: @@ -20,7 +20,7 @@ Y = helper.make_tensor_value_info("output", TensorProto.FLOAT, ["batch", "seqlen", 64]) Z = helper.make_tensor_value_info("div", TensorProto.FLOAT, ["batch", "seqlen", 64]) -value = (0.01 * np.arange(64)).astype(np.float32).reshape((64)) +value = (0.01 * np.arange(64)).astype(np.float32).reshape(64) bias_initializer = numpy_helper.from_array(value, "input_bias") value = np.asarray([1.4142099618911743]).astype(np.float32).reshape(()) diff --git a/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py b/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py index 276330c2064a2..c8b478b893a6d 100644 --- a/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/gemm_transpose_gen.py @@ -77,6 +77,7 @@ def gen_gemm_inputs_output_transposed(model_path): gen_gemm_output_transposed("gemm_transpose_output_transposed.onnx") gen_gemm_inputs_output_transposed("gemm_transpose_inputs_output_transposed.onnx") + # (A'(B')) = BA def gen_gemm_inputs_output_transposed_2(model_path): nodes = [ diff --git a/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py b/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py index 447b873f01c6e..c6e70fe478701 100644 --- a/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py +++ b/onnxruntime/test/testdata/transform/fusion/isinf_reducesum.py @@ -1,4 +1,4 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import OperatorSetIdProto, TensorProto, helper @@ -18,7 +18,7 @@ kwargs["opset_imports"] = opsets -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # subgraph helper.make_node("Cast", ["A"], ["cast1"], "cast_1", to=11), 
helper.make_node("IsInf", ["cast1"], ["IsInf_out"], "is_inf"), diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py index eb184fef5e59d..aa4b78f4525de 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_t5_gen.py @@ -1,10 +1,10 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import OperatorSetIdProto, TensorProto, helper -def GenerateModel(model_name, has_casts=False): +def GenerateModel(model_name, has_casts=False): # noqa: N802 nodes = [ # SimplifiedLayerNorm subgraph helper.make_node("Pow", ["cast_A" if has_casts else "A", "pow_in_2"], ["pow_out"], "pow"), helper.make_node("ReduceMean", ["pow_out"], ["rd2_out"], "reduce", axes=[-1], keepdims=1), diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py index 091d38d9e6797..61b2e2249e7a3 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_2.py @@ -1,11 +1,11 @@ -from enum import Enum +from enum import Enum # noqa: F401 -import numpy as np +import numpy as np # noqa: F401 import onnx from onnx import OperatorSetIdProto, TensorProto, helper -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # LayerNormWithCast2 subgraph helper.make_node("ReduceMean", ["A"], ["rd1_out"], "reduce", axes=[-1]), helper.make_node("Sub", ["A", "rd1_out"], ["sub1_out"], "sub"), diff --git a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_3.py b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_3.py index a2f3928b5eaf2..f32caed5d25fe 100644 --- a/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_3.py +++ b/onnxruntime/test/testdata/transform/fusion/layer_norm_with_cast_3.py @@ -2,7 +2,7 @@ from onnx import OperatorSetIdProto, TensorProto, helper -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # LayerNormWithCast3 subgraph helper.make_node("ReduceMean", ["A"], ["rd1_out"], "reduce", axes=[-1]), helper.make_node("Sub", ["A", "rd1_out"], ["sub1_out"], "sub"), @@ -51,7 +51,7 @@ def GenerateModel(model_name): GenerateModel("layer_norm_with_cast_3.onnx") -def GenerateModel2(model_name): +def GenerateModel2(model_name): # noqa: N802 nodes = [ # LayerNormWithCast4 subgraph helper.make_node("Cast", ["A"], ["cast_A"], "cast1", to=1), helper.make_node("ReduceMean", ["cast_A"], ["rd1_out"], "reduce", axes=[-1]), diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index 7bba71723b2c8..018e5fb332dd0 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -1,10 +1,10 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import TensorProto, helper -def MakeSubGraph(suffix, has_bias): +def MakeSubGraph(suffix, has_bias): # noqa: N802 mul_bottom_output = "mul_output" + suffix if has_bias else "output" + suffix nodes = [ helper.make_node( @@ -49,7 +49,7 @@ def MakeSubGraph(suffix, has_bias): return nodes -def MakeInitializer(suffix): +def MakeInitializer(suffix): # noqa: N802 return [ helper.make_tensor("b_quantized" + suffix, 
TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]), helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]), @@ -57,7 +57,7 @@ def MakeInitializer(suffix): ] -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node( "DynamicQuantizeLinear", diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py b/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py index 68d4cab6dd6d9..afd8259471342 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_scale_gen.py @@ -95,7 +95,7 @@ def gen_unfusable(model_path, unfusable_type): elif unfusable_type == UNFUSABLE_SCALE_NOT_CONSTANT: scale_node = helper.make_node("Mul", ["input_0", "input_0"], ["scaled_input_0"], "scale input_0") else: - raise ValueError("Invalid unfusable_type: {}".format(unfusable_type)) + raise ValueError(f"Invalid unfusable_type: {unfusable_type}") nodes = [ scale_node, @@ -191,7 +191,6 @@ def gen_int32(model_path): def gen_scale_input(model_path): - nodes = [ helper.make_node("Mul", ["input_0", "scale"], ["scaled_input_0"], "scale input_0"), helper.make_node( diff --git a/onnxruntime/test/testdata/transform/fusion/not_where.py b/onnxruntime/test/testdata/transform/fusion/not_where.py index 7e48164d5161a..82a128153ac70 100644 --- a/onnxruntime/test/testdata/transform/fusion/not_where.py +++ b/onnxruntime/test/testdata/transform/fusion/not_where.py @@ -1,4 +1,4 @@ -from enum import Enum +from enum import Enum # noqa: F401 import onnx from onnx import OperatorSetIdProto, TensorProto, helper @@ -18,7 +18,7 @@ kwargs["opset_imports"] = opsets -def GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # subgraph # float helper.make_node("Not", ["X"], ["not_X_1"], "not_1"), diff --git a/onnxruntime/test/testdata/transform/fusion/skip_layer_norm_gen.py b/onnxruntime/test/testdata/transform/fusion/skip_layer_norm_gen.py index 07f3411d4a129..0ebc5b11e08b6 100644 --- a/onnxruntime/test/testdata/transform/fusion/skip_layer_norm_gen.py +++ b/onnxruntime/test/testdata/transform/fusion/skip_layer_norm_gen.py @@ -10,7 +10,7 @@ class Format(Enum): Format3 = 3 -def GenerateModel(format, model_name, multi_output_add=False, add_output_in_graph_output=False): +def GenerateModel(format, model_name, multi_output_add=False, add_output_in_graph_output=False): # noqa: N802 nodes = [ # LayerNorm subgraph helper.make_node("ReduceMean", ["ln_in"], ["rd1_out"], "reduce1", axes=[-1], keepdims=1), helper.make_node("Sub", ["ln_in", "rd1_out"], ["sb1_out"], "sub1"), diff --git a/onnxruntime/test/testdata/transform/id-elim.py b/onnxruntime/test/testdata/transform/id-elim.py index 838fbb1f4a798..1f7b6e2607702 100644 --- a/onnxruntime/test/testdata/transform/id-elim.py +++ b/onnxruntime/test/testdata/transform/id-elim.py @@ -1,6 +1,6 @@ -import numpy as np +import numpy as np # noqa: F401 import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [4, 4]) X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [4, 4]) diff --git a/onnxruntime/test/testdata/transform/id-scan9_sum.py b/onnxruntime/test/testdata/transform/id-scan9_sum.py index f2a7de656c8ee..7ffd2e21b7333 100644 --- a/onnxruntime/test/testdata/transform/id-scan9_sum.py +++ b/onnxruntime/test/testdata/transform/id-scan9_sum.py @@ 
-1,6 +1,6 @@ -import numpy as np +import numpy as np # noqa: F401 import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 initial = helper.make_tensor_value_info("initial", TensorProto.FLOAT, [2]) x = helper.make_tensor_value_info("x", TensorProto.FLOAT, [3, 2]) diff --git a/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py index 323ebf08e4acd..503d860baab67 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/bart_mlp_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 hidden_size = 4 weight_dim_to_split = 16 diff --git a/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py index 596b294ca27ae..20bdebead3dac 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/bart_self_attention_megatron_basic_test.py @@ -1,8 +1,8 @@ -import random +import random # noqa: F401 import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 batch = 6 hidden_size = 4 diff --git a/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py index b26d384cbb4c9..07487ee4880ed 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/mlp_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import AttributeProto, GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 hidden_size = 4 weight_dim_to_split = 16 diff --git a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py index 5083ceeb434db..c57024538f5b2 100644 --- a/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py +++ b/onnxruntime/test/testdata/transform/model_parallel/self_attention_megatron_basic_test.py @@ -1,6 +1,6 @@ import numpy as np import onnx -from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper +from onnx import GraphProto, OperatorSetIdProto, TensorProto, helper, numpy_helper # noqa: F401 hidden_size = 4 attention_head = 2 diff --git a/onnxruntime/test/testdata/transform/noop-add.py b/onnxruntime/test/testdata/transform/noop-add.py index 11dbc7269e816..8ceba99bb7c62 100644 --- a/onnxruntime/test/testdata/transform/noop-add.py +++ b/onnxruntime/test/testdata/transform/noop-add.py @@ -16,7 +16,7 @@ kwargs["opset_imports"] = opsets -def 
GenerateModel(model_name): +def GenerateModel(model_name): # noqa: N802 nodes = [ # subgraph # float helper.make_node("Identity", ["X1"], ["id_1"], "id_1"), diff --git a/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py b/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py index 910ff93a32ead..dca1cbaf1d3d7 100644 --- a/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py +++ b/onnxruntime/test/testdata/transform/propagate_cast/gen_propagate_cast.py @@ -29,9 +29,7 @@ def save(model_path, nodes, inputs, outputs, initializers): def gen_fuse_back2back_casts(model_path): - - for (type1, type2) in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)): - + for type1, type2 in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)): nodes = [ helper.make_node("MatMul", ["input_0", "input_1"], ["product"], "MatMul_0"), helper.make_node("Cast", ["product"], ["product_cast"], "Cast_0", to=type1), @@ -64,8 +62,7 @@ def gen_fuse_back2back_casts(model_path): def gen_fuse_sibling_casts(model_path): - - for (type1, type2) in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)): + for type1, type2 in list(itertools.product([TensorProto.FLOAT, TensorProto.FLOAT16], repeat=2)): input_type = ( type2 if type1 != type2 else (TensorProto.FLOAT16 if type1 == TensorProto.FLOAT else TensorProto.FLOAT) ) @@ -298,10 +295,10 @@ def do_transpose(output_0, output_1, transpose, nodes): def gen_bool_to_float16_cast(model_path): - X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [1, 1]) - X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [1, 1]) - X3 = helper.make_tensor_value_info("x3", TensorProto.FLOAT, [1, 1]) - Y = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [1, 1]) + X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [1, 1]) # noqa: N806 + X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [1, 1]) # noqa: N806 + X3 = helper.make_tensor_value_info("x3", TensorProto.FLOAT, [1, 1]) # noqa: N806 + Y = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [1, 1]) # noqa: N806 less1 = helper.make_node("Less", ["x1", "x2"], ["less1"], name="less1") cast1 = helper.make_node("Cast", ["less1"], ["cast1"], name="cast1", to=TensorProto.FLOAT16) @@ -312,10 +309,10 @@ def gen_bool_to_float16_cast(model_path): def gen_bool_to_float_cast(model_path): - X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [1, 1]) - X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [1, 1]) - X3 = helper.make_tensor_value_info("x3", TensorProto.FLOAT16, [1, 1]) - Y = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [1, 1]) + X1 = helper.make_tensor_value_info("x1", TensorProto.INT64, [1, 1]) # noqa: N806 + X2 = helper.make_tensor_value_info("x2", TensorProto.INT64, [1, 1]) # noqa: N806 + X3 = helper.make_tensor_value_info("x3", TensorProto.FLOAT16, [1, 1]) # noqa: N806 + Y = helper.make_tensor_value_info("output", TensorProto.FLOAT16, [1, 1]) # noqa: N806 less1 = helper.make_node("Less", ["x1", "x2"], ["less1"], name="less1") cast1 = helper.make_node("Cast", ["less1"], ["cast1"], name="cast1", to=TensorProto.FLOAT) @@ -327,11 +324,11 @@ def gen_bool_to_float_cast(model_path): def gen_one_input_one_output_test(op, model_path, axes_attribute=False): - X = helper.make_tensor_value_info("x", TensorProto.FLOAT16, [2, 2]) + X = helper.make_tensor_value_info("x", TensorProto.FLOAT16, [2, 2]) # noqa: N806 output_shape = [2, 2] if op == 
"Unsqueeze": output_shape.append(1) - Y = helper.make_tensor_value_info("y", TensorProto.FLOAT16, output_shape) + Y = helper.make_tensor_value_info("y", TensorProto.FLOAT16, output_shape) # noqa: N806 node_inputs = [] graph_inputs = [X] cast1 = helper.make_node("Cast", ["x"], ["cast1"], name="cast1", to=TensorProto.FLOAT) diff --git a/onnxruntime/test/testdata/transform/qdq_conv_gen.py b/onnxruntime/test/testdata/transform/qdq_conv_gen.py index a8c4d64bb2999..986a13d61fdf8 100644 --- a/onnxruntime/test/testdata/transform/qdq_conv_gen.py +++ b/onnxruntime/test/testdata/transform/qdq_conv_gen.py @@ -3,7 +3,7 @@ # Generate a basic QDQ Conv model with `num_convs` Conv nodes and their surrounding DQ/Q nodes -def GenerateModel(model_path, num_convs): +def GenerateModel(model_path, num_convs): # noqa: N802 nodes = [] initializers = [] inputs = [] @@ -12,7 +12,7 @@ def GenerateModel(model_path, num_convs): for i in range(num_convs): def name(base): - return f"{base}_{i}" + return f"{base}_{i}" # noqa: B023 nodes.extend( [ diff --git a/orttraining/orttraining/eager/opgen/onnxgen.py b/orttraining/orttraining/eager/opgen/onnxgen.py index 87c4036f48b0a..960c790ee2cab 100755 --- a/orttraining/orttraining/eager/opgen/onnxgen.py +++ b/orttraining/orttraining/eager/opgen/onnxgen.py @@ -5,6 +5,7 @@ import os.path as path from sys import argv + from onnx import defs out_file = path.join(path.dirname(path.realpath(__file__)), "opgen", "onnxops.py") @@ -38,7 +39,7 @@ def convert_to_aten_type(onnx_type_strs): return result -with open(out_file, "wt") as fp: +with open(out_file, "w") as fp: def write(s): fp.write(s) @@ -46,20 +47,20 @@ def write(s): def writeline(s=""): fp.write(s + "\n") - writeline(f"# AUTO-GENERATED CODE! - DO NOT EDIT!") + writeline("# AUTO-GENERATED CODE! 
- DO NOT EDIT!") writeline(f'# $ python {" ".join(argv)}') writeline() writeline("from opgen.generator import ONNXAttr, ONNXOp, AttrType") writeline() - for op_name, schema in sorted(onnx_ops.items()): + for _op_name, schema in sorted(onnx_ops.items()): writeline(f"class {schema.name}(ONNXOp):") - writeline(f' """') + writeline(' """') doc_str = schema.doc.strip("\r\n") for doc_line in str.splitlines(doc_str, keepends=False): writeline(f" {doc_line}") - writeline(f' """') + writeline(' """') writeline() write(" def __init__(self") @@ -68,7 +69,7 @@ def writeline(s=""): if len(schema.attributes) > 0: writeline(",") - for i, (k, attr) in enumerate(schema.attributes.items()): + for i, (_k, attr) in enumerate(schema.attributes.items()): write(f" {attr.name}=None") if i < len(schema.attributes) - 1: writeline(", ") @@ -88,7 +89,7 @@ def writeline(s=""): if len(schema.attributes) > 0: writeline(",") - for i, (k, attr) in enumerate(schema.attributes.items()): + for i, (_k, attr) in enumerate(schema.attributes.items()): write(f" {attr.name}=ONNXAttr({attr.name}, {attr.type})") if i < len(schema.attributes) - 1: writeline(", ") @@ -97,7 +98,7 @@ def writeline(s=""): writeline() writeline("onnx_ops = {") - for i, (op_name, schema) in enumerate(onnx_ops.items()): + for i, (op_name, schema) in enumerate(onnx_ops.items()): # noqa: B007 writeline(f" '{op_name}': {schema.name},") write("}") diff --git a/orttraining/orttraining/eager/opgen/opgen.py b/orttraining/orttraining/eager/opgen/opgen.py index ef216e71cc188..2de508f2ee4eb 100755 --- a/orttraining/orttraining/eager/opgen/opgen.py +++ b/orttraining/orttraining/eager/opgen/opgen.py @@ -38,7 +38,7 @@ print(f"INFO: Using RegistrationDeclarations from: {regdecs_path}") output = sys.stdout if args.output_file: - output = open(args.output_file, "wt") + output = open(args.output_file, "w") # noqa: SIM115 with CPPParser(regdecs_path) as parser, SourceWriter(output) as writer: ortgen.run(parser, writer) diff --git a/orttraining/orttraining/eager/opgen/opgen/ast.py b/orttraining/orttraining/eager/opgen/opgen/ast.py index f41a93712aa51..df9d2f9a6b896 100644 --- a/orttraining/orttraining/eager/opgen/opgen/ast.py +++ b/orttraining/orttraining/eager/opgen/opgen/ast.py @@ -2,11 +2,12 @@ # Licensed under the MIT License. 
import io -from typing import TextIO, List, Union +from typing import List, TextIO, Union + from opgen.lexer import Token -class Node(object): +class Node: def __init__(self): self.tokens = [] diff --git a/orttraining/orttraining/eager/opgen/opgen/atenops.py b/orttraining/orttraining/eager/opgen/opgen/atenops.py index b8b54b5d6ce84..70f97a669481e 100644 --- a/orttraining/orttraining/eager/opgen/opgen/atenops.py +++ b/orttraining/orttraining/eager/opgen/opgen/atenops.py @@ -1,13 +1,13 @@ -from copy import deepcopy +from copy import deepcopy # noqa: F401 import torch from opgen.generator import MakeTorchFallback, ONNXOp, SignatureOnly -from opgen.onnxops import * +from opgen.onnxops import * # noqa: F403 from packaging import version TORCH_API_CHANGE_VERSION = "1.11.1" -kMSDomain = "onnxruntime::kMSDomain" +kMSDomain = "onnxruntime::kMSDomain" # noqa: N816 class ReluGrad(ONNXOp): @@ -55,7 +55,7 @@ def __init__(self, dY, Y, axis=None): ], dY, Y, - axis=ONNXAttr(axis, AttrType.INT), + axis=ONNXAttr(axis, AttrType.INT), # noqa: F405 ) self.domain = kMSDomain @@ -75,7 +75,7 @@ def __init__(self, dY, Y, axis=None): ], dY, Y, - axis=ONNXAttr(axis, AttrType.INT), + axis=ONNXAttr(axis, AttrType.INT), # noqa: F405 ) self.domain = kMSDomain @@ -134,13 +134,13 @@ def __init__(self, dY, Y, axis=None): ] for unary_op in unary_ops_with_out: - ops[f"aten::{unary_op}.out"] = onnx_ops[unary_op]("self") + ops[f"aten::{unary_op}.out"] = onnx_ops[unary_op]("self") # noqa: F405 for unary_op in unary_ops_with_inplace: - ops[f"aten::{unary_op}_"] = onnx_ops[unary_op]("self") + ops[f"aten::{unary_op}_"] = onnx_ops[unary_op]("self") # noqa: F405 for unary_op in unary_ops: - ops[f"aten::{unary_op}"] = onnx_ops[unary_op]("self") + ops[f"aten::{unary_op}"] = onnx_ops[unary_op]("self") # noqa: F405 # Notes on Onnx op mapping # @@ -162,41 +162,59 @@ def __init__(self, dY, Y, axis=None): "aten::as_strided": SignatureOnly(), # manually implement Slice using stride and offset. 
"aten::slice.Tensor": SignatureOnly(), - "aten::addmm": Gemm("mat1", "mat2", "self", alpha="alpha", beta="beta"), - "aten::t": Transpose("self"), + "aten::addmm": Gemm("mat1", "mat2", "self", alpha="alpha", beta="beta"), # noqa: F405 + "aten::t": Transpose("self"), # noqa: F405 # MatMul("self", "mat2"), fails since it resizes based on self but should be based on result shape of the mult "aten::mm.out": SignatureOnly(), - "aten::zeros_like": ConstantOfShape( - Shape("self") + "aten::zeros_like": ConstantOfShape( # noqa: F405 + Shape("self") # noqa: F405 ), # the default constant is 0, so don't need to speicify attribute - "aten::sum.dim_IntList": ReduceSum("self", "dim", keepdims="keepdim"), + "aten::sum.dim_IntList": ReduceSum("self", "dim", keepdims="keepdim"), # noqa: F405 "aten::threshold_backward": ReluGrad("grad_output", "self"), - "aten::fmod.Scalar": Mod("self", "other", fmod=1), - "aten::fmod.Tensor": Mod("self", "other", fmod=1), - "aten::softshrink": Shrink("self", bias="lambd", lambd="lambd"), # yes, bias is set to 'lambd' - "aten::hardshrink": Shrink("self", bias=0, lambd="lambd"), + "aten::fmod.Scalar": Mod("self", "other", fmod=1), # noqa: F405 + "aten::fmod.Tensor": Mod("self", "other", fmod=1), # noqa: F405 + "aten::softshrink": Shrink("self", bias="lambd", lambd="lambd"), # yes, bias is set to 'lambd' # noqa: F405 + "aten::hardshrink": Shrink("self", bias=0, lambd="lambd"), # noqa: F405 "aten::gelu": Gelu("self"), - "aten::max": ReduceMax("self", keepdims=0), - "aten::min": ReduceMin("self", keepdims=0), + "aten::max": ReduceMax("self", keepdims=0), # noqa: F405 + "aten::min": ReduceMin("self", keepdims=0), # noqa: F405 "aten::cat.out": SignatureOnly(), "aten::fill_.Scalar": SignatureOnly(), - "aten::ne.Scalar_out": Cast(Not(Equal("self", "other")), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::ne.Tensor_out": Cast(Not(Equal("self", "other")), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::eq.Tensor_out": Cast(Equal("self", "other"), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::eq.Scalar_out": Cast(Equal("self", "other"), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::bitwise_and.Tensor_out": And("self", "other"), # This generates a fallback for all but Bool, as expected. - "aten::masked_select": GatherND("self", Transpose(NonZero(Expand("mask", Shape("self"))))), + "aten::ne.Scalar_out": Cast( # noqa: F405 + Not(Equal("self", "other")), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::ne.Tensor_out": Cast( # noqa: F405 + Not(Equal("self", "other")), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::eq.Tensor_out": Cast( # noqa: F405 + Equal("self", "other"), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::eq.Scalar_out": Cast( # noqa: F405 + Equal("self", "other"), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::bitwise_and.Tensor_out": And( # noqa: F405 + "self", "other" + ), # This generates a fallback for all but Bool, as expected. + "aten::masked_select": GatherND("self", Transpose(NonZero(Expand("mask", Shape("self"))))), # noqa: F405 "aten::_local_scalar_dense": MakeTorchFallback(), # This function extracts a scalar value from # a tensor with exactly one value; there's no need to try to do this on an ORT device. 
# See CPU impl at pytorch/blob/master/aten/src/ATen/native/Scalar.cpp - "aten::lt.Scalar_out": Cast(Less(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::lt.Tensor_out": Cast(Less(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::gt.Scalar_out": Cast(Greater(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())"), - "aten::gt.Tensor_out": Cast(Greater(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())"), + "aten::lt.Scalar_out": Cast( # noqa: F405 + Less(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::lt.Tensor_out": Cast( # noqa: F405 + Less(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::gt.Scalar_out": Cast( # noqa: F405 + Greater(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), + "aten::gt.Tensor_out": Cast( # noqa: F405 + Greater(A="self", B="other"), to="GetONNXTensorProtoDataType(out.scalar_type())" # noqa: F405 + ), "aten::equal": SignatureOnly(), - "aten::_softmax": Softmax("self", axis="dim"), + "aten::_softmax": Softmax("self", axis="dim"), # noqa: F405 "aten::argmax.out": SignatureOnly(), - "aten::nonzero": Transpose(NonZero("self")), + "aten::nonzero": Transpose(NonZero("self")), # noqa: F405 "aten::nonzero.out": SignatureOnly(), "aten::_log_softmax.out": SignatureOnly(), # NegativeLogLikelihoodLoss is not supported by the CPU Execution Provider so testing is not possible @@ -205,9 +223,9 @@ def __init__(self, dY, Y, axis=None): "aten::nll_loss_backward.grad_input": MakeTorchFallback(), "aten::_softmax_backward_data": SoftmaxGrad("grad_output", "output", axis="dim"), "aten::_log_softmax_backward_data": LogSoftmaxGrad("grad_output", "output", axis="dim"), - "aten::squeeze.dim": Squeeze("self", "dim"), + "aten::squeeze.dim": Squeeze("self", "dim"), # noqa: F405 "aten::squeeze": SignatureOnly(), - "aten::unsqueeze": Unsqueeze(data="self", axes="dim"), + "aten::unsqueeze": Unsqueeze(data="self", axes="dim"), # noqa: F405 # until the generator is modified to include resizing out based on broadcast result, we hand write # add, sub, mul, and div. The majority of the code was generated by using the commented onnx ops. 
"aten::add.out": SignatureOnly(), # Add("self", Mul("alpha", "other")), @@ -224,7 +242,7 @@ def __init__(self, dY, Y, axis=None): # This is done to make sure it is backward and future compatible if version.parse(torch.__version__) < version.parse(TORCH_API_CHANGE_VERSION): hand_implemented["aten::gelu_backward"] = GeluGrad("grad", "self") - hand_implemented["aten::_cat"] = Concat("tensors", "dim") + hand_implemented["aten::_cat"] = Concat("tensors", "dim") # noqa: F405 else: hand_implemented["aten::gelu_backward"] = GeluGrad("grad_output", "self") diff --git a/orttraining/orttraining/eager/opgen/opgen/custom_ops.py b/orttraining/orttraining/eager/opgen/opgen/custom_ops.py index a8031fe7d8635..0c303780a1e7f 100644 --- a/orttraining/orttraining/eager/opgen/opgen/custom_ops.py +++ b/orttraining/orttraining/eager/opgen/opgen/custom_ops.py @@ -1,4 +1,4 @@ -from opgen.onnxops import BatchNormalization, Gemm, Concat +from opgen.onnxops import BatchNormalization, Concat, Gemm ops = { "gemm": Gemm("A", "B", "C", "alpha", "beta", "transA", "transB"), diff --git a/orttraining/orttraining/eager/opgen/opgen/generator.py b/orttraining/orttraining/eager/opgen/opgen/generator.py index 8813ad15ba483..920cc5d3d5d8c 100644 --- a/orttraining/orttraining/eager/opgen/opgen/generator.py +++ b/orttraining/orttraining/eager/opgen/opgen/generator.py @@ -17,7 +17,7 @@ def __init__(self, count: int): self.name = None def __str__(self): - return self.name if self.name else f"" + return self.name if self.name else "" class AttrType: @@ -180,8 +180,7 @@ def run(self, cpp_parser: parser.CPPParser, writer: opgenwriter.SourceWriter): if len(self._mapped_ops) > 0: raise Exception( - "Torch operation(s) could not be parsed for mapping: " - + ", ".join([f"'{o}'" for o in self._mapped_ops.keys()]) + "Torch operation(s) could not be parsed for mapping: " + ", ".join([f"'{o}'" for o in self._mapped_ops]) ) def _write_file_prelude(self, writer: opgenwriter.SourceWriter): @@ -223,7 +222,7 @@ def _write_function_signature(self, writer: opgenwriter.SourceWriter, cpp_func: writer.write(")") def _write_cpu_fall_back(self, writer: opgenwriter.SourceWriter, mapped_func: MappedOpFunction): - onnx_op, cpp_func = mapped_func.onnx_op, mapped_func.cpp_func + onnx_op, cpp_func = mapped_func.onnx_op, mapped_func.cpp_func # noqa: F841 # return at::native::call_fallback_fn< # &at::native::cpu_fallback, # ATEN_OP(eq_Tensor)>::call(self, other); @@ -282,10 +281,10 @@ def _write_function_body_onnx_op_node_attributes(self, writer, onnx_op, attrs, a if attr.type.startswith("at::ScalarType::"): writer.write(f", {attr.type}") elif attr.type == AttrType.TENSOR: - writer.write(f", true") + writer.write(", true") elif attr.type != AttrType.STRING: raise FunctionGenerationError( - cpp_func, + cpp_func, # noqa: F821 f'Unsure how how to map ONNX op "{onnx_op.name}" attribute ' + f'"{attr_name}" of type "{attr.type}" to a call to ' + "create_ort_attribute. 
Please teach generator.py.", @@ -432,9 +431,9 @@ def _write_function_body_return_multiple(self, writer, cpp_func, in_place_params isinstance(cpp_func.return_type, ast.TemplateType) and cpp_func.return_type.identifier_tokens[-1].value == "std::tuple" ): - raise Exception(f"") - tensorRef = "Tensor&," * len(in_place_params) - tensorRef = tensorRef[: len(tensorRef) - 1] + raise Exception("") + tensorRef = "Tensor&," * len(in_place_params) # noqa: N806 + tensorRef = tensorRef[: len(tensorRef) - 1] # noqa: N806 writer.write(f"return std::tuple<{tensorRef}>(") for index, key in enumerate(sorted(in_place_params)): if index > 0: @@ -486,7 +485,7 @@ def _write_function_body_first_param_assert(self, writer, first_param): not isinstance(first_param.parameter_type.desugar(), ast.ConcreteType) or "Tensor" not in first_param.parameter_type.desugar().identifier_tokens[0].value ): - raise FunctionGenerationError(cpp_func, "First parameter must be an at::Tensor") + raise FunctionGenerationError(cpp_func, "First parameter must be an at::Tensor") # noqa: F821 # Generates code to get an ORT Invoker for the device from the first param. # The Invoker will be used and reused in _write_function_body_onnx_op_invocation. @@ -776,7 +775,7 @@ def _parse_mapped_function_decls(self, cpp_parser: parser.CPPParser): try: op_namespace = op_name[0 : op_name.index("::")] op_namewithoutnamespace = op_name[len(op_namespace) + 2 :] - except: + except Exception: op_namespace = None op_namewithoutnamespace = op_name @@ -802,12 +801,12 @@ def _parse_function_decls(self, cpp_parser: parser.CPPParser): # Parse the Torch schema from the JSON comment that follows each C++ decl # and link associated Torch and C++ decls (functions, parameters, returns) for cpp_func in tu: - hasSchema = False + hasSchema = False # noqa: N806 if cpp_func.semicolon and cpp_func.semicolon.trailing_trivia: for trivia in cpp_func.semicolon.trailing_trivia: if trivia.kind == lexer.TokenKind.SINGLE_LINE_COMMENT: yield self._parse_and_link_torch_function_decl(cpp_func, trivia) - hasSchema = True + hasSchema = True # noqa: N806 break if not hasSchema: @@ -816,7 +815,7 @@ def _parse_function_decls(self, cpp_parser: parser.CPPParser): yield cpp_func def _parse_and_link_torch_function_decl(self, cpp_func: ast.FunctionDecl, torch_schema_comment_trivia: lexer.Token): - metadata = json.loads(torch_schema_comment_trivia.value.lstrip("//")) + metadata = json.loads(torch_schema_comment_trivia.value.lstrip("//")) # noqa: B005 schema = metadata["schema"] schema_parser = parser.torch_create_from_string(schema) diff --git a/orttraining/orttraining/eager/opgen/opgen/lexer.py b/orttraining/orttraining/eager/opgen/opgen/lexer.py index 661d646350f53..7c60fb1a1fc3f 100644 --- a/orttraining/orttraining/eager/opgen/opgen/lexer.py +++ b/orttraining/orttraining/eager/opgen/opgen/lexer.py @@ -1,12 +1,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-from enum import Enum from abc import ABC -from typing import List, Optional, Union, Tuple +from enum import Enum +from typing import List, Optional, Tuple, Union -class SourceLocation(object): +class SourceLocation: def __init__(self, offset: int = 0, line: int = 1, column: int = 1): self.offset = offset self.line = line @@ -63,7 +63,7 @@ class TokenKind(Enum): ARROW = 27 -class Token(object): +class Token: def __init__( self, location: Union[SourceLocation, Tuple[int, int, int]], @@ -72,7 +72,7 @@ def __init__( leading_trivia: Optional[List["Token"]] = None, trailing_trivia: Optional[List["Token"]] = None, ): - if isinstance(location, tuple) or isinstance(location, list): + if isinstance(location, (tuple, list)): location = SourceLocation(location[0], location[1], location[2]) self.location = location @@ -91,10 +91,7 @@ def is_trivia(self) -> bool: def has_trailing_trivia(self, trivia_kind: TokenKind) -> bool: if not self.trailing_trivia: return False - for trivia in self.trailing_trivia: - if trivia.kind == trivia_kind: - return True - return False + return any(trivia.kind == trivia_kind for trivia in self.trailing_trivia) def __str__(self) -> str: return f"{self.location}: [{self.kind}] '{self.value}'" @@ -120,11 +117,11 @@ def __eq__(self, other) -> bool: ) -class Reader(ABC): - def open(self): +class Reader(ABC): # noqa: B024 + def open(self): # noqa: B027 pass - def close(self): + def close(self): # noqa: B027 pass def read_char(self) -> str: @@ -158,7 +155,7 @@ def read_char(self) -> str: return None -class Lexer(object): +class Lexer: _peek: str _next_token: Token _first_token_leading_trivia: List[Token] diff --git a/orttraining/orttraining/eager/opgen/opgen/onnxops.py b/orttraining/orttraining/eager/opgen/opgen/onnxops.py index 98a2dd4d5997e..f3ecf4f1bb479 100644 --- a/orttraining/orttraining/eager/opgen/opgen/onnxops.py +++ b/orttraining/orttraining/eager/opgen/opgen/onnxops.py @@ -1,7 +1,7 @@ # AUTO-GENERATED CODE! - DO NOT EDIT! # $ python onnxgen.py -from opgen.generator import ONNXAttr, ONNXOp, AttrType +from opgen.generator import AttrType, ONNXAttr, ONNXOp class Abs(ONNXOp): @@ -3037,7 +3037,7 @@ class MaxUnpool(ONNXOp): pooling op that the unpooling op is trying to invert. """ - def __init__(self, X, I, output_shape, kernel_shape=None, pads=None, strides=None): + def __init__(self, X, I, output_shape, kernel_shape=None, pads=None, strides=None): # noqa: E741 super().__init__( "MaxUnpool", 1, diff --git a/orttraining/orttraining/eager/opgen/opgen/parser.py b/orttraining/orttraining/eager/opgen/opgen/parser.py index c1ba7e8378c5b..fee8e850246a0 100644 --- a/orttraining/orttraining/eager/opgen/opgen/parser.py +++ b/orttraining/orttraining/eager/opgen/opgen/parser.py @@ -1,32 +1,33 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-from opgen.lexer import * -from opgen.ast import * -from typing import List, Tuple, Union, Optional +from typing import List, Optional, Tuple, Union + +from opgen.ast import * # noqa: F403 +from opgen.lexer import * # noqa: F403 class UnexpectedTokenError(RuntimeError): - def __init__(self, expected: TokenKind, actual: Token): + def __init__(self, expected: TokenKind, actual: Token): # noqa: F405 self.expected = expected self.actual = actual super().__init__(f"unexpected token {actual}; expected {expected}") class ExpectedSyntaxError(RuntimeError): - def __init__(self, expected: str, actual: Token = None): + def __init__(self, expected: str, actual: Token = None): # noqa: F405 super().__init__(f"expected {expected}; actual {actual}") -class ParserBase(object): - _peek_queue: List[Token] +class ParserBase: + _peek_queue: List[Token] # noqa: F405 - def __init__(self, lexer: Union[Lexer, Reader]): + def __init__(self, lexer: Union[Lexer, Reader]): # noqa: F405 self._own_lexer = False - if isinstance(lexer, Reader): + if isinstance(lexer, Reader): # noqa: F405 self._own_lexer = True - lexer = Lexer(lexer) - elif not isinstance(lexer, Lexer): + lexer = Lexer(lexer) # noqa: F405 + elif not isinstance(lexer, Lexer): # noqa: F405 raise TypeError("lexer must be a Lexer or Reader") self._lexer = lexer self._peek_queue = [] @@ -40,17 +41,17 @@ def __exit__(self, exc_type, exc_val, exc_tb): if self._own_lexer: self._lexer.__exit__(exc_type, exc_val, exc_tb) - def set_source_location(self, origin: SourceLocation): + def set_source_location(self, origin: SourceLocation): # noqa: F405 self._lexer.set_source_location(origin) def _peek_token( - self, kinds: Union[TokenKind, List[TokenKind]] = None, value: str = None, look_ahead: int = 1 - ) -> Optional[Token]: + self, kinds: Union[TokenKind, List[TokenKind]] = None, value: str = None, look_ahead: int = 1 # noqa: F405 + ) -> Optional[Token]: # noqa: F405 if look_ahead < 1: raise IndexError("look_ahead must be at least 1") if look_ahead >= len(self._peek_queue): for _ in range(look_ahead - len(self._peek_queue)): - self._peek_queue = [self._lexer.lex()] + self._peek_queue + self._peek_queue = [self._lexer.lex(), *self._peek_queue] peek = self._peek_queue[-look_ahead] if not kinds: return peek @@ -63,10 +64,10 @@ def _peek_token( return peek return None - def _read_token(self) -> Token: + def _read_token(self) -> Token: # noqa: F405 return self._peek_queue.pop() if self._peek_queue else self._lexer.lex() - def _expect_token(self, kind: TokenKind) -> Token: + def _expect_token(self, kind: TokenKind) -> Token: # noqa: F405 token = self._read_token() if token.kind != kind: raise UnexpectedTokenError(kind, token) @@ -74,12 +75,12 @@ def _expect_token(self, kind: TokenKind) -> Token: def _parse_list( self, - open_token_kind: TokenKind, - separator_token_kind: TokenKind, - close_token_kind: TokenKind, + open_token_kind: TokenKind, # noqa: F405 + separator_token_kind: TokenKind, # noqa: F405 + close_token_kind: TokenKind, # noqa: F405 member_parser: callable, - ) -> SyntaxList: - syntax_list = SyntaxList() + ) -> SyntaxList: # noqa: F405 + syntax_list = SyntaxList() # noqa: F405 if open_token_kind: syntax_list.open_token = self._expect_token(open_token_kind) while True: @@ -94,250 +95,261 @@ def _parse_list( syntax_list.close_token = self._expect_token(close_token_kind) return syntax_list - def parse_translation_unit(self) -> TranslationUnitDecl: + def parse_translation_unit(self) -> TranslationUnitDecl: # noqa: F405 decls = [] - while not 
self._peek_token(TokenKind.EOF): + while not self._peek_token(TokenKind.EOF): # noqa: F405 decls.append(self.parse_function()) - return TranslationUnitDecl(decls) + return TranslationUnitDecl(decls) # noqa: F405 - def parse_function_parameter_default_value_expression(self) -> Expression: + def parse_function_parameter_default_value_expression(self) -> Expression: # noqa: F405 return self.parse_expression() - def parse_function_parameter(self) -> ParameterDecl: + def parse_function_parameter(self) -> ParameterDecl: # noqa: F405 parameter_type = self.parse_type() - if not self._peek_token(TokenKind.IDENTIFIER): - return ParameterDecl(parameter_type) + if not self._peek_token(TokenKind.IDENTIFIER): # noqa: F405 + return ParameterDecl(parameter_type) # noqa: F405 parameter_name = self._read_token() - if not self._peek_token(TokenKind.EQUALS): - return ParameterDecl(parameter_type, parameter_name) + if not self._peek_token(TokenKind.EQUALS): # noqa: F405 + return ParameterDecl(parameter_type, parameter_name) # noqa: F405 - return ParameterDecl( + return ParameterDecl( # noqa: F405 parameter_type, parameter_name, self._read_token(), self.parse_function_parameter_default_value_expression() ) - def parse_function_parameters(self) -> SyntaxList: + def parse_function_parameters(self) -> SyntaxList: # noqa: F405 return self._parse_list( - TokenKind.OPEN_PAREN, TokenKind.COMMA, TokenKind.CLOSE_PAREN, self.parse_function_parameter + TokenKind.OPEN_PAREN, TokenKind.COMMA, TokenKind.CLOSE_PAREN, self.parse_function_parameter # noqa: F405 ) - def parse_function(self) -> FunctionDecl: + def parse_function(self) -> FunctionDecl: # noqa: F405 raise NotImplementedError() - def parse_expression(self) -> Expression: + def parse_expression(self) -> Expression: # noqa: F405 raise NotImplementedError() - def parse_type(self) -> Type: + def parse_type(self) -> Type: # noqa: F405 raise NotImplementedError() class CPPParser(ParserBase): - def parse_function(self) -> FunctionDecl: + def parse_function(self) -> FunctionDecl: # noqa: F405 return_type = self.parse_type() - return FunctionDecl( - self._expect_token(TokenKind.IDENTIFIER), + return FunctionDecl( # noqa: F405 + self._expect_token(TokenKind.IDENTIFIER), # noqa: F405 self.parse_function_parameters(), return_type, - semicolon=self._expect_token(TokenKind.SEMICOLON), + semicolon=self._expect_token(TokenKind.SEMICOLON), # noqa: F405 ) - def parse_expression(self) -> Expression: + def parse_expression(self) -> Expression: # noqa: F405 if ( - self._peek_token(TokenKind.IDENTIFIER) - or self._peek_token(TokenKind.NUMBER) - or self._peek_token(TokenKind.STRING) + self._peek_token(TokenKind.IDENTIFIER) # noqa: F405 + or self._peek_token(TokenKind.NUMBER) # noqa: F405 + or self._peek_token(TokenKind.STRING) # noqa: F405 ): - return LiteralExpression(self._read_token()) + return LiteralExpression(self._read_token()) # noqa: F405 else: raise UnexpectedTokenError("expression", self._peek_token()) - def parse_type(self) -> Type: - if self._peek_token(TokenKind.IDENTIFIER, "const"): - parsed_type = ConstType(self._read_token(), self.parse_type()) - elif self._peek_token([TokenKind.IDENTIFIER, TokenKind.DOUBLECOLON]): + def parse_type(self) -> Type: # noqa: F405 + if self._peek_token(TokenKind.IDENTIFIER, "const"): # noqa: F405 + parsed_type = ConstType(self._read_token(), self.parse_type()) # noqa: F405 + elif self._peek_token([TokenKind.IDENTIFIER, TokenKind.DOUBLECOLON]): # noqa: F405 identifiers = [] while True: - token = self._peek_token([TokenKind.IDENTIFIER, 
TokenKind.DOUBLECOLON]) + token = self._peek_token([TokenKind.IDENTIFIER, TokenKind.DOUBLECOLON]) # noqa: F405 if not token: break identifiers.append(self._read_token()) - if token.has_trailing_trivia(TokenKind.WHITESPACE): + if token.has_trailing_trivia(TokenKind.WHITESPACE): # noqa: F405 break - if self._peek_token(TokenKind.LESS_THAN): - parsed_type = TemplateType( + if self._peek_token(TokenKind.LESS_THAN): # noqa: F405 + parsed_type = TemplateType( # noqa: F405 identifiers, self._parse_list( - TokenKind.LESS_THAN, TokenKind.COMMA, TokenKind.GREATER_THAN, self._parse_template_type_argument + TokenKind.LESS_THAN, # noqa: F405 + TokenKind.COMMA, # noqa: F405 + TokenKind.GREATER_THAN, # noqa: F405 + self._parse_template_type_argument, ), ) elif identifiers[-1].value == "TensorOptions": - parsed_type = TensorOptionsType(identifiers) + parsed_type = TensorOptionsType(identifiers) # noqa: F405 else: - parsed_type = ConcreteType(identifiers) + parsed_type = ConcreteType(identifiers) # noqa: F405 else: raise ExpectedSyntaxError("type", self._peek_token()) while True: - if self._peek_token(TokenKind.AND): - parsed_type = ReferenceType(parsed_type, self._read_token()) + if self._peek_token(TokenKind.AND): # noqa: F405 + parsed_type = ReferenceType(parsed_type, self._read_token()) # noqa: F405 else: return parsed_type - def _parse_template_type_argument(self) -> Type: - if self._peek_token(TokenKind.NUMBER): - return ExpressionType(self.parse_expression()) + def _parse_template_type_argument(self) -> Type: # noqa: F405 + if self._peek_token(TokenKind.NUMBER): # noqa: F405 + return ExpressionType(self.parse_expression()) # noqa: F405 return self.parse_type() class TorchParser(ParserBase): - def __init__(self, lexer: Union[Lexer, Reader]): + def __init__(self, lexer: Union[Lexer, Reader]): # noqa: F405 super().__init__(lexer) self._next_anonymous_alias_id = 0 - def parse_function(self) -> FunctionDecl: - return FunctionDecl( - self._expect_token(TokenKind.IDENTIFIER), + def parse_function(self) -> FunctionDecl: # noqa: F405 + return FunctionDecl( # noqa: F405 + self._expect_token(TokenKind.IDENTIFIER), # noqa: F405 self.parse_function_parameters(), - arrow=self._expect_token(TokenKind.ARROW), + arrow=self._expect_token(TokenKind.ARROW), # noqa: F405 return_type=self.parse_type(), ) - def parse_expression(self) -> Expression: + def parse_expression(self) -> Expression: # noqa: F405 if ( - self._peek_token(TokenKind.NUMBER) - or self._peek_token(TokenKind.IDENTIFIER) - or self._peek_token(TokenKind.STRING) + self._peek_token(TokenKind.NUMBER) # noqa: F405 + or self._peek_token(TokenKind.IDENTIFIER) # noqa: F405 + or self._peek_token(TokenKind.STRING) # noqa: F405 ): - return LiteralExpression(self._read_token()) - elif self._peek_token(TokenKind.OPEN_BRACKET): - return ArrayExpression( + return LiteralExpression(self._read_token()) # noqa: F405 + elif self._peek_token(TokenKind.OPEN_BRACKET): # noqa: F405 + return ArrayExpression( # noqa: F405 self._parse_list( - TokenKind.OPEN_BRACKET, TokenKind.COMMA, TokenKind.CLOSE_BRACKET, self.parse_expression + TokenKind.OPEN_BRACKET, # noqa: F405 + TokenKind.COMMA, # noqa: F405 + TokenKind.CLOSE_BRACKET, # noqa: F405 + self.parse_expression, ) ) else: raise UnexpectedTokenError("expression", self._peek_token()) - def _create_alias_info_type(self, parsed_type: Type, alias_info: AliasInfo) -> AliasInfoType: - if isinstance(parsed_type, ModifiedType): - parsed_type.base_type = AliasInfoType(parsed_type.base_type, alias_info) + def _create_alias_info_type(self, 
parsed_type: Type, alias_info: AliasInfo) -> AliasInfoType: # noqa: F405 + if isinstance(parsed_type, ModifiedType): # noqa: F405 + parsed_type.base_type = AliasInfoType(parsed_type.base_type, alias_info) # noqa: F405 else: - parsed_type = AliasInfoType(parsed_type, alias_info) + parsed_type = AliasInfoType(parsed_type, alias_info) # noqa: F405 return parsed_type - def parse_type(self) -> Type: + def parse_type(self) -> Type: # noqa: F405 parsed_type, alias_info = self._parse_type_and_alias() if not alias_info: return parsed_type return self._create_alias_info_type(parsed_type, alias_info) - def _parse_type_and_alias(self) -> Tuple[Type, AliasInfo]: - parsed_type: Type = None - alias_info: AliasInfo = None + def _parse_type_and_alias(self) -> Tuple[Type, AliasInfo]: # noqa: F405 + parsed_type: Type = None # noqa: F405 + alias_info: AliasInfo = None # noqa: F405 - if self._peek_token(TokenKind.MUL): - return (KWArgsSentinelType(self._read_token()), None) + if self._peek_token(TokenKind.MUL): # noqa: F405 + return (KWArgsSentinelType(self._read_token()), None) # noqa: F405 - if self._peek_token(TokenKind.OPEN_PAREN): + if self._peek_token(TokenKind.OPEN_PAREN): # noqa: F405 def parse_tuple_element(): element_type, element_alias_info = self._parse_type_and_alias() if element_alias_info: element_type = self._create_alias_info_type(element_type, element_alias_info) - return TupleMemberType( - element_type, self._read_token() if self._peek_token(TokenKind.IDENTIFIER) else None + return TupleMemberType( # noqa: F405 + element_type, self._read_token() if self._peek_token(TokenKind.IDENTIFIER) else None # noqa: F405 ) - parsed_type = TupleType( - self._parse_list(TokenKind.OPEN_PAREN, TokenKind.COMMA, TokenKind.CLOSE_PAREN, parse_tuple_element) + parsed_type = TupleType( # noqa: F405 + self._parse_list( + TokenKind.OPEN_PAREN, # noqa: F405 + TokenKind.COMMA, # noqa: F405 + TokenKind.CLOSE_PAREN, # noqa: F405 + parse_tuple_element, + ) ) - elif self._peek_token(TokenKind.IDENTIFIER, "Tensor"): - parsed_type = TensorType(self._read_token()) + elif self._peek_token(TokenKind.IDENTIFIER, "Tensor"): # noqa: F405 + parsed_type = TensorType(self._read_token()) # noqa: F405 alias_info = self._parse_torch_alias_info() else: parsed_type = self._parse_torch_base_type() alias_info = self._parse_torch_alias_info() while True: - if self._peek_token(TokenKind.OPEN_BRACKET): - parsed_type = ArrayType( + if self._peek_token(TokenKind.OPEN_BRACKET): # noqa: F405 + parsed_type = ArrayType( # noqa: F405 parsed_type, self._read_token(), - self._read_token() if self._peek_token(TokenKind.NUMBER) else None, - self._expect_token(TokenKind.CLOSE_BRACKET), + self._read_token() if self._peek_token(TokenKind.NUMBER) else None, # noqa: F405 + self._expect_token(TokenKind.CLOSE_BRACKET), # noqa: F405 ) - elif self._peek_token(TokenKind.QUESTION_MARK): - parsed_type = OptionalType(parsed_type, self._read_token()) + elif self._peek_token(TokenKind.QUESTION_MARK): # noqa: F405 + parsed_type = OptionalType(parsed_type, self._read_token()) # noqa: F405 else: return (parsed_type, alias_info) - def _parse_torch_base_type(self) -> Type: + def _parse_torch_base_type(self) -> Type: # noqa: F405 base_type_parsers = { - "int": IntType, - "float": FloatType, - "bool": BoolType, - "str": StrType, - "Scalar": ScalarType, - "ScalarType": ScalarTypeType, - "Dimname": DimnameType, - "Layout": LayoutType, - "Device": DeviceType, - "Generator": GeneratorType, - "MemoryFormat": MemoryFormatType, - "QScheme": QSchemeType, - "Storage": StorageType, - 
"ConstQuantizerPtr": ConstQuantizerPtrType, - "Stream": StreamType, - "SymInt": SymIntType, + "int": IntType, # noqa: F405 + "float": FloatType, # noqa: F405 + "bool": BoolType, # noqa: F405 + "str": StrType, # noqa: F405 + "Scalar": ScalarType, # noqa: F405 + "ScalarType": ScalarTypeType, # noqa: F405 + "Dimname": DimnameType, # noqa: F405 + "Layout": LayoutType, # noqa: F405 + "Device": DeviceType, # noqa: F405 + "Generator": GeneratorType, # noqa: F405 + "MemoryFormat": MemoryFormatType, # noqa: F405 + "QScheme": QSchemeType, # noqa: F405 + "Storage": StorageType, # noqa: F405 + "ConstQuantizerPtr": ConstQuantizerPtrType, # noqa: F405 + "Stream": StreamType, # noqa: F405 + "SymInt": SymIntType, # noqa: F405 } - identifier = self._expect_token(TokenKind.IDENTIFIER) + identifier = self._expect_token(TokenKind.IDENTIFIER) # noqa: F405 base_type_parser = base_type_parsers.get(identifier.value) if not base_type_parser: raise ExpectedSyntaxError("|".join(base_type_parsers.keys()), identifier) base_type = base_type_parser(identifier) return base_type - def _parse_torch_alias_info(self) -> AliasInfo: - alias_info = AliasInfo() + def _parse_torch_alias_info(self) -> AliasInfo: # noqa: F405 + alias_info = AliasInfo() # noqa: F405 def parse_set(alias_set: List[str]): while True: - if self._peek_token(TokenKind.MUL): + if self._peek_token(TokenKind.MUL): # noqa: F405 alias_info.tokens.append(self._read_token()) alias_set.append("*") elif "*" not in alias_set: - identifier = self._expect_token(TokenKind.IDENTIFIER) + identifier = self._expect_token(TokenKind.IDENTIFIER) # noqa: F405 alias_info.tokens.append(identifier) alias_set.append(identifier.value) else: raise ExpectedSyntaxError("alias wildcard * or alias identifier", self._peek_token()) - if self._peek_token(TokenKind.OR): + if self._peek_token(TokenKind.OR): # noqa: F405 alias_info.tokens.append(self._read_token()) else: return - if self._peek_token(TokenKind.OPEN_PAREN): + if self._peek_token(TokenKind.OPEN_PAREN): # noqa: F405 alias_info.tokens.append(self._read_token()) parse_set(alias_info.before_set) - if self._peek_token(TokenKind.EXCLAIMATION_MARK): + if self._peek_token(TokenKind.EXCLAIMATION_MARK): # noqa: F405 alias_info.tokens.append(self._read_token()) alias_info.is_writable = True - if self._peek_token(TokenKind.ARROW): + if self._peek_token(TokenKind.ARROW): # noqa: F405 alias_info.tokens.append(self._read_token()) parse_set(alias_info.after_set) else: # no '->' so assume before and after are identical alias_info.after_set = alias_info.before_set - alias_info.tokens.append(self._expect_token(TokenKind.CLOSE_PAREN)) - elif self._peek_token(TokenKind.EXCLAIMATION_MARK): + alias_info.tokens.append(self._expect_token(TokenKind.CLOSE_PAREN)) # noqa: F405 + elif self._peek_token(TokenKind.EXCLAIMATION_MARK): # noqa: F405 alias_info.is_writable = True alias_info.before_set.append(str(self._next_anonymous_alias_id)) self._next_anonymous_alias_id += 1 @@ -348,16 +360,16 @@ def parse_set(alias_set: List[str]): def cpp_create_from_file(path: str) -> CPPParser: - return CPPParser(FileReader(path)) + return CPPParser(FileReader(path)) # noqa: F405 def cpp_create_from_string(buffer: str) -> CPPParser: - return CPPParser(StringReader(buffer)) + return CPPParser(StringReader(buffer)) # noqa: F405 def torch_create_from_file(path: str) -> TorchParser: - return TorchParser(FileReader(path)) + return TorchParser(FileReader(path)) # noqa: F405 def torch_create_from_string(buffer: str) -> TorchParser: - return TorchParser(StringReader(buffer)) + 
return TorchParser(StringReader(buffer)) # noqa: F405 diff --git a/orttraining/orttraining/eager/opgen/opgen/writer.py b/orttraining/orttraining/eager/opgen/opgen/writer.py index 460a29a879dfc..b5281e1843ed8 100644 --- a/orttraining/orttraining/eager/opgen/opgen/writer.py +++ b/orttraining/orttraining/eager/opgen/opgen/writer.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -from typing import TextIO, List +from typing import List, TextIO class SourceWriter: diff --git a/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py b/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py index 30e78377b2445..8e5b99762c158 100644 --- a/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py +++ b/orttraining/orttraining/eager/opgen/opgen_test/lexer_test.py @@ -3,7 +3,7 @@ import unittest -from opgen.lexer import StringReader, Lexer, Token, TokenKind, SourceLocation +from opgen.lexer import Lexer, SourceLocation, StringReader, Token, TokenKind # noqa: F401 class LexerTestCase(unittest.TestCase): @@ -69,7 +69,7 @@ def assert_number(number): assert_number(number) for number in ["1.2.3", "e1", "-e1", "123e0.5"]: - self.assertRaises(BaseException, lambda: assert_number(number)) + self.assertRaises(BaseException, lambda: assert_number(number)) # noqa: B023 lexer = self.create_lexer("1.2.3.4e5.6") self.assertEqual(Token((0, 1, 1), TokenKind.NUMBER, "1.2"), lexer.lex()) diff --git a/orttraining/orttraining/eager/test/__main__.py b/orttraining/orttraining/eager/test/__main__.py index f188f3c1fc3c3..cd381c050ec00 100644 --- a/orttraining/orttraining/eager/test/__main__.py +++ b/orttraining/orttraining/eager/test/__main__.py @@ -3,8 +3,8 @@ import glob import os -import sys import subprocess +import sys selfdir = os.path.dirname(os.path.realpath(__file__)) diff --git a/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py b/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py index 17318710850ed..dd20d5b9e2878 100644 --- a/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py +++ b/orttraining/orttraining/eager/test/linux_only_ortmodule_eager_test.py @@ -1,7 +1,7 @@ -import os +import os # noqa: F401 import unittest -import numpy as np +import numpy as np # noqa: F401 import torch import torch.nn as nn import torch.nn.functional as F @@ -17,7 +17,7 @@ def my_loss(x, target): class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -31,7 +31,7 @@ def forward(self, x): class NoOpNet(torch.nn.Module): def __init__(self): - super(NoOpNet, self).__init__() + super().__init__() self.dummy_weight = torch.nn.Parameter(torch.ones(128, 128, dtype=torch.float16)) def forward(self, input): @@ -62,7 +62,7 @@ def test_ortmodule_inference(self): with torch.no_grad(): data = torch.rand(batch_size, input_size) - y = model(data.to(device)) + model(data.to(device)) print("Done") @unittest.skip("Test fails with newest pytorch version.") diff --git a/orttraining/orttraining/eager/test/ort_eps_test.py b/orttraining/orttraining/eager/test/ort_eps_test.py index 7a4c8de5c5d25..1b12f0cfef742 100644 --- a/orttraining/orttraining/eager/test/ort_eps_test.py +++ b/orttraining/orttraining/eager/test/ort_eps_test.py @@ -1,24 +1,25 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
# Licensed under the MIT License. -import unittest -import torch -import onnxruntime_pybind11_state as torch_ort import os import sys +import unittest + +import onnxruntime_pybind11_state as torch_ort +import torch def is_windows(): return sys.platform.startswith("win") -from io import StringIO -import sys -import threading -import time +import sys # noqa: E402, F811 +import threading # noqa: E402 +import time # noqa: E402 +from io import StringIO # noqa: E402, F401 -class OutputGrabber(object): +class OutputGrabber: """ Class used to grab standard output or another stream. """ @@ -81,7 +82,7 @@ def stop(self): # Close the duplicate stream: os.close(self.streamfd) - def readOutput(self): + def readOutput(self): # noqa: N802 """ Read the stream data (one byte at a time) and save the text in `capturedtext`. @@ -107,22 +108,22 @@ def test_import_custom_eps(self): # capture std out with OutputGrabber() as out: torch_ort.set_device(1, "TestExecutionProvider", {"device_id": "0", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 0, some_option: val" in out.capturedtext with OutputGrabber() as out: torch_ort.set_device(2, "TestExecutionProvider", {"device_id": "1", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 1, some_option: val" in out.capturedtext # test the reusing EP instance with OutputGrabber() as out: torch_ort.set_device(3, "TestExecutionProvider", {"device_id": "0", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 0, some_option: val" not in out.capturedtext # test clear training ep instance pool torch_ort.clear_training_ep_instances() with OutputGrabber() as out: torch_ort.set_device(3, "TestExecutionProvider", {"device_id": "0", "some_config": "val"}) - ort_device = torch_ort.device(1) + torch_ort.device(1) assert "My EP provider created, with device id: 0, some_option: val" in out.capturedtext @unittest.skip("Test fails with newest pytorch version.") diff --git a/orttraining/orttraining/eager/test/ort_init.py b/orttraining/orttraining/eager/test/ort_init.py index 43602cc6a5fdb..af13d439cbe66 100644 --- a/orttraining/orttraining/eager/test/ort_init.py +++ b/orttraining/orttraining/eager/test/ort_init.py @@ -8,6 +8,7 @@ # after the import, hence this test is isolated from the others. import unittest + import torch @@ -22,7 +23,7 @@ def ort_alloc(): with self.assertRaises(BaseException): ort_alloc() - import onnxruntime_pybind11_state as torch_ort + import onnxruntime_pybind11_state as torch_ort # noqa: F401 ort_alloc() self.assertIn(config_match, torch._C._show_config()) diff --git a/orttraining/orttraining/eager/test/ort_ops.py b/orttraining/orttraining/eager/test/ort_ops.py index cf694432cd7e8..2f6391b982725 100644 --- a/orttraining/orttraining/eager/test/ort_ops.py +++ b/orttraining/orttraining/eager/test/ort_ops.py @@ -41,7 +41,7 @@ def test_type_promotion_add(self): assert ort_result.dtype == torch.float32 assert torch.allclose(cpu_result, ort_result.cpu()) - ## verify setting out to type int while inputs are float cause an error as casting float to int is not allowed. + # verify setting out to type int while inputs are float cause an error as casting float to int is not allowed. 
cpu_out_tensor = torch.tensor([], dtype=torch.int) ort_out_tensor = cpu_out_tensor.to(device) with self.assertRaises(RuntimeError): @@ -605,19 +605,23 @@ def test_add_broadcasting(self): # ["det" ]] # The function renames the test function: ops/math_sign_ops (e.g. abs)+ the test name(e.g. out), results in: test_abs_out - def rename_func(testcase_func, param_num, param): + def rename_func(testcase_func, param_num, param): # noqa: N805 return f"test_{parameterized.to_safe_name(str(param.args[0]))}{testcase_func.__name__[7:]}" # @parameterized.expand generate test methods for ops and using name_func we renaming the test to be test_{ops} @parameterized.expand(ops, name_func=rename_func) - def test_op(self, test_name, tensor_test=torch.rand(6)): + def test_op(self, test_name, tensor_test=None): + if tensor_test is None: + tensor_test = torch.rand(6) cpu_result = getattr(torch, test_name)(tensor_test) ort_result = getattr(torch, test_name)(tensor_test.to(self.get_device())) assert torch.allclose(cpu_result, ort_result.cpu(), equal_nan=True) @parameterized.expand(ops, name_func=rename_func) - def test_op_inplace(self, test_name, tensor_test=torch.rand(6)): + def test_op_inplace(self, test_name, tensor_test=None): + if tensor_test is None: + tensor_test = torch.rand(6) device = self.get_device() cpu_tensor = tensor_test @@ -629,8 +633,10 @@ def test_op_inplace(self, test_name, tensor_test=torch.rand(6)): assert torch.allclose(cpu_tensor, ort_tensor.cpu(), equal_nan=True) @parameterized.expand(ops, name_func=rename_func) - def test_op_out(self, test_name, tensor_test=torch.rand(6)): - ##relu -don't have output + def test_op_out(self, test_name, tensor_test=None): + if tensor_test is None: + tensor_test = torch.rand(6) + # relu doesn't have output if test_name == "relu": self.skipTest(f"no {test_name}_output") ### troubleshoot later: the following tests are Failing. diff --git a/orttraining/orttraining/eager/test/ort_tensor.py b/orttraining/orttraining/eager/test/ort_tensor.py index a0cfdaa2cd0d6..162fb2a308d0f 100644 --- a/orttraining/orttraining/eager/test/ort_tensor.py +++ b/orttraining/orttraining/eager/test/ort_tensor.py @@ -2,8 +2,9 @@ # Licensed under the MIT License. import unittest + +import onnxruntime_pybind11_state as torch_ort # noqa: F401 import torch -import onnxruntime_pybind11_state as torch_ort class OrtTensorTests(unittest.TestCase): @@ -49,7 +50,7 @@ def test_stride(self): ort_ones = cpu_ones.to("ort") y = torch.as_strided(ort_ones, (2, 2), (1, 2)) assert y.size() == (2, 2) - assert y.is_contiguous() == False + assert y.is_contiguous() is False contiguous_y = y.contiguous() w = torch.ones((2, 3)) ort_w = w.to("ort") @@ -65,7 +66,7 @@ def test_slice(self): ort_ones = cpu_ones.to("ort") y_cpu = cpu_ones[0:128, :128] y = ort_ones[0:128, :128] - assert y.is_contiguous() == False + assert y.is_contiguous() is False assert y.size() == (128, 128) assert torch.allclose(y.cpu(), y_cpu) diff --git a/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py b/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py index 505fdf24933de..f2bac5fa5bc50 100644 --- a/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py +++ b/orttraining/orttraining/eager/test_model_OrtModule/mnist_fc_training.py @@ -1,25 +1,24 @@ -## This code is from https://github.com/pytorch/examples/blob/master/mnist/main.py -## with modification to do training using onnxruntime as backend on cuda device. 
-## A private PyTorch build from https://aiinfra.visualstudio.com/Lotus/_git/pytorch (ORTTraining branch) is needed to run the demo. +# This code is from https://github.com/pytorch/examples/blob/master/mnist/main.py +# with modification to do training using onnxruntime as backend on cuda device. +# A private PyTorch build from https://aiinfra.visualstudio.com/Lotus/_git/pytorch (ORTTraining branch) is needed to run the demo. -## Model testing is not complete. +# Model testing is not complete. -from __future__ import print_function import argparse + import torch -from onnxruntime.training import ORTModule -from onnxruntime.capi import _pybind_state as torch_ort_eager import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from torchvision import datasets, transforms -import numpy as np -import os + +from onnxruntime.capi import _pybind_state as torch_ort_eager +from onnxruntime.training import ORTModule class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -82,7 +81,7 @@ def main(): ) args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() + not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) @@ -98,7 +97,7 @@ def main(): shuffle=True, **kwargs, ) - test_loader = torch.utils.data.DataLoader( + torch.utils.data.DataLoader( datasets.MNIST( "./data", train=False, diff --git a/orttraining/orttraining/eager/test_models/mnist_fc.py b/orttraining/orttraining/eager/test_models/mnist_fc.py index 0f0b3bb604149..2009303f5b08d 100644 --- a/orttraining/orttraining/eager/test_models/mnist_fc.py +++ b/orttraining/orttraining/eager/test_models/mnist_fc.py @@ -1,16 +1,11 @@ -from __future__ import print_function -import argparse +import onnxruntime_pybind11_state as torch_ort import torch import torch.nn as nn -import torch.nn.functional as F -import numpy as np -import os -import onnxruntime_pybind11_state as torch_ort class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -31,7 +26,6 @@ def forward(self, x): device = torch_ort.device() with torch.no_grad(): - model = NeuralNet(input_size, hidden_size, num_classes) pred = model(batch) print("inference result is: ") diff --git a/orttraining/orttraining/eager/test_models/mnist_fc_training.py b/orttraining/orttraining/eager/test_models/mnist_fc_training.py index 95ba3bf060332..744a264e87cfb 100644 --- a/orttraining/orttraining/eager/test_models/mnist_fc_training.py +++ b/orttraining/orttraining/eager/test_models/mnist_fc_training.py @@ -4,7 +4,6 @@ # pylint: disable=missing-docstring # pylint: disable=C0103 -from __future__ import print_function import argparse import os @@ -99,7 +98,7 @@ def main(): shuffle=True, **kwargs, ) - test_loader = torch.utils.data.DataLoader( + torch.utils.data.DataLoader( datasets.MNIST( dataset_root_dir, train=False, diff --git a/orttraining/orttraining/eager/test_models/scratchpad.py b/orttraining/orttraining/eager/test_models/scratchpad.py index 049aa859c842c..01237d0cd029d 100644 --- a/orttraining/orttraining/eager/test_models/scratchpad.py +++ b/orttraining/orttraining/eager/test_models/scratchpad.py @@ -1,8 +1,8 @@ # Copyright (c) 
Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import torch import onnxruntime_pybind11_state as torch_ort +import torch device = torch_ort.device() diff --git a/orttraining/orttraining/eager/test_models/training_test.py b/orttraining/orttraining/eager/test_models/training_test.py index c0103e55cf85e..943def74bd707 100644 --- a/orttraining/orttraining/eager/test_models/training_test.py +++ b/orttraining/orttraining/eager/test_models/training_test.py @@ -49,7 +49,7 @@ def forward(self, sample): def train_loop(dataloader, model, loss_fn, optimizer): size = len(dataloader.dataset) - for batch, (X, y) in enumerate(dataloader): + for batch, (X, y) in enumerate(dataloader): # noqa: N806 # Compute prediction and loss x_ort = X.to(device) y_ort = y.to(device) @@ -72,7 +72,7 @@ def test_loop(dataloader, model, loss_fn): test_loss, correct = 0, 0 with torch.no_grad(): - for X, y in dataloader: + for X, y in dataloader: # noqa: N806 x_ort = X.to(device) y_ort = y.to(device) pred = model(x_ort) diff --git a/orttraining/orttraining/python/checkpointing_utils.py b/orttraining/orttraining/python/checkpointing_utils.py index 359f6a8c53552..26df5acd8c96c 100644 --- a/orttraining/orttraining/python/checkpointing_utils.py +++ b/orttraining/orttraining/python/checkpointing_utils.py @@ -1,4 +1,5 @@ import os + import torch @@ -14,8 +15,8 @@ def list_checkpoint_files(checkpoint_dir, checkpoint_prefix, extension=".ort.pt" def get_checkpoint_name(prefix, is_partitioned, world_rank=None, world_size=None): - SINGLE_CHECKPOINT_FILENAME = "{prefix}.ort.pt" - MULTIPLE_CHECKPOINT_FILENAME = "{prefix}.ZeRO.{world_rank}.{world_size}.ort.pt" + SINGLE_CHECKPOINT_FILENAME = "{prefix}.ort.pt" # noqa: N806 + MULTIPLE_CHECKPOINT_FILENAME = "{prefix}.ZeRO.{world_rank}.{world_size}.ort.pt" # noqa: N806 if is_partitioned: filename = MULTIPLE_CHECKPOINT_FILENAME.format( @@ -42,9 +43,8 @@ def _split_state_dict(state_dict): return split_sd -class CombineZeroCheckpoint(object): +class CombineZeroCheckpoint: def __init__(self, checkpoint_files, clean_state_dict=None): - assert len(checkpoint_files) > 0, "No checkpoint files passed" self.checkpoint_files = checkpoint_files self.clean_state_dict = clean_state_dict diff --git a/orttraining/orttraining/python/deprecated/__init__.py b/orttraining/orttraining/python/deprecated/__init__.py index 344b15a4f35d0..6e02db707bc47 100644 --- a/orttraining/orttraining/python/deprecated/__init__.py +++ b/orttraining/orttraining/python/deprecated/__init__.py @@ -2,5 +2,5 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -from onnxruntime.capi._pybind_state import TrainingParameters -from onnxruntime.capi.training.training_session import TrainingSession +from onnxruntime.capi._pybind_state import TrainingParameters # noqa: F401 +from onnxruntime.capi.training.training_session import TrainingSession # noqa: F401 diff --git a/orttraining/orttraining/python/deprecated/training_session.py b/orttraining/orttraining/python/deprecated/training_session.py index b6a63dbee35d2..a6900578e174b 100644 --- a/orttraining/orttraining/python/deprecated/training_session.py +++ b/orttraining/orttraining/python/deprecated/training_session.py @@ -3,14 +3,14 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -import sys -import os +import os # noqa: F401 +import sys # noqa: F401 from onnxruntime.capi import _pybind_state as C +from onnxruntime.capi.onnxruntime_inference_collection import IOBinding # noqa: F401 from onnxruntime.capi.onnxruntime_inference_collection import ( - Session, InferenceSession, - IOBinding, + Session, check_and_normalize_provider_args, ) @@ -38,7 +38,7 @@ def __init__(self, path_or_bytes, parameters, sess_options=None, providers=None, elif isinstance(path_or_bytes, bytes): config_result = self._sess.read_bytes(path_or_bytes, parameters, providers, provider_options) else: - raise TypeError("Unable to load from type '{0}'".format(type(path_or_bytes))) + raise TypeError(f"Unable to load from type '{type(path_or_bytes)}'") self.loss_scale_input_name = config_result.loss_scale_input_name diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 5434edd7d4439..00dbd82e1ea16 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -1,7 +1,6 @@ import io import os import warnings -from packaging.version import Version as LooseVersion import numpy as np import onnx @@ -9,6 +8,7 @@ import torch.nn import torch.onnx from onnx import helper, numpy_helper +from packaging.version import Version as LooseVersion import onnxruntime as ort import onnxruntime.capi.pt_patch @@ -140,13 +140,13 @@ def ort_training_session_run_helper(session, iobinding, inputs, input_descs, out return torch_outputs -def FuseSofmaxNLLToSoftmaxCE(onnx_model): +def FuseSofmaxNLLToSoftmaxCE(onnx_model): # noqa: N802 nll_count = 0 while True: nll_count = nll_count + 1 nll_loss_node = None nll_loss_node_index = 0 - for nll_loss_node_index, node in enumerate(onnx_model.graph.node): + for nll_loss_node_index, node in enumerate(onnx_model.graph.node): # noqa: B007 if node.op_type == "nll_loss" or node.op_type == "NegativeLogLikelihoodLoss": nll_loss_node = node break @@ -158,7 +158,7 @@ def FuseSofmaxNLLToSoftmaxCE(onnx_model): softmax_node_index = 0 label_input_name = None weight_input_name = None - for softmax_node_index, node in enumerate(onnx_model.graph.node): + for softmax_node_index, node in enumerate(onnx_model.graph.node): # noqa: B007 if node.op_type == "LogSoftmax": # has to be connected to nll_loss if len(nll_loss_node.input) > 2: @@ -238,9 +238,9 @@ def dtype_torch_to_numpy(torch_dtype): raise Exception("Torch type to numpy type mapping unavailable for: " + str(torch_dtype)) -class model_loss_cls(torch.nn.Module): +class model_loss_cls(torch.nn.Module): # noqa: N801 def __init__(self, model, loss_fn): - super(model_loss_cls, self).__init__() + super().__init__() self.model_ = model self.loss_fn_ = loss_fn @@ -253,7 +253,7 @@ def forward(self, *inputs): class WrapModel(torch.nn.Module): def __init__(self, model, loss_fn, input_names): - super(WrapModel, self).__init__() + super().__init__() self.model_ = model self.loss_fn_ = loss_fn self.input_names_ = input_names @@ -264,10 +264,10 @@ def forward(self, *inputs): # *inputs is given by torch trace. It is in the order of input_names. # model_ takes input in a order (which can be obtained via inspect.signature(model.forward)) different than input_names. 
sig = inspect.signature(self.model_.forward) - ordered_list_keys = list(sig.parameters.keys()) + list(sig.parameters.keys()) input_dict = {} - for key in sig.parameters.keys(): + for key in sig.parameters: if key in self.input_names_: input_dict[key] = inputs[self.input_names_.index(key)] @@ -422,7 +422,7 @@ def convert_model_loss_fn_to_onnx(model, loss_fn, model_desc, device, inputs, op onnx_model = onnx.load_model_from_string(f.getvalue()) # Remove 'model_.' prefix introduced by model wrapper for initializers. - if isinstance(model, WrapModel) or isinstance(model, model_loss_cls): + if isinstance(model, (WrapModel, model_loss_cls)): replace_name_dict = {} for n in onnx_model.graph.initializer: if n.name.startswith("model_."): @@ -450,7 +450,7 @@ def create_ort_training_session_with_optimizer( allreduce_post_accumulation=False, deepspeed_zero_stage=0, enable_grad_norm_clip=True, - frozen_weights=[], + frozen_weights=[], # noqa: B006 opset_version=DEFAULT_OPSET_VERSION, use_deterministic_compute=False, use_memory_efficient_gradient=False, @@ -482,7 +482,7 @@ def create_ort_training_session_with_optimizer( unused_frozen_weights = [n for n in frozen_weights if n not in [i.name for i in model.graph.initializer]] if unused_frozen_weights: - raise RuntimeError("{} in frozen_weights not found in model weights.".format(unused_frozen_weights)) + raise RuntimeError(f"{unused_frozen_weights} in frozen_weights not found in model weights.") weights_to_train = set() for initializer in model.graph.initializer: @@ -521,7 +521,7 @@ def create_ort_training_session_with_optimizer( ort_parameters.optimizer_attributes_map = optimizer_attributes_map ort_parameters.optimizer_int_attributes_map = optimizer_int_attributes_map - sessionOptions = ort.SessionOptions() + sessionOptions = ort.SessionOptions() # noqa: N806 sessionOptions.use_deterministic_compute = use_deterministic_compute if len(optimized_model_filepath) > 0: sessionOptions.optimized_model_filepath = optimized_model_filepath @@ -530,7 +530,7 @@ def create_ort_training_session_with_optimizer( eval_io_binding = session.io_binding() if bind_parameters: - for param in torch_params.keys(): + for param in torch_params: torch_tensor = torch_params[param] train_io_binding.bind_input( @@ -556,12 +556,12 @@ def create_ort_training_session_with_optimizer( def save_checkpoint( model, checkpoint_dir, checkpoint_prefix="ORT_checkpoint", checkpoint_state_dict=None, include_optimizer_state=True ): - if checkpoint_state_dict == None: + if checkpoint_state_dict is None: checkpoint_state_dict = {"model": model.state_dict(include_optimizer_state)} else: checkpoint_state_dict.update({"model": model.state_dict(include_optimizer_state)}) - assert os.path.exists(checkpoint_dir), "ERROR: Checkpoint directory doesn't exist: {}".format(checkpoint_dir) + assert os.path.exists(checkpoint_dir), f"ERROR: Checkpoint directory doesn't exist: {checkpoint_dir}" checkpoint_name = get_checkpoint_name( checkpoint_prefix, model.deepspeed_zero_stage_, model.world_rank, model.world_size @@ -569,7 +569,7 @@ def save_checkpoint( checkpoint_file = os.path.join(checkpoint_dir, checkpoint_name) if os.path.exists(checkpoint_file): - warnings.warn("{} already exists, overwriting.".format(checkpoint_file)) + warnings.warn(f"{checkpoint_file} already exists, overwriting.") torch.save(checkpoint_state_dict, checkpoint_file) @@ -585,7 +585,7 @@ def _load_single_checkpoint(model, checkpoint_dir, checkpoint_prefix, is_partiti + "checkpoint file exists for rank {} of {}." 
).format(checkpoint_file, model.world_rank, model.world_size) else: - assert_msg = "Couldn't find checkpoint file {}.".format(checkpoint_file) + assert_msg = f"Couldn't find checkpoint file {checkpoint_file}." assert os.path.exists(checkpoint_file), assert_msg @@ -649,7 +649,7 @@ def __init__( loss_scaler=None, deepspeed_zero_stage=0, enable_grad_norm_clip=True, - frozen_weights=[], + frozen_weights=[], # noqa: B006 _opset_version=DEFAULT_OPSET_VERSION, _enable_internal_postprocess=True, _extra_postprocess=None, @@ -659,7 +659,7 @@ def __init__( enable_adasum=False, optimized_model_filepath="", ): - super(ORTTrainer, self).__init__() + super().__init__() """ Initialize ORTTrainer. @@ -976,7 +976,7 @@ def load_state_dict(self, state_dict, strict=False): if name in cur_initializers_names: new_initializers[name] = state_dict[name].numpy() elif strict: - raise RuntimeError("Checkpoint tensor: {} is not present in the model.".format(name)) + raise RuntimeError(f"Checkpoint tensor: {name} is not present in the model.") self._update_onnx_model_initializers(new_initializers) @@ -1012,11 +1012,11 @@ def _prepare_input_and_fetches( for input_desc in input_desc_with_: if input_desc.name_ in kwargs: - input = input + (kwargs[input_desc.name_],) + input = (*input, kwargs[input_desc.name_]) if internal_learning_rate is not None: - input = input + (internal_learning_rate,) + input = (*input, internal_learning_rate) if internal_loss_scale is not None: - input = input + (internal_loss_scale,) + input = (*input, internal_loss_scale) elif self.use_mixed_precision: # loss_scale input name is needed to call train_step, for example: # kwargs[model.loss_scale_input_name] = loss_scale @@ -1024,8 +1024,8 @@ def _prepare_input_and_fetches( # However, when first time train_step is called model.loss_scale_input_name is not set. # To workaround this problem, we use the special name 'default_loss_scale_input_name' to indicate # the loss_scale. 
- if "default_loss_scale_input_name" in kwargs.keys(): - input = input + (kwargs["default_loss_scale_input_name"],) + if "default_loss_scale_input_name" in kwargs: + input = (*input, kwargs["default_loss_scale_input_name"]) fetches = None if "fetches" in kwargs: @@ -1203,12 +1203,12 @@ def __init__( self, loss_scale_input_name, is_dynamic_scale, - loss_scale=float(1 << 16), + loss_scale=float(1 << 16), # noqa: B008 up_scale_window=2000, min_loss_scale=1.0, - max_loss_scale=float(1 << 24), + max_loss_scale=float(1 << 24), # noqa: B008 ): - super(LossScaler, self).__init__() + super().__init__() self.loss_scale_input_name_ = loss_scale_input_name self.is_dynamic_scale_ = is_dynamic_scale self.initial_loss_scale_ = loss_scale diff --git a/orttraining/orttraining/python/pt_patch.py b/orttraining/orttraining/python/pt_patch.py index b524a286c9de7..5c5d205b21318 100644 --- a/orttraining/orttraining/python/pt_patch.py +++ b/orttraining/orttraining/python/pt_patch.py @@ -1,9 +1,7 @@ import torch - -from torch.onnx import symbolic_opset10 -from torch.onnx import symbolic_opset12 -from torch.onnx.symbolic_helper import parse_args import torch.onnx.symbolic_helper as sym_help +from torch.onnx import symbolic_opset10, symbolic_opset12 +from torch.onnx.symbolic_helper import parse_args @parse_args("v", "v", "v", "v", "i", "none") diff --git a/orttraining/orttraining/python/training/__init__.py b/orttraining/orttraining/python/training/__init__.py index 4a69f1439c656..d2a20c2029a4c 100644 --- a/orttraining/orttraining/python/training/__init__.py +++ b/orttraining/orttraining/python/training/__init__.py @@ -2,18 +2,19 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- +# isort: skip_file +from onnxruntime.capi._pybind_state import PropagateCastOpsStrategy, TrainingParameters # noqa: F401 +from onnxruntime.capi.training.training_session import TrainingSession # noqa: F401 -from onnxruntime.capi._pybind_state import PropagateCastOpsStrategy, TrainingParameters -from onnxruntime.capi.training.training_session import TrainingSession # Options need to be imported before `ORTTrainer`. -from .orttrainer_options import ORTTrainerOptions -from .orttrainer import ORTTrainer, TrainStepInfo -from . import amp, checkpoint, model_desc_validation, optim +from .orttrainer_options import ORTTrainerOptions # noqa: F401 +from .orttrainer import ORTTrainer, TrainStepInfo # noqa: F401 +from . import amp, checkpoint, model_desc_validation, optim # noqa: F401 -try: - from .ortmodule import ORTModule +try: # noqa: SIM105 + from .ortmodule import ORTModule # noqa: F401 except ImportError: # That is OK iff this is not a ORTModule training package pass diff --git a/orttraining/orttraining/python/training/_checkpoint_storage.py b/orttraining/orttraining/python/training/_checkpoint_storage.py index 461daa57134c0..ee2cd74fd843a 100644 --- a/orttraining/orttraining/python/training/_checkpoint_storage.py +++ b/orttraining/orttraining/python/training/_checkpoint_storage.py @@ -3,9 +3,10 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import h5py -from collections.abc import Mapping import pickle +from collections.abc import Mapping + +import h5py def _dfs_save(group, save_obj): @@ -99,7 +100,7 @@ def from_serialized_hex(serialized_hex): # serialized_hex can be either a regular string or a byte string. 
# if it is a byte string, convert to regular string using decode() # if it is a regular string, do nothing to it - try: + try: # noqa: SIM105 serialized_hex = serialized_hex.decode() except AttributeError: pass diff --git a/orttraining/orttraining/python/training/_utils.py b/orttraining/orttraining/python/training/_utils.py index 099a29764839f..7c555b9634d05 100644 --- a/orttraining/orttraining/python/training/_utils.py +++ b/orttraining/orttraining/python/training/_utils.py @@ -6,11 +6,11 @@ import importlib.util import os import sys -from functools import wraps +from functools import wraps # noqa: F401 import numpy as np import torch -from onnx import TensorProto +from onnx import TensorProto # noqa: F401 from packaging.version import Version @@ -213,11 +213,12 @@ def import_module_from_file(file_path, module_name=None): if not isinstance(file_path, str) or not os.path.exists(file_path): raise AssertionError( - "'file_path' must be a full path string with the python file to load. " "file_path=%r." % (file_path,) + "'file_path' must be a full path string with the python file to load. " "file_path={!r}.".format(file_path) ) if module_name is not None and (not isinstance(module_name, str) or not module_name): raise AssertionError( - "'module_name' must be a string with the python module name to load. " "module_name=%r." % (module_name,) + "'module_name' must be a string with the python module name to load. " + "module_name={!r}.".format(module_name) ) if not module_name: diff --git a/orttraining/orttraining/python/training/amp/__init__.py b/orttraining/orttraining/python/training/amp/__init__.py index 33274a8d5e10d..b7f03178f5004 100644 --- a/orttraining/orttraining/python/training/amp/__init__.py +++ b/orttraining/orttraining/python/training/amp/__init__.py @@ -1 +1 @@ -from .loss_scaler import LossScaler, DynamicLossScaler +from .loss_scaler import DynamicLossScaler, LossScaler # noqa: F401 diff --git a/orttraining/orttraining/python/training/amp/loss_scaler.py b/orttraining/orttraining/python/training/amp/loss_scaler.py index 42d3d670a59ea..b842ec9346f9f 100644 --- a/orttraining/orttraining/python/training/amp/loss_scaler.py +++ b/orttraining/orttraining/python/training/amp/loss_scaler.py @@ -1,4 +1,4 @@ -class LossScaler(object): +class LossScaler: r"""Base class for implementing custom loss scaler strategies Once the scaler is configured, no user intervention is needed to update loss scale during training. 
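The hunk above drops the explicit `object` base from `LossScaler`. A minimal sketch (class names here are illustrative, not from the patch) of why this rewrite is behavior-preserving in Python 3, where every class is new-style:

    # Both spellings define a new-style class with the same MRO in Python 3;
    # the explicit `object` base is redundant.
    class WithExplicitBase(object):
        pass

    class WithImplicitBase:
        pass

    assert WithExplicitBase.__mro__ == (WithExplicitBase, object)
    assert WithImplicitBase.__mro__ == (WithImplicitBase, object)
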
@@ -88,10 +88,10 @@ class DynamicLossScaler(LossScaler): def __init__( self, automatic_update=True, - loss_scale=float(1 << 16), + loss_scale=float(1 << 16), # noqa: B008 up_scale_window=2000, min_loss_scale=1.0, - max_loss_scale=float(1 << 24), + max_loss_scale=float(1 << 24), # noqa: B008 ): super().__init__(loss_scale) self.automatic_update = automatic_update diff --git a/orttraining/orttraining/python/training/api/__init__.py b/orttraining/orttraining/python/training/api/__init__.py index c92b8bdbfe46e..4f7499ed30ddf 100644 --- a/orttraining/orttraining/python/training/api/__init__.py +++ b/orttraining/orttraining/python/training/api/__init__.py @@ -1,4 +1,4 @@ -from .checkpoint_state import CheckpointState -from .lr_scheduler import LinearLRScheduler -from .module import Module -from .optimizer import Optimizer +from .checkpoint_state import CheckpointState # noqa: F401 +from .lr_scheduler import LinearLRScheduler # noqa: F401 +from .module import Module # noqa: F401 +from .optimizer import Optimizer # noqa: F401 diff --git a/orttraining/orttraining/python/training/api/lr_scheduler.py b/orttraining/orttraining/python/training/api/lr_scheduler.py index cff7eaaa14555..5783ee316d203 100644 --- a/orttraining/orttraining/python/training/api/lr_scheduler.py +++ b/orttraining/orttraining/python/training/api/lr_scheduler.py @@ -22,7 +22,6 @@ class LinearLRScheduler: """ def __init__(self, optimizer, warmup_step_count, total_step_count, initial_lr) -> None: - self._scheduler = C.LinearLRScheduler(optimizer._optimizer, warmup_step_count, total_step_count, initial_lr) def step(self): diff --git a/orttraining/orttraining/python/training/api/module.py b/orttraining/orttraining/python/training/api/module.py index 97194e447441b..da3889ac0eb72 100644 --- a/orttraining/orttraining/python/training/api/module.py +++ b/orttraining/orttraining/python/training/api/module.py @@ -2,14 +2,14 @@ # Licensed under the MIT License. # module.py +from typing import List + import numpy as np from onnxruntime.capi import _pybind_state as C from onnxruntime.capi.onnxruntime_inference_collection import OrtValue, get_ort_device_type from onnxruntime.capi.onnxruntime_pybind11_state import OrtValueVector -from typing import List - class Module: """ diff --git a/orttraining/orttraining/python/training/checkpoint.py b/orttraining/orttraining/python/training/checkpoint.py index e4a2f1230b7a4..99588dc8df0c2 100644 --- a/orttraining/orttraining/python/training/checkpoint.py +++ b/orttraining/orttraining/python/training/checkpoint.py @@ -1,12 +1,13 @@ -import numpy as np -import onnx import os -import torch -import warnings import tempfile +import warnings from enum import Enum -from . import _checkpoint_storage, _utils +import numpy as np +import onnx +import torch + +from . 
import _checkpoint_storage, _utils ################################################################################ # Experimental Checkpoint APIs @@ -65,7 +66,7 @@ def experimental_load_state_dict(ort_trainer, state_dict, strict=False): if name in cur_initializers_names: new_initializers[name] = state_dict[name].numpy() elif strict: - raise RuntimeError("Checkpoint tensor: {} is not present in the model.".format(name)) + raise RuntimeError(f"Checkpoint tensor: {name} is not present in the model.") ort_trainer._update_onnx_model_initializers(new_initializers) @@ -131,7 +132,7 @@ def experimental_load_checkpoint(ort_trainer, checkpoint_dir, checkpoint_prefix= return _load_single_checkpoint(ort_trainer, checkpoint_dir, checkpoint_prefix, is_partitioned, strict) -class _AGGREGATION_MODE(Enum): +class _AGGREGATION_MODE(Enum): # noqa: N801 Zero = 0 Megatron = 1 @@ -241,7 +242,7 @@ def _aggregate_model_states( model_state_key, model_state_value, state_dict[model][full_precision], - "Value mismatch for model state {}".format(model_state_key), + f"Value mismatch for model state {model_state_key}", ) @@ -283,7 +284,7 @@ def _aggregate_optimizer_states(rank_state_dict, sharded_states_original_dims, s optimizer_key, optimizer_value, state_dict[optimizer][model_state_key], - "Value mismatch for model state {} and optimizer state {}".format(model_state_key, optimizer_key), + f"Value mismatch for model state {model_state_key} and optimizer state {optimizer_key}", ) @@ -319,8 +320,8 @@ def _aggregate_trainer_options(rank_state_dict, state_dict, partial_aggregation) world_rank = _utils.state_dict_trainer_options_world_rank_key() world_size = _utils.state_dict_trainer_options_world_size_key() optimizer_name = _utils.state_dict_trainer_options_optimizer_name_key() - D_size = _utils.state_dict_trainer_options_data_parallel_size_key() - H_size = _utils.state_dict_trainer_options_horizontal_parallel_size_key() + D_size = _utils.state_dict_trainer_options_data_parallel_size_key() # noqa: N806 + H_size = _utils.state_dict_trainer_options_horizontal_parallel_size_key() # noqa: N806 state_dict[trainer_options][mixed_precision] = rank_state_dict[trainer_options][mixed_precision] state_dict[trainer_options][zero_stage] = 0 @@ -422,32 +423,32 @@ def _aggregate_over_ranks( assert ( ranks[i] == rank_state_dict[_utils.state_dict_trainer_options_key()][world_rank] ), "Unexpected rank in file at path {}. Expected {}, got {}".format( - path, rank, rank_state_dict[_utils.state_dict_trainer_options_key()][world_rank] + path, rank, rank_state_dict[_utils.state_dict_trainer_options_key()][world_rank] # noqa: F821 ) if loaded_mixed_precision is None: loaded_mixed_precision = rank_state_dict[_utils.state_dict_trainer_options_key()][mixed_precision] else: assert ( loaded_mixed_precision == rank_state_dict[_utils.state_dict_trainer_options_key()][mixed_precision] - ), "Mixed precision state mismatch among checkpoint files. File: {}".format(path) + ), f"Mixed precision state mismatch among checkpoint files. File: {path}" if loaded_world_size is None: loaded_world_size = rank_state_dict[_utils.state_dict_trainer_options_key()][world_size] else: assert ( loaded_world_size == rank_state_dict[_utils.state_dict_trainer_options_key()][world_size] - ), "World size state mismatch among checkpoint files. File: {}".format(path) + ), f"World size state mismatch among checkpoint files. 
File: {path}" if loaded_zero_stage is None: loaded_zero_stage = rank_state_dict[_utils.state_dict_trainer_options_key()][zero_stage] else: assert ( loaded_zero_stage == rank_state_dict[_utils.state_dict_trainer_options_key()][zero_stage] - ), "Zero stage mismatch among checkpoint files. File: {}".format(path) + ), f"Zero stage mismatch among checkpoint files. File: {path}" if loaded_optimizer_name is None: loaded_optimizer_name = rank_state_dict[_utils.state_dict_trainer_options_key()][optimizer_name] else: assert ( loaded_optimizer_name == rank_state_dict[_utils.state_dict_trainer_options_key()][optimizer_name] - ), "Optimizer name mismatch among checkpoint files. File: {}".format(path) + ), f"Optimizer name mismatch among checkpoint files. File: {path}" # aggregate all model states _aggregate_model_states(rank_state_dict, sharded_states_original_dims, state_dict, loaded_mixed_precision, mode) @@ -484,7 +485,7 @@ def _aggregate_over_ranks( return _to_pytorch_format(state_dict) if pytorch_format else state_dict -def _aggregate_over_D_H(ordered_paths, D_groups, H_groups, pytorch_format): +def _aggregate_over_D_H(ordered_paths, D_groups, H_groups, pytorch_format): # noqa: N802 """Aggregate checkpoint files and return a single state dictionary for the D+H (Zero+Megatron) partitioning strategy. For D+H aggregation scenario, the first pass of aggregation(partial aggregation) is over D groups @@ -540,14 +541,14 @@ def aggregate_checkpoints(paths, pytorch_format=True): """ loaded_trainer_options = _checkpoint_storage.load(paths[0], key=_utils.state_dict_trainer_options_key()) - D_size = _utils.state_dict_trainer_options_data_parallel_size_key() - H_size = _utils.state_dict_trainer_options_horizontal_parallel_size_key() + D_size = _utils.state_dict_trainer_options_data_parallel_size_key() # noqa: N806 + H_size = _utils.state_dict_trainer_options_horizontal_parallel_size_key() # noqa: N806 world_size = _utils.state_dict_trainer_options_world_size_key() - D_size = loaded_trainer_options[D_size] - H_size = loaded_trainer_options[H_size] + D_size = loaded_trainer_options[D_size] # noqa: N806 + H_size = loaded_trainer_options[H_size] # noqa: N806 world_size = loaded_trainer_options[world_size] - D_groups, H_groups = _get_parallellism_groups(D_size, H_size, world_size) + D_groups, H_groups = _get_parallellism_groups(D_size, H_size, world_size) # noqa: N806 combine_zero = loaded_trainer_options[_utils.state_dict_trainer_options_zero_stage_key()] > 0 combine_megatron = len(H_groups[0]) > 1 @@ -630,8 +631,8 @@ def _list_checkpoint_files(checkpoint_dir, checkpoint_prefix, extension=".ort.pt def _get_checkpoint_name(prefix, is_partitioned, world_rank=None, world_size=None): - SINGLE_CHECKPOINT_FILENAME = "{prefix}.ort.pt" - MULTIPLE_CHECKPOINT_FILENAME = "{prefix}.ZeRO.{world_rank}.{world_size}.ort.pt" + SINGLE_CHECKPOINT_FILENAME = "{prefix}.ort.pt" # noqa: N806 + MULTIPLE_CHECKPOINT_FILENAME = "{prefix}.ZeRO.{world_rank}.{world_size}.ort.pt" # noqa: N806 if is_partitioned: filename = MULTIPLE_CHECKPOINT_FILENAME.format( @@ -657,9 +658,8 @@ def _split_state_dict(state_dict): return split_sd -class _CombineZeroCheckpoint(object): +class _CombineZeroCheckpoint: def __init__(self, checkpoint_files, clean_state_dict=None): - assert len(checkpoint_files) > 0, "No checkpoint files passed" self.checkpoint_files = checkpoint_files self.clean_state_dict = clean_state_dict diff --git a/orttraining/orttraining/python/training/experimental/__init__.py b/orttraining/orttraining/python/training/experimental/__init__.py 
index efc43a4bdc58c..b4a1ffa7364bd 100644 --- a/orttraining/orttraining/python/training/experimental/__init__.py +++ b/orttraining/orttraining/python/training/experimental/__init__.py @@ -1 +1 @@ -from .gradient_graph._gradient_graph_tools import export_gradient_graph +from .gradient_graph._gradient_graph_tools import export_gradient_graph # noqa: F401 diff --git a/orttraining/orttraining/python/training/experimental/exporter.py b/orttraining/orttraining/python/training/experimental/exporter.py index 8c5ccd1119576..846bdf52f8cd9 100644 --- a/orttraining/orttraining/python/training/experimental/exporter.py +++ b/orttraining/orttraining/python/training/experimental/exporter.py @@ -4,7 +4,11 @@ def _export_jit_graph_to_onnx_model_proto(graph: torch._C.Graph, operator_export_type: int): - from torch.onnx.symbolic_helper import _set_onnx_shape_inference, _set_operator_export_type, _set_opset_version + from torch.onnx.symbolic_helper import ( # noqa: F401 + _set_onnx_shape_inference, + _set_operator_export_type, + _set_opset_version, + ) _set_onnx_shape_inference(True) _set_operator_export_type(operator_export_type) diff --git a/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py b/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py index 91c656619f621..a5242ab04789f 100644 --- a/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py +++ b/orttraining/orttraining/python/training/experimental/gradient_graph/_gradient_graph_tools.py @@ -1,11 +1,12 @@ import io from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union # noqa: F401 import torch -from onnxruntime.capi._pybind_state import GradientGraphBuilder from torch.onnx import TrainingMode +from onnxruntime.capi._pybind_state import GradientGraphBuilder + from ...ortmodule._custom_op_symbolic_registry import CustomOpSymbolicRegistry @@ -67,7 +68,7 @@ def forward(self, model_input, expected_labels, *model_params): args = (example_input, example_labels, *tuple(model.parameters())) model_param_names = tuple(name for name, _ in model.named_parameters()) input_names = ["input", "labels", *model_param_names] - nodes_needing_gradients = set(name for name, param in model.named_parameters() if param.requires_grad) + nodes_needing_gradients = {name for name, param in model.named_parameters() if param.requires_grad} f = io.BytesIO() torch.onnx.export( diff --git a/orttraining/orttraining/python/training/model_desc_validation.py b/orttraining/orttraining/python/training/model_desc_validation.py index e9181f732cb32..dd3f4cb95cd59 100644 --- a/orttraining/orttraining/python/training/model_desc_validation.py +++ b/orttraining/orttraining/python/training/model_desc_validation.py @@ -1,8 +1,9 @@ -import cerberus from collections import namedtuple + +import cerberus import torch -from ._utils import static_vars +from ._utils import static_vars LEARNING_RATE_IO_DESCRIPTION_NAME = "__learning_rate" ALL_FINITE_IO_DESCRIPTION_NAME = "__all_finite" @@ -10,7 +11,7 @@ GRADIENT_ACCUMULATION_IO_DESCRIPTION_NAME = "__gradient_accumulation_name" -class _ORTTrainerModelDesc(object): +class _ORTTrainerModelDesc: def __init__(self, model_desc): # Keep a copy of original input for debug self._original = dict(model_desc) diff --git a/orttraining/orttraining/python/training/onnxblock/__init__.py b/orttraining/orttraining/python/training/onnxblock/__init__.py index 2dbb19703144d..4dac94b4aa733 
100644 --- a/orttraining/orttraining/python/training/onnxblock/__init__.py +++ b/orttraining/orttraining/python/training/onnxblock/__init__.py @@ -4,8 +4,8 @@ """Offline tooling for generating files needed for ort training apis.""" -from . import loss, optim -from .building_blocks import Block -from .checkpoint_utils import load_checkpoint_to_model, save_checkpoint -from .model import Model, TrainingModel -from .model_accessor import onnx_model +from . import loss, optim # noqa: F401 +from .building_blocks import Block # noqa: F401 +from .checkpoint_utils import load_checkpoint_to_model, save_checkpoint # noqa: F401 +from .model import Model, TrainingModel # noqa: F401 +from .model_accessor import onnx_model # noqa: F401 diff --git a/orttraining/orttraining/python/training/onnxblock/_graph_utils.py b/orttraining/orttraining/python/training/onnxblock/_graph_utils.py index d8dc0fa73354c..569e6c5244e00 100644 --- a/orttraining/orttraining/python/training/onnxblock/_graph_utils.py +++ b/orttraining/orttraining/python/training/onnxblock/_graph_utils.py @@ -7,8 +7,8 @@ import onnx -from onnxruntime.capi._pybind_state import GradientGraphBuilder, get_optimized_model import onnxruntime.training.onnxblock._qat_utils as qat_utils +from onnxruntime.capi._pybind_state import GradientGraphBuilder, get_optimized_model def get_output_from_output_name(onnx_model, output_name): diff --git a/orttraining/orttraining/python/training/onnxblock/building_blocks.py b/orttraining/orttraining/python/training/onnxblock/building_blocks.py index 922c4a2fd0adf..a34d0b4a39906 100644 --- a/orttraining/orttraining/python/training/onnxblock/building_blocks.py +++ b/orttraining/orttraining/python/training/onnxblock/building_blocks.py @@ -13,7 +13,7 @@ class Block(ABC): """Base class for all building blocks that can be stacked on top of each other.""" - def __init__(self): + def __init__(self): # noqa: B027 ... @abstractmethod diff --git a/orttraining/orttraining/python/training/onnxblock/loss/__init__.py b/orttraining/orttraining/python/training/onnxblock/loss/__init__.py index ac21bb0f42438..03b1caa504e82 100644 --- a/orttraining/orttraining/python/training/onnxblock/loss/__init__.py +++ b/orttraining/orttraining/python/training/onnxblock/loss/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. # __init__.py -from .loss import MSELoss, CrossEntropyLoss, BCEWithLogitsLoss +from .loss import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss # noqa: F401 diff --git a/orttraining/orttraining/python/training/onnxblock/optim/__init__.py b/orttraining/orttraining/python/training/onnxblock/optim/__init__.py index c45ab01c26898..601b4eb3a76f9 100644 --- a/orttraining/orttraining/python/training/onnxblock/optim/__init__.py +++ b/orttraining/orttraining/python/training/onnxblock/optim/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. 
# __init__.py -from .optim import AdamW, ClipGradNorm +from .optim import AdamW, ClipGradNorm # noqa: F401 diff --git a/orttraining/orttraining/python/training/optim/__init__.py b/orttraining/orttraining/python/training/optim/__init__.py index f74fe08202397..3cace4d30c77d 100644 --- a/orttraining/orttraining/python/training/optim/__init__.py +++ b/orttraining/orttraining/python/training/optim/__init__.py @@ -1,11 +1,8 @@ -from .config import _OptimizerConfig, AdamConfig, LambConfig, SGDConfig -from .lr_scheduler import ( - _LRScheduler, - ConstantWarmupLRScheduler, - CosineWarmupLRScheduler, - LinearWarmupLRScheduler, - PolyWarmupLRScheduler, -) - -from .fused_adam import FusedAdam, AdamWMode -from .fp16_optimizer import FP16_Optimizer +from .config import AdamConfig, LambConfig, SGDConfig, _OptimizerConfig # noqa: F401 +from .fp16_optimizer import FP16_Optimizer # noqa: F401 +from .fused_adam import AdamWMode, FusedAdam # noqa: F401 +from .lr_scheduler import ConstantWarmupLRScheduler # noqa: F401 +from .lr_scheduler import CosineWarmupLRScheduler # noqa: F401 +from .lr_scheduler import LinearWarmupLRScheduler # noqa: F401 +from .lr_scheduler import PolyWarmupLRScheduler # noqa: F401 +from .lr_scheduler import _LRScheduler # noqa: F401 diff --git a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py index 1b91ec2bf3594..9ad575388d4d6 100644 --- a/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py +++ b/orttraining/orttraining/python/training/optim/_apex_amp_modifier.py @@ -8,6 +8,7 @@ import types import warnings + from ._modifier import FP16OptimizerModifier @@ -21,8 +22,9 @@ def can_be_modified(self): ["_post_amp_backward", "zero_grad"], require_apex=True, require_torch_non_finite_check=False ) - def override_function(m_self): + def override_function(m_self): # noqa: N805 from apex import amp as apex_amp + from onnxruntime.training.ortmodule.torch_cpp_extensions import fused_ops warnings.warn("Apex AMP fp16_optimizer functions are overrided with faster implementation.", UserWarning) diff --git a/orttraining/orttraining/python/training/optim/_ds_modifier.py b/orttraining/orttraining/python/training/optim/_ds_modifier.py index 8a579f4ccd3d9..cdb39600a372e 100644 --- a/orttraining/orttraining/python/training/optim/_ds_modifier.py +++ b/orttraining/orttraining/python/training/optim/_ds_modifier.py @@ -49,7 +49,7 @@ def can_be_modified(self): try: from deepspeed.accelerator import get_accelerator - except ImportError as e: + except ImportError: warnings.warn("Unable to import get_accelerator from deepspeed.accelerator", UserWarning) else: if not get_accelerator().device_name().startswith("cuda"): diff --git a/orttraining/orttraining/python/training/optim/_modifier.py b/orttraining/orttraining/python/training/optim/_modifier.py index 952e90ee431ee..e9296bc63d560 100644 --- a/orttraining/orttraining/python/training/optim/_modifier.py +++ b/orttraining/orttraining/python/training/optim/_modifier.py @@ -18,7 +18,7 @@ multi_tensor_applier = MultiTensorApply(2048 * 32) -class FP16OptimizerModifier(object): +class FP16OptimizerModifier: def __init__(self, optimizer) -> None: super().__init__() self._optimizer = optimizer @@ -30,11 +30,11 @@ def apply(self): def check_requirements(self, required_funcs, require_apex=False, require_torch_non_finite_check=False): try: if require_apex is True: - import amp_C - from apex import amp + import amp_C # noqa: F401 + from apex import amp # noqa: F401 if 
require_torch_non_finite_check is True: _ = torch._amp_foreach_non_finite_check_and_unscale_ - except Exception as _: + except Exception: warnings.warn("Skip modifying optimizer because of Apex or torch_non_finite_check not found.", UserWarning) return False diff --git a/orttraining/orttraining/python/training/optim/_modifier_registry.py b/orttraining/orttraining/python/training/optim/_modifier_registry.py index 4291b792a4607..4a3a33ecc0513 100644 --- a/orttraining/orttraining/python/training/optim/_modifier_registry.py +++ b/orttraining/orttraining/python/training/optim/_modifier_registry.py @@ -3,9 +3,9 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +from ._apex_amp_modifier import ApexAMPModifier from ._ds_modifier import DeepSpeedZeROModifier from ._megatron_modifier import LegacyMegatronLMModifier -from ._apex_amp_modifier import ApexAMPModifier OptimizerModifierTypeRegistry = { "megatron.fp16.fp16.FP16_Optimizer": LegacyMegatronLMModifier, diff --git a/orttraining/orttraining/python/training/optim/_multi_tensor_apply.py b/orttraining/orttraining/python/training/optim/_multi_tensor_apply.py index d1e837eb1a350..c6fe3c1e86435 100644 --- a/orttraining/orttraining/python/training/optim/_multi_tensor_apply.py +++ b/orttraining/orttraining/python/training/optim/_multi_tensor_apply.py @@ -11,7 +11,7 @@ """ -class MultiTensorApply(object): +class MultiTensorApply: def __init__(self, chunk_size): self.chunk_size = chunk_size diff --git a/orttraining/orttraining/python/training/optim/config.py b/orttraining/orttraining/python/training/optim/config.py index 91ff7bed112e2..0b7a3bbe5fc1e 100644 --- a/orttraining/orttraining/python/training/optim/config.py +++ b/orttraining/orttraining/python/training/optim/config.py @@ -1,7 +1,7 @@ from enum import IntEnum, unique -class _OptimizerConfig(object): +class _OptimizerConfig: r"""Base class for optimizer configuration This class is not an optimizer, but a means to configure existing ones from ORT backend. @@ -13,7 +13,7 @@ class _OptimizerConfig(object): name (str): optimizer names. One of 'SGDOptimizer', 'AdamOptimizer' and 'LambOptimizer' defaults (dict): optimizer parameters applied to all model parameters. - Used when a parameter group doesn’t specify them. + Used when a parameter group doesn`t specify them. NOTE: Every optimizer must have 'lr'. params (list of dict, default is []): list of parameter groups. 
Each dict must contain a 'params' key with a list of names of model's parameter that will @@ -111,7 +111,7 @@ class SGDConfig(_OptimizerConfig): sgd_optim1 = SGDConfig(lr=0.001) """ - def __init__(self, params=[], lr=0.001): + def __init__(self, params=[], lr=0.001): # noqa: B006 super().__init__(name="SGDOptimizer", params=params, defaults={"lr": lr}) assert isinstance(params, list) and len(params) == 0, "'params' must be an empty list for SGD optimizer" @@ -154,7 +154,7 @@ class DecayMode(IntEnum): def __init__( self, - params=[], + params=[], # noqa: B006 lr=0.001, alpha=0.9, beta=0.999, @@ -229,7 +229,7 @@ class LambConfig(_OptimizerConfig): def __init__( self, - params=[], + params=[], # noqa: B006 lr=0.001, alpha=0.9, beta=0.999, diff --git a/orttraining/orttraining/python/training/optim/fp16_optimizer.py b/orttraining/orttraining/python/training/optim/fp16_optimizer.py index ce5fb8e09c6e4..2a5dfbc2189d3 100644 --- a/orttraining/orttraining/python/training/optim/fp16_optimizer.py +++ b/orttraining/orttraining/python/training/optim/fp16_optimizer.py @@ -8,7 +8,7 @@ from ._modifier_registry import OptimizerModifierTypeRegistry -def FP16_Optimizer(optimizer, **kwargs): +def FP16_Optimizer(optimizer, **kwargs): # noqa: N802 """ Simple wrapper to replace inefficient FP16_Optimizer function calls implemented by libraries for example Apex, DeepSpeed, Megatron-LM. diff --git a/orttraining/orttraining/python/training/optim/fused_adam.py b/orttraining/orttraining/python/training/optim/fused_adam.py index 30ebcf30e4844..52c6cb623e020 100644 --- a/orttraining/orttraining/python/training/optim/fused_adam.py +++ b/orttraining/orttraining/python/training/optim/fused_adam.py @@ -10,9 +10,11 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85 """ +from enum import IntEnum + import torch + from ._multi_tensor_apply import MultiTensorApply -from enum import IntEnum class AdamWMode(IntEnum): @@ -72,12 +74,11 @@ def __init__( weight_decay=0.0, set_grad_none=True, ): - # The FusedAdam implementation is mathematically equivalent to # transformers AdamW. The input arguments also have the same defaults. defaults = dict(lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay) - super(FusedAdam, self).__init__(params, defaults) + super().__init__(params, defaults) self._adam_w_mode = adam_w_mode self._set_grad_none = set_grad_none @@ -96,7 +97,7 @@ def zero_grad(self): for p in group["params"]: p.grad = None else: - super(FusedAdam, self).zero_grad() + super().zero_grad() def step(self, closure=None): """Performs a single optimization step. diff --git a/orttraining/orttraining/python/training/optim/lr_scheduler.py b/orttraining/orttraining/python/training/optim/lr_scheduler.py index cbe013d32f310..2a9bf438fa172 100644 --- a/orttraining/orttraining/python/training/optim/lr_scheduler.py +++ b/orttraining/orttraining/python/training/optim/lr_scheduler.py @@ -1,7 +1,7 @@ import math -class _LRScheduler(object): +class _LRScheduler: r"""Base class for implementing custom learning rate schedulers Schedulers can be either stateful or stateless. 
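Several signatures above keep a mutable `params=[]` default and silence flake8-bugbear's B006 check with `# noqa`. A small self-contained sketch (function names are hypothetical) of the pitfall that rule targets, alongside the usual `None`-sentinel alternative:

    # A mutable default is created once at definition time and shared across calls.
    def append_shared(item, bucket=[]):   # one list object reused by every call
        bucket.append(item)
        return bucket

    def append_fresh(item, bucket=None):  # conventional sentinel alternative
        if bucket is None:
            bucket = []
        bucket.append(item)
        return bucket

    assert append_shared(1) == [1]
    assert append_shared(2) == [1, 2]   # state leaks between calls
    assert append_fresh(1) == [1]
    assert append_fresh(2) == [2]       # each call gets its own list
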
@@ -273,10 +273,9 @@ def __init__(self, total_steps, lr_end=1e-7, power=1.0, warmup=0.002): self._num_warmup_steps = warmup * total_steps def _warmup_poly(self, train_step_info): - assert ( train_step_info.optimizer_config.lr > self.lr_end - ), f"lr_end ({lr_end}) must be be smaller than initial lr ({train_step_info.optimizer_config.lr})" + ), f"lr_end ({lr_end}) must be be smaller than initial lr ({train_step_info.optimizer_config.lr})" # noqa: F821 if train_step_info.optimization_step < self._num_warmup_steps: return float(train_step_info.optimization_step) / float(max(1, self._num_warmup_steps)) diff --git a/orttraining/orttraining/python/training/ortmodule/__init__.py b/orttraining/orttraining/python/training/ortmodule/__init__.py index f6ed8827bded3..79385af2bbea1 100644 --- a/orttraining/orttraining/python/training/ortmodule/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/__init__.py @@ -25,7 +25,7 @@ def _defined_from_envvar(name, default_value, warn=True): new_value = type(default_value)(new_value) except (TypeError, ValueError) as e: if warn: - warnings.warn("Unable to overwrite constant %r due to %r." % (name, e)) + warnings.warn(f"Unable to overwrite constant {name!r} due to {e!r}.") return default_value return new_value @@ -88,7 +88,7 @@ def _defined_from_envvar(name, default_value, warn=True): # Initalized ORT's random seed with pytorch's initial seed # in case user has set pytorch seed before importing ORTModule -set_seed((torch.initial_seed() % sys.maxsize)) +set_seed(torch.initial_seed() % sys.maxsize) # Override torch.manual_seed and torch.cuda.manual_seed @@ -120,7 +120,7 @@ def _are_deterministic_algorithms_enabled(): return ORTMODULE_IS_DETERMINISTIC -from .debug_options import DebugOptions, LogLevel # noqa: E402 +from .debug_options import DebugOptions, LogLevel # noqa: E402, F401 # ORTModule must be loaded only after all validation passes -from .ortmodule import ORTModule # noqa: E402 +from .ortmodule import ORTModule # noqa: E402, F401 diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py index 046619574487c..31a7f07f3cfbf 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function.py @@ -4,7 +4,7 @@ # -------------------------------------------------------------------------- -class Enabler(object): +class Enabler: def __init__(self): self._state = False @@ -30,9 +30,9 @@ def state(self, val): custom_autograd_function_enabler = Enabler() + # Legacy API to enable the custom autograd, keep its name with default value for compatibility. def enable_custom_autograd_support(to_enable=True): - import atexit from torch.onnx import register_custom_op_symbolic, unregister_custom_op_symbolic @@ -64,7 +64,7 @@ def enable_custom_autograd_support(to_enable=True): # This is for the latest Pytorch nightly after this commit: # https://github.com/pytorch/pytorch/commit/11bc435622e6b7207bbf37ed1aafe999e1f296ec register_custom_op_symbolic("prim::PythonOp", _export, 1) - except: + except Exception: # This applies to Pytorch 1.9 and 1.9.1. 
register_custom_op_symbolic("::prim_PythonOp", _export, 1) @@ -76,15 +76,15 @@ def enable_custom_autograd_support(to_enable=True): # This is for the latest Pytorch nightly after this commit: # https://github.com/pytorch/pytorch/commit/11bc435622e6b7207bbf37ed1aafe999e1f296ec unregister_custom_op_symbolic("prim::PythonOp", 1) - except: + except Exception: # This applies to Pytorch 1.9 and 1.9.1. unregister_custom_op_symbolic("::prim_PythonOp", 1) custom_autograd_function_enabler.state = False -from onnxruntime.capi._pybind_state import is_torch_interop_default_on -from onnxruntime.training import ortmodule +from onnxruntime.capi._pybind_state import is_torch_interop_default_on # noqa: E402 +from onnxruntime.training import ortmodule # noqa: E402 # Enable the custom autograd by default when PythonOp backend support is enabled during build. enable_custom_autograd_support( diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py index 73fcf6b2d1354..0fe070b4c5528 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_exporter.py @@ -11,12 +11,12 @@ from packaging import version from torch.onnx import symbolic_helper -from onnxruntime.capi._pybind_state import register_torch_autograd_function, register_miscellaneous_const_input +from onnxruntime.capi._pybind_state import register_miscellaneous_const_input, register_torch_autograd_function from onnxruntime.training import ortmodule from . import _logger -from ._fallback import ORTModuleONNXModelException, wrap_exception from ._custom_op_symbolic_registry import pytorch_type_to_onnx +from ._fallback import ORTModuleONNXModelException, wrap_exception # Some autograd.Function's shouldn't be exported as PythonOp. # If CheckpointFunction is exported as PythonOp, the checkpointed computation @@ -51,7 +51,7 @@ def _full_name(klass): return module + "." 
+ klass.__qualname__ -def pytorch_type_to_onnx(scalar_type: str) -> torch.onnx.TensorProtoDataType: +def pytorch_type_to_onnx(scalar_type: str) -> torch.onnx.TensorProtoDataType: # noqa: F811 try: return torch.onnx.JitScalarType.from_name(scalar_type).onnx_type() except AttributeError: @@ -220,7 +220,7 @@ def _export_pt_1_10(g, n, *args, **kwargs): except Exception as e: sys.stdout.flush() sys.stderr.flush() - raise wrap_exception(ORTModuleONNXModelException, e) + raise wrap_exception(ORTModuleONNXModelException, e) # noqa: B904 # Starting from PyTorch 1.11, there has been a change to symbolic function signature diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py index 63daf53266291..8a7542d12180e 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_autograd_function_runner.py @@ -4,14 +4,14 @@ # -------------------------------------------------------------------------- import sys -import torch +import torch from torch.utils.dlpack import from_dlpack, to_dlpack -from ._fallback import _FallbackManager, ORTModuleFallbackException, ORTModuleIOError, wrap_exception - from onnxruntime.training.ortmodule.torch_cpp_extensions import torch_interop_utils +from ._fallback import ORTModuleFallbackException, ORTModuleIOError, _FallbackManager, wrap_exception # noqa: F401 + def wrap_as_dlpack_or_not(grad_flag, tensor_flag, inplace_flag, training_mode_flag, arg): """ @@ -139,7 +139,7 @@ def register_context(result): if isinstance(result, torch.Tensor): ctx = register_context([result]) return [ctx, to_dlpack(result)] - elif isinstance(result, tuple) or isinstance(result, list): + elif isinstance(result, (tuple, list)): ctx = register_context(result) wrapped = [ctx] wrapped.extend(list(to_dlpack(value) if value is not None else None for value in result)) @@ -177,7 +177,7 @@ def register_context(result): print("Exception happens when running ", forward_function) sys.stdout.flush() sys.stderr.flush() - raise wrap_exception(ORTModuleFallbackException, e) + raise wrap_exception(ORTModuleFallbackException, e) # noqa: B904 def call_python_backward_function( @@ -202,7 +202,7 @@ def call_python_backward_function( def wrap_all_outputs(result): if isinstance(result, torch.Tensor): return [to_dlpack(result)] - elif isinstance(result, tuple) or isinstance(result, list): + elif isinstance(result, (tuple, list)): return [to_dlpack(value) if value is not None else None for value in result] else: raise wrap_exception( @@ -235,4 +235,4 @@ def wrap_all_outputs(result): print("Exception happens when running ", backward_function) sys.stdout.flush() sys.stderr.flush() - raise wrap_exception(ORTModuleFallbackException, e) + raise wrap_exception(ORTModuleFallbackException, e) # noqa: B904 diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py index 89a766bd36c29..661629b3bb5c6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_gradient_registry.py @@ -61,12 +61,12 @@ class CustomGradientRegistry: @classmethod def register(cls, domain, name, attributes, fn): - key = "::".join([domain, name] + list(attributes)) + key = "::".join([domain, name, *list(attributes)]) 
cls._GRADIENTS[key] = _to_gradient_definition(fn()) @classmethod def register_custom_stop_gradient_edges(cls, edges, domain, name, *attributes): - key = "::".join([domain, name] + list(attributes)) + key = "::".join([domain, name, *list(attributes)]) cls._STOP_GRADIENT_EDGES[key] = set(edges) @classmethod @@ -213,7 +213,7 @@ def adaptive_avg_pool2d_gradient(): @register_gradient("org.pytorch.aten", "ATen", "numpy_T", "") -def numpy_T_gradient(): +def numpy_T_gradient(): # noqa: N802 return [ ( ("ATen", "org.pytorch.aten"), @@ -242,7 +242,7 @@ def native_group_norm_gradient(): def _upsample_gradient(backward_fn, dims): scales = ["" for _ in range(dims)] if "bilinear" in backward_fn: - scales = ["I(2)"] + scales + scales = ["I(2)", *scales] return [ ("Shape", ["I(0)"], ["Shape_X"]), ("Shape", ["O(0)"], ["Shape_Y"]), @@ -251,7 +251,7 @@ def _upsample_gradient(backward_fn, dims): ("Slice", ["Shape_Y", "Const_Start", "Const_End"], ["Sliced_Shape_Y"]), ( ("ATen", "org.pytorch.aten"), - ["GO(0)", "Sliced_Shape_Y", "Shape_X"] + scales, + ["GO(0)", "Sliced_Shape_Y", "Shape_X", *scales], ["GI(0)"], {"operator": {"value": backward_fn, "dtype": "string"}}, ), diff --git a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py index 24b13b4ed134f..07b75008df2da 100644 --- a/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py +++ b/orttraining/orttraining/python/training/ortmodule/_custom_op_symbolic_registry.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -import warnings +import warnings # noqa: F401 import torch import torch.onnx.symbolic_helper as sym_help @@ -13,7 +13,6 @@ from onnxruntime.training import ortmodule - # Mapping from pytorch scalar type to onnx scalar type. _CAST_PYTORCH_TO_ONNX = { "Byte": torch.onnx.TensorProtoDataType.UINT8, @@ -174,7 +173,7 @@ def embedding(g, weight, indices, padding_idx, scale_grad_by_freq, sparse): ) indices_shape = _get_tensor_sizes(indices) if indices_shape is not None and hasattr(weight.type(), "with_sizes"): - output_type = weight.type().with_sizes(indices_shape + [_get_tensor_dim_size(weight, 1)]) + output_type = weight.type().with_sizes([*indices_shape, _get_tensor_dim_size(weight, 1)]) output.setType(output_type) return output @@ -272,7 +271,7 @@ def adaptive_avg_pool2d(g, self, output_size): @register_symbolic("numpy_T") -def numpy_T(g, self): +def numpy_T(g, self): # noqa: N802 # Numpy-style `a.T`: returns the tensor # with dims reversed rank = sym_help._get_tensor_rank(self) @@ -302,7 +301,7 @@ def squeeze(g, self, dim=None): # exporting to Split with SplitGrad as gradient graph. # Exporter will fail to register symbolic with non-empty domain when torch version is < 1.11.0. @register_symbolic("ConstantChunk", "prim", torch_version_start="1.11.0") -def prim_ConstantChunk(g, self, chunks, dim): +def prim_ConstantChunk(g, self, chunks, dim): # noqa: N802 if chunks == 1: return self input_shape_dim = g.op( @@ -551,7 +550,7 @@ def einsum_internal(g, equation, tensor_list): # After process contraction labels, contraction_labels = [k], # label_perm_map = {(s, 0), (m, 1), (k, 2)}, out_size = 2, perm_size = 3. 
out_size = len(result_labels) - label_perm_map = dict([(label, idx) for idx, label in enumerate(result_labels)]) + label_perm_map = {label: idx for idx, label in enumerate(result_labels)} perm_size = out_size contraction_labels = [] lhs_reduce_sum_axes = [] @@ -760,9 +759,9 @@ def group_norm(g, input, num_groups, weight, bias, eps, cudnn_enabled): shape = g.op("Shape", input) size = g.op("Size", input) - N = g.op("Gather", shape, g.op("Constant", value_t=torch.tensor(0, dtype=torch.long)), axis_i=0) - C = g.op("Gather", shape, g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)), axis_i=0) - HxW = g.op("Div", size, g.op("Mul", N, C)) + N = g.op("Gather", shape, g.op("Constant", value_t=torch.tensor(0, dtype=torch.long)), axis_i=0) # noqa: N806 + C = g.op("Gather", shape, g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)), axis_i=0) # noqa: N806 + HxW = g.op("Div", size, g.op("Mul", N, C)) # noqa: N806 return g.op( "org.pytorch.aten::ATen", input, diff --git a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py index be2cf01b1f33b..533fea5a0a721 100644 --- a/orttraining/orttraining/python/training/ortmodule/_execution_agent.py +++ b/orttraining/orttraining/python/training/ortmodule/_execution_agent.py @@ -6,7 +6,7 @@ import onnxruntime from onnxruntime.capi import _pybind_state as C from onnxruntime.capi._pybind_state import TrainingAgent as C_TrainingAgent -from onnxruntime.capi.onnxruntime_inference_collection import IOBinding, OrtValue +from onnxruntime.capi.onnxruntime_inference_collection import IOBinding, OrtValue # noqa: F401 class ExecutionAgentOutput: # pylint: disable=R0903 @@ -81,7 +81,7 @@ def run_forward(self, iobinding, run_options): return ExecutionAgentOutput(ortvalues) -class TrainingAgent(object): +class TrainingAgent: """ This is the main class used to run an ORTModule model training. """ diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback.py b/orttraining/orttraining/python/training/ortmodule/_fallback.py index 7129e522b8c49..c3d7c1154bbf5 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback.py @@ -3,24 +3,23 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- -from . import _logger - import os -import torch import warnings - from enum import IntFlag from typing import Optional + +import torch + +from . import _logger, _utils +from ._fallback_exceptions import wrap_exception # noqa: F401 from ._fallback_exceptions import ( + ORTModuleDeviceException, ORTModuleFallbackException, ORTModuleInitException, - ORTModuleDeviceException, ORTModuleIOError, - ORTModuleTorchModelException, ORTModuleONNXModelException, - wrap_exception, + ORTModuleTorchModelException, ) -from . 
import _utils class _FallbackPolicy(IntFlag): @@ -52,7 +51,7 @@ def is_disabled(self): return _FallbackPolicy.FALLBACK_DISABLE in self -class _FallbackManager(object): +class _FallbackManager: """Manages fallbacks based on incoming exceptions and specified policies The basic algorithm is based on a dictionary whose keys are the supported fallback policies @@ -69,7 +68,6 @@ class _FallbackManager(object): """ def __init__(self, pytorch_module: torch.nn.Module, policy: _FallbackPolicy, retry: bool): - self._original_module = pytorch_module # Read policy from environment variable for testing purposes @@ -133,7 +131,6 @@ def _set_exception(policy: _FallbackPolicy, exception: Exception, log_level: _lo and type(exception) in self._policy_exception_map[policy.value] ) ): - if log_level <= _logger.LogLevel.INFO: warnings.warn(f"Fallback for policy {policy.name} is pending.", UserWarning) diff --git a/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py b/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py index 3bb88cdebee18..12780016a9ab1 100644 --- a/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py +++ b/orttraining/orttraining/python/training/ortmodule/_fallback_exceptions.py @@ -3,7 +3,7 @@ # _fallback_exceptions.py -class ORTModuleFallbackException(Exception): +class ORTModuleFallbackException(Exception): # noqa: N818 """Base exception class for fallback Although it must be specialized for specific scenarios, diff --git a/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py b/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py index 501a559a6ee0c..db23c142f47f6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_gradient_accumulation_manager.py @@ -2,11 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- -from . import _utils from onnxruntime.capi import _pybind_state as C +from . import _utils + -class GradientAccumulationManager(object): +class GradientAccumulationManager: """Handles Gradient accumulation optimization during training This feature must be enabled once before training and cannot be turned off within a training run. 
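The `_gradient_accumulation_manager.py` hunk above only reorders imports. A short sketch of the grouping that isort/ruff enforce throughout this patch (standard library, then third-party, then first-party, each group separated by a blank line); the module names in the comments are placeholders:

    # Group 1: standard library imports.
    import os
    import sys

    # Group 2: third-party packages (e.g. numpy, torch) would sit here,
    # separated from the standard-library group by a blank line.

    # Group 3: first-party / relative imports (e.g. `from . import _utils`)
    # come last, again after a blank line.

    print(sys.version.split()[0], os.sep)  # trivial use so the imports are not unused
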
diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_interface.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_interface.py index f182d24b52f28..5624d59cab22f 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_interface.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_interface.py @@ -5,7 +5,7 @@ from abc import ABC -class GraphExecutionInterface(ABC): +class GraphExecutionInterface(ABC): # noqa: B024 def __init__(self, module): self._original_module = module self._validate_module_type(module) diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 8bb1eee641c75..ce8f1018119a3 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -8,7 +8,7 @@ import io import os import warnings -from abc import ABC, abstractmethod +from abc import ABC, abstractmethod # noqa: F401 from enum import IntFlag from functools import reduce @@ -36,7 +36,7 @@ from .torch_cpp_extensions.cpu.aten_op_executor import load_aten_op_executor_cpp_extension -class _RunStateInfo(object): +class _RunStateInfo: def __init__(self, state, output_info): """ :param state: State of partial run that contains intermediate tensors needed to resume the run later. @@ -72,7 +72,7 @@ class GraphExecutionManager(GraphExecutionInterface): def __init__(self, module, debug_options: DebugOptions, fallback_manager: _FallbackManager): """Manages construction and execution of ONNX graphs""" - super(GraphExecutionManager, self).__init__(module._original_module) + super().__init__(module._original_module) # IMPORTANT: Debug and Fallback must the configured first self._debug_options = debug_options @@ -167,7 +167,7 @@ def __init__(self, module, debug_options: DebugOptions, fallback_manager: _Fallb "The model's forward method has **kwargs parameter which has EXPERIMENTAL support!", UserWarning ) - self.is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False + self.is_rocm_pytorch = bool(torch.version.hip is not None and ROCM_HOME is not None) self._use_external_gpu_allocator = True # assign self._torch_alloc and self._torch_free if self._use_external_gpu_allocator is True @@ -239,7 +239,7 @@ def execution_session_run_forward(execution_session, onnx_model, device, *inputs run_info: A _RunStateInfo which contains extra information about the execution of the graph """ - raise NotImplemented + raise NotImplementedError @abstractmethod def forward(self): @@ -424,7 +424,7 @@ def _get_exported_model(self, input_schema, *inputs, **kwargs): **self._export_extra_kwargs, ) except Exception as e: - raise wrap_exception( + raise wrap_exception( # noqa: B904 ORTModuleONNXModelException, RuntimeError( f"There was an error while exporting the PyTorch model to ONNX: " diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py index 216417249bd20..95849a3335fe2 100644 --- a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager_factory.py @@ -3,13 +3,13 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from ._training_manager import TrainingManager +from ._fallback import _FallbackManager from ._inference_manager import InferenceManager +from ._training_manager import TrainingManager from .debug_options import DebugOptions -from ._fallback import _FallbackManager -class GraphExecutionManagerFactory(object): +class GraphExecutionManagerFactory: def __init__(self, module, debug_options: DebugOptions, fallback_manager: _FallbackManager): self._training_manager = TrainingManager(module, debug_options, fallback_manager) self._inference_manager = InferenceManager(module, debug_options, fallback_manager) diff --git a/orttraining/orttraining/python/training/ortmodule/_io.py b/orttraining/orttraining/python/training/ortmodule/_io.py index 6bb1a9fa51da7..a1c083db2e3d5 100644 --- a/orttraining/orttraining/python/training/ortmodule/_io.py +++ b/orttraining/orttraining/python/training/ortmodule/_io.py @@ -11,7 +11,7 @@ import torch -from ._fallback import ORTModuleIOError, ORTModuleONNXModelException, _FallbackManager, wrap_exception +from ._fallback import ORTModuleIOError, ORTModuleONNXModelException, _FallbackManager, wrap_exception # noqa: F401 from ._utils import warn_of_constant_inputs @@ -66,7 +66,7 @@ def symbolic(g, self): return g.op("Identity", self) -class _PrimitiveType(object): +class _PrimitiveType: _primitive_types = {int, bool, float} @staticmethod @@ -110,7 +110,7 @@ def _flatten_kwargs(value, name): return flattened_kwargs -class _InputInfo(object): +class _InputInfo: def __init__( self, names, @@ -224,7 +224,7 @@ def _expand_inputs(current_input, non_none_inputs, name=""): if inp is None: # Registered buffers are translated to user_input+initializer in ONNX - try: + try: # noqa: SIM105 inp = buffer_names_dict[name] except KeyError: # ONNX input name is not present in the registered buffer dict. @@ -268,7 +268,7 @@ def extract_tensor(value): return sample_inputs_copy, sample_kwargs_copy -class _TensorStub(object): +class _TensorStub: """Tensor stub class used to represent model's input or output""" __slots__ = ["name", "dtype", "shape", "shape_dims"] @@ -302,7 +302,7 @@ def __eq__(self, other): if not other: return False elif not isinstance(other, _TensorStub): - raise NotImplemented("_TensorStub must only be compared to another _TensorStub instance!") + raise NotImplementedError("_TensorStub must only be compared to another _TensorStub instance!") elif self.name != other.name: return False elif self.dtype != other.dtype: @@ -466,7 +466,7 @@ def _flatten_data(data, flat_data): class _FlattenedModule(torch.nn.Module): def __init__(self, original_module): - super(_FlattenedModule, self).__init__() + super().__init__() self._original_module = original_module # Before `forward` is called, _ort_module must be assigned diff --git a/orttraining/orttraining/python/training/ortmodule/_logger.py b/orttraining/orttraining/python/training/ortmodule/_logger.py index 66e1cb556538f..f3d4d930746b6 100644 --- a/orttraining/orttraining/python/training/ortmodule/_logger.py +++ b/orttraining/orttraining/python/training/ortmodule/_logger.py @@ -3,12 +3,13 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -from onnxruntime.capi._pybind_state import Severity -from contextlib import contextmanager -from enum import IntEnum import io import sys import warnings +from contextlib import contextmanager +from enum import IntEnum + +from onnxruntime.capi._pybind_state import Severity class LogLevel(IntEnum): diff --git a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py index d9d936998edea..c55f2f0720580 100644 --- a/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py +++ b/orttraining/orttraining/python/training/ortmodule/_runtime_inspector.py @@ -4,11 +4,12 @@ # -------------------------------------------------------------------------- import warnings + import onnx import torch - from onnx import helper from onnx import onnx_pb as onnx_proto + from onnxruntime.training import ortmodule diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py index 41d82eded40c1..d2954a287e804 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_factory.py @@ -2,9 +2,9 @@ # Licensed under the MIT License. # _torch_module_factory.py +from ._fallback import _FallbackManager from ._torch_module_ort import TorchModuleORT from .debug_options import DebugOptions -from ._fallback import _FallbackManager class TorchModuleFactory: diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py index 6d7a9db2433a0..897bf89c15063 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_interface.py @@ -3,9 +3,9 @@ # _torch_module_interface.py from collections import OrderedDict -import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from typing import Callable, Iterator, Optional, Tuple, TypeVar +import torch T = TypeVar("T", bound="torch.nn.Module") @@ -99,5 +99,5 @@ def named_modules(self, *args, **kwargs): def _replicate_for_data_parallel(self): raise NotImplementedError(f"_replicate_for_data_parallel is not implemented for {type(self)}.") - def add_module(self, name: str, module: Optional["Module"]) -> None: + def add_module(self, name: str, module: Optional["Module"]) -> None: # noqa: F821 raise NotImplementedError(f"add_module is not implemented for {type(self)}.") diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py index ea0676b12587c..f9798f0c70549 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_ort.py @@ -2,15 +2,16 @@ # Licensed under the MIT License. # _torch_module_ort.py -from . 
import _io, _utils -from .debug_options import DebugOptions -from ._graph_execution_manager_factory import GraphExecutionManagerFactory -from ._torch_module_interface import TorchModuleInterface -from ._fallback import _FallbackManager, ORTModuleTorchModelException, wrap_exception from collections import OrderedDict +from typing import Callable, Iterator, Optional, Tuple, TypeVar + import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from . import _io, _utils +from ._fallback import ORTModuleTorchModelException, _FallbackManager, wrap_exception +from ._graph_execution_manager_factory import GraphExecutionManagerFactory +from ._torch_module_interface import TorchModuleInterface +from .debug_options import DebugOptions T = TypeVar("T", bound="torch.nn.Module") @@ -145,7 +146,7 @@ def _replicate_for_data_parallel(self): ), ) - def add_module(self, name: str, module: Optional["Module"]) -> None: + def add_module(self, name: str, module: Optional["Module"]) -> None: # noqa: F821 raise wrap_exception( ORTModuleTorchModelException, NotImplementedError("ORTModule does not support adding modules to it.") ) diff --git a/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py b/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py index 44a43b2429e1c..9f7fb1d0dcd16 100644 --- a/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py +++ b/orttraining/orttraining/python/training/ortmodule/_torch_module_pytorch.py @@ -2,12 +2,12 @@ # Licensed under the MIT License. # _torch_module_pytorch.py -from ._torch_module_interface import TorchModuleInterface - from collections import OrderedDict +from typing import Callable, Iterator, Optional, Tuple, TypeVar + import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable +from ._torch_module_interface import TorchModuleInterface T = TypeVar("T", bound="torch.nn.Module") @@ -83,7 +83,7 @@ def named_modules(self, *args, **kwargs): def _replicate_for_data_parallel(self): return self._original_module._replicate_for_data_parallel() - def add_module(self, name: str, module: Optional["Module"]) -> None: + def add_module(self, name: str, module: Optional["Module"]) -> None: # noqa: F821 self._original_module.add_module(name, module) @TorchModuleInterface.module.getter diff --git a/orttraining/orttraining/python/training/ortmodule/_training_manager.py b/orttraining/orttraining/python/training/ortmodule/_training_manager.py index 0b3ce5efe8108..5ec665f30f034 100644 --- a/orttraining/orttraining/python/training/ortmodule/_training_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_training_manager.py @@ -390,7 +390,7 @@ def _reinitialize_graph_builder(self, input_info): return False def __getstate__(self): - state = super(TrainingManager, self).__getstate__() + state = super().__getstate__() # Only top level classes are pickleable. So, _ORTModuleFunction is # not pickleable. 
So, let's not pickle it, and redefine it when @@ -399,6 +399,6 @@ def __getstate__(self): return state def __setstate__(self, state): - super(TrainingManager, self).__setstate__(state) + super().__setstate__(state) _utils.reinitialize_training_manager(self) diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index d256c91810ec2..0151cd532a7e9 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -12,11 +12,11 @@ import traceback import types import warnings -from typing import List -from packaging.version import Version as LooseVersion +from typing import List # noqa: F401 import numpy as np import torch +from packaging.version import Version as LooseVersion from torch._C import _from_dlpack from torch.utils.dlpack import to_dlpack @@ -67,7 +67,7 @@ def _ortvalues_to_torch_tensor(ortvalues, device=None): if len(ortvalues) == 0: return tuple() - if device is not None and "ort" == device.type: + if device is not None and device.type == "ort": if not hasattr(C, "to_aten_ort_device_tensor"): raise AttributeError("onnxruntime is missing to_aten_ort_device_tensor needed to support device == 'ort'.") return tuple(C.to_aten_ort_device_tensor(ov) for ov in ortvalues) @@ -239,7 +239,6 @@ def check_for_name_collisions_and_bind_methods_to_ortmodule(ortmodule: torch.nn. or not inspect.ismethod(torch_module_attributes[attribute_name]) or attribute.__func__ != torch_module_attributes[attribute_name].__func__ ): - # forward is expected to be defined by the user. if attribute_name == "forward": continue @@ -294,7 +293,6 @@ def get_state_after_deletion_of_non_ortmodule_methods(ortmodule, user_module): and inspect.ismethod(ortmodule_attributes[attribute_name]) and attribute.__func__ == ortmodule_attributes[attribute_name].__func__ ): - # forward is expected to be defined by the user. if attribute_name == "forward": continue @@ -316,7 +314,7 @@ def get_exception_as_string(exception): try: raise exception - except: + except Exception: return traceback.format_exc() diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/__init__.py b/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/__init__.py index 6cb1c95ea4b1e..f89c0ddf8eaa2 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. # __init__.py -from ._hierarchical_ortmodule import HierarchicalORTModule +from ._hierarchical_ortmodule import HierarchicalORTModule # noqa: F401 diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py b/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py index adb2edefad4c7..cb1715ccd80f7 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/hierarchical_ortmodule/_hierarchical_ortmodule.py @@ -13,9 +13,7 @@ # nn.Module's in this set are considered exportable to ONNX. # For other nn.Module's, torch.onnx.export is called to check if # they are exportable. 
-_force_exportable_set = set( - [torch.nn.Linear, torch.nn.Identity, torch.nn.modules.linear.NonDynamicallyQuantizableLinear] -) +_force_exportable_set = {torch.nn.Linear, torch.nn.Identity, torch.nn.modules.linear.NonDynamicallyQuantizableLinear} class _IteratedORTModule(torch.nn.Module): @@ -32,7 +30,7 @@ class _IteratedORTModule(torch.nn.Module): """ def __init__(self, module, count, log_level, save_onnx, onnx_prefix): - super(_IteratedORTModule, self).__init__() + super().__init__() assert count > 1 self._count = count self._it = count - 1 @@ -90,7 +88,7 @@ def custom_forward(x_): def __init__(self, module, debug_options=None): self._initialized = False - super(HierarchicalORTModule, self).__init__() + super().__init__() self._original_module = module self._log_level = debug_options.logging.log_level if debug_options else LogLevel.ERROR self._save_onnx = debug_options.save_onnx_models.save if debug_options else False diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/__init__.py b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/__init__.py index 2f7322394f0de..d4399b1ee9c09 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/__init__.py @@ -5,4 +5,4 @@ # JSON global constants goes here JSON_PATH_ENVIRONMENT_KEY = "ORTMODULE_JSON_CONFIG_PATH" -from ._load_config_from_json import load_from_json +from ._load_config_from_json import load_from_json # noqa: E402, F401 diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py index f251df27360ee..2f1451497ffcd 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py @@ -3,17 +3,18 @@ # _load_config_from_json.py import json -import os import logging +import os +from functools import reduce from types import SimpleNamespace from onnxruntime.capi import _pybind_state as C -from functools import reduce -from . import JSON_PATH_ENVIRONMENT_KEY +from onnxruntime.training import ortmodule + from ..._fallback import _FallbackPolicy from ..._graph_execution_manager import _SkipCheck from ...debug_options import DebugOptions, LogLevel, _SaveOnnxOptions -from onnxruntime.training import ortmodule +from . import JSON_PATH_ENVIRONMENT_KEY log = logging.getLogger(__name__) diff --git a/orttraining/orttraining/python/training/ortmodule/ortmodule.py b/orttraining/orttraining/python/training/ortmodule/ortmodule.py index 90f88459fc077..85508601722d6 100644 --- a/orttraining/orttraining/python/training/ortmodule/ortmodule.py +++ b/orttraining/orttraining/python/training/ortmodule/ortmodule.py @@ -2,9 +2,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
# -------------------------------------------------------------------------- - +# isort: skip_file +# Import ordering is important in this module to aviod circular dependencies from ._torch_module_factory import TorchModuleFactory -from ._torch_module_pytorch import TorchModulePytorch from ._torch_module_ort import TorchModuleORT from ._custom_op_symbolic_registry import CustomOpSymbolicRegistry from ._custom_gradient_registry import CustomGradientRegistry @@ -16,11 +16,10 @@ from onnxruntime.tools import pytorch_export_contrib_ops import torch -from typing import Iterator, Optional, Tuple, TypeVar, Callable - +from typing import Iterator, Optional, OrderedDict, Tuple, TypeVar, Callable # Needed to override PyTorch methods -T = TypeVar("T", bound="Module") +T = TypeVar("T", bound="torch.nn.Module") class ORTModule(torch.nn.Module): @@ -35,7 +34,6 @@ class ORTModule(torch.nn.Module): """ def __init__(self, module, debug_options=None): - # NOTE: torch.nn.Modules that call setattr on their internal attributes regularly # (for example PyTorch Lightning), will trigger regular re-exports. This is # because ORTModule auto detects such setattrs on the original module and @@ -63,7 +61,7 @@ def __init__(self, module, debug_options=None): if ortmodule._FALLBACK_INIT_EXCEPTION: raise ortmodule._FALLBACK_INIT_EXCEPTION - super(ORTModule, self).__init__() + super().__init__() self._torch_module = TorchModuleFactory()(module, debug_options, self._fallback_manager) @@ -149,7 +147,7 @@ def _replicate_for_data_parallel(self): return self._torch_module._replicate_for_data_parallel() - def add_module(self, name: str, module: Optional["Module"]) -> None: + def add_module(self, name: str, module: Optional[torch.nn.Module]) -> None: """Raises a ORTModuleTorchModelException exception since ORTModule does not support adding modules to it""" self._torch_module.add_module(name, module) @@ -180,7 +178,7 @@ def _apply(self, fn): self._torch_module._apply(fn) return self - def apply(self: T, fn: Callable[["Module"], None]) -> T: + def apply(self: T, fn: Callable[[torch.nn.Module], None]) -> T: """Override :meth:`~torch.nn.Module.apply` to delegate execution to ONNX Runtime""" self._torch_module.apply(fn) @@ -207,7 +205,7 @@ def state_dict(self, destination=None, prefix="", keep_vars=False): return self._torch_module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) - def load_state_dict(self, state_dict: "OrderedDict[str, Tensor]", strict: bool = True): + def load_state_dict(self, state_dict: "OrderedDict[str, torch.Tensor]", strict: bool = True): """Override :meth:`~torch.nn.Module.load_state_dict` to delegate execution to ONNX Runtime""" return self._torch_module.load_state_dict(state_dict, strict=strict) @@ -261,12 +259,12 @@ def _load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) - def named_children(self) -> Iterator[Tuple[str, "Module"]]: + def named_children(self) -> Iterator[Tuple[str, torch.nn.Module]]: """Override :meth:`~torch.nn.Module.named_children`""" yield from self._torch_module.named_children() - def modules(self) -> Iterator["Module"]: + def modules(self) -> Iterator[torch.nn.Module]: """Override :meth:`~torch.nn.Module.modules`""" yield from self._torch_module.modules() @@ -284,16 +282,14 @@ def __getattr__(self, name: str): assert "_torch_module" in self.__dict__, "ORTModule does not have a reference to the user's model" return getattr(self.module, name) else: - return super(ORTModule, self).__getattr__(name) + return 
super().__getattr__(name) def __setattr__(self, name: str, value) -> None: - if name in self.__dict__: # If the name is an attribute of ORTModule, update only ORTModule self.__dict__[name] = value elif "_is_initialized" in self.__dict__ and self.__dict__["_is_initialized"] is True: - assert "_torch_module" in self.__dict__, "ORTModule does not have a reference to the user's model" # If the name is an attribute of user model, or is a new attribute, update there. diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/__init__.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/__init__.py index d6e78073f05d2..419b95bdb0003 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/__init__.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/__init__.py @@ -4,7 +4,7 @@ def clear_all_grad_fns(): torch_interop_utils.clear_all_grad_fns() -import atexit +import atexit # noqa: E402 # Clear all gradient functions, to avoid a deadlock issue. # Check the called function for more detailed comments. diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py index 0ab8a0c1899e2..3b6d6050c4c17 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/setup.py @@ -5,7 +5,7 @@ import os -from setuptools import Extension, setup +from setuptools import Extension, setup # noqa: F401 from torch.utils import cpp_extension filename = os.path.join(os.path.dirname(__file__), "torch_interop_utils.cc") diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py index b73623c430525..6b028d8f05e11 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/setup.py @@ -3,9 +3,9 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- -import fileinput +import fileinput # noqa: F401 import os -import sys +import sys # noqa: F401 from setuptools import setup from torch.utils import cpp_extension @@ -18,7 +18,7 @@ os.path.join(os.path.dirname(__file__), "multi_tensor_l2norm_kernel.cu"), ] -use_rocm = True if os.environ["ONNXRUNTIME_ROCM_VERSION"] else False +use_rocm = bool(os.environ["ONNXRUNTIME_ROCM_VERSION"]) extra_compile_args = {"cxx": ["-O3"]} if not use_rocm: nvcc_extra_args = os.environ.get("ONNXRUNTIME_CUDA_NVCC_EXTRA_ARGS", "") diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py index 99f6699dca6a6..61bda26f2dc6b 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/setup.py @@ -11,7 +11,7 @@ from torch.utils import cpp_extension # TODO: Implement a cleaner way to auto-generate torch_gpu_allocator.cc -use_rocm = True if os.environ["ONNXRUNTIME_ROCM_VERSION"] else False +use_rocm = bool(os.environ["ONNXRUNTIME_ROCM_VERSION"]) gpu_identifier = "hip" if use_rocm else "cuda" gpu_allocator_header = "HIPCachingAllocator" if use_rocm else "CUDACachingAllocator" filename = os.path.join(os.path.dirname(__file__), "torch_gpu_allocator.cc") diff --git a/orttraining/orttraining/python/training/orttrainer.py b/orttraining/orttraining/python/training/orttrainer.py index bdf6a1e9e1ea1..a6c6c8af2723b 100644 --- a/orttraining/orttraining/python/training/orttrainer.py +++ b/orttraining/orttraining/python/training/orttrainer.py @@ -12,11 +12,12 @@ import onnxruntime as ort from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference -from . import ORTTrainerOptions, _checkpoint_storage, _utils, amp, checkpoint, optim, postprocess +from . import _checkpoint_storage, _utils, amp, checkpoint, optim, postprocess from .model_desc_validation import _ORTTrainerModelDesc +from .orttrainer_options import ORTTrainerOptions -class TrainStepInfo(object): +class TrainStepInfo: r"""Private class used to store runtime information from current train step. After every train step, :py:meth:`ORTTrainer.train_step` updates the internal instance of @@ -44,7 +45,7 @@ class TrainStepInfo(object): """ - def __init__(self, optimizer_config, all_finite=True, fetches=[], optimization_step=0, step=0): + def __init__(self, optimizer_config, all_finite=True, fetches=[], optimization_step=0, step=0): # noqa: B006 assert isinstance(optimizer_config, optim._OptimizerConfig), "optimizer_config must be a optim._OptimizerConfig" assert isinstance(all_finite, bool), "all_finite must be a bool" assert isinstance(fetches, list) and all( @@ -60,7 +61,7 @@ def __init__(self, optimizer_config, all_finite=True, fetches=[], optimization_s self.step = step -class ORTTrainer(object): +class ORTTrainer: r"""Pytorch frontend for ONNX Runtime training Entry point that exposes the C++ backend of ORT as a Pytorch frontend. 
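The orttrainer.py hunks around this point repeatedly touch the B006 lint rule (flake8-bugbear's mutable-default-argument check): some signatures are annotated with `# noqa: B006` (TrainStepInfo.__init__, _init_session, save_checkpoint), while _create_ort_training_session below is rewritten to the None-sentinel pattern. As a minimal, hypothetical sketch of why that rewrite matters (the helper names here are illustrative, not code from this patch): a default list is evaluated once at function-definition time, so every call that relies on it shares the same object.

    # Hypothetical example of the B006 pitfall (not part of this patch).
    def append_fetch(name, fetches=[]):  # default list created once, at def time
        fetches.append(name)
        return fetches

    append_fetch("loss")        # ['loss']
    append_fetch("all_finite")  # ['loss', 'all_finite'] -- state leaks across calls

    # None-sentinel rewrite, the same shape _create_ort_training_session adopts below.
    def append_fetch_safe(name, fetches=None):
        if fetches is None:
            fetches = []  # fresh list per call
        fetches.append(name)
        return fetches

    append_fetch_safe("loss")        # ['loss']
    append_fetch_safe("all_finite")  # ['all_finite']

The `# noqa: B006` cases keep the old behavior unchanged and only silence the warning; the sentinel version is the behavioral fix.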
@@ -203,7 +204,7 @@ def __init__(self, model, model_desc, optim_config, loss_fn=None, options=None): try: from torch.utils.cpp_extension import ROCM_HOME - self.is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False + self.is_rocm_pytorch = bool(torch.version.hip is not None and ROCM_HOME is not None) except ImportError: self.is_rocm_pytorch = False @@ -298,7 +299,7 @@ def save_as_onnx(self, path): def _check_model_export(self, input): from numpy.testing import assert_allclose - from onnx import TensorProto, helper, numpy_helper + from onnx import TensorProto, helper, numpy_helper # noqa: F401 onnx_model_copy = copy.deepcopy(self._onnx_model) @@ -496,7 +497,7 @@ def forward(self, *inputs): sig = signature(self.model.forward) input_dict = {} - for key in sig.parameters.keys(): + for key in sig.parameters: if key in self.input_names: input_dict[key] = inputs[self.input_names.index(key)] @@ -582,7 +583,9 @@ def forward(self, *inputs): return onnx_model - def _create_ort_training_session(self, optimizer_state_dict={}, session_options=None, provider_options=None): + def _create_ort_training_session(self, optimizer_state_dict=None, session_options=None, provider_options=None): + if optimizer_state_dict is None: + optimizer_state_dict = {} # Validating frozen_weights names unused_frozen_weights = [ n @@ -590,9 +593,7 @@ def _create_ort_training_session(self, optimizer_state_dict={}, session_options= if n not in [i.name for i in self._onnx_model.graph.initializer] ] if unused_frozen_weights: - raise RuntimeError( - "{} params from 'frozen_weights' not found in the ONNX model.".format(unused_frozen_weights) - ) + raise RuntimeError(f"{unused_frozen_weights} params from 'frozen_weights' not found in the ONNX model.") # Get loss name from model description loss_name = [item.name for item in self.model_desc.outputs if item.is_loss] @@ -710,6 +711,7 @@ def _create_ort_training_session(self, optimizer_state_dict={}, session_options= # old ort session may already exists and occupies GPU memory when creating new session, this may cause OOM error. # for example, load_state_dict will be called before returing the function, and it calls _init_session again del self._training_session + # Set provider-specific options if needed def get_providers(provider_options): providers = ort.get_available_providers() @@ -729,7 +731,7 @@ def get_providers(provider_options): if gpu_ep_name not in providers: raise RuntimeError( "ORTTrainer options specify a CUDA device but the {} provider is unavailable.".format( - cuda_ep_name + cuda_ep_name # noqa: F821 ) ) @@ -777,7 +779,7 @@ def _init_onnx_model(self, inputs): provider_options=self.options._validated_opts["provider_options"], ) - def _init_session(self, optimizer_state_dict={}, session_options=None, provider_options=None): + def _init_session(self, optimizer_state_dict={}, session_options=None, provider_options=None): # noqa: B006 if self._onnx_model is None: return @@ -850,7 +852,7 @@ def _prepare_model_input(self, inputs_desc, lr, loss_scale, *inputs, **kwargs): # Append input from 'kwargs' for input_desc in inputs_desc: if input_desc.name in kwargs: - input = input + (kwargs[input_desc.name],) + input = (*input, kwargs[input_desc.name]) # Append learning rate extra_inputs = 0 @@ -930,8 +932,8 @@ def _training_session_run_helper(self, is_train, inputs, inputs_desc, outputs_de # to move the data between device and host. # so output will be on the same device as input. 
try: - test_pt_device = torch.device(target_device) - except: + torch.device(target_device) + except Exception: # in this case, input/output must on CPU assert input.device.type == "cpu" target_device = "cpu" @@ -1018,8 +1020,8 @@ def _extract_trainer_options(self, state_dict): world_rank = _utils.state_dict_trainer_options_world_rank_key() world_size = _utils.state_dict_trainer_options_world_size_key() optimizer_name = _utils.state_dict_trainer_options_optimizer_name_key() - D_size = _utils.state_dict_trainer_options_data_parallel_size_key() - H_size = _utils.state_dict_trainer_options_horizontal_parallel_size_key() + D_size = _utils.state_dict_trainer_options_data_parallel_size_key() # noqa: N806 + H_size = _utils.state_dict_trainer_options_horizontal_parallel_size_key() # noqa: N806 state_dict[_utils.state_dict_trainer_options_key()] = {} state_dict[_utils.state_dict_trainer_options_key()][mixed_precision] = self.options.mixed_precision.enabled @@ -1260,7 +1262,7 @@ def _load_model_states(self, state_dict, strict): if state_key in initializer_names: loaded_initializers[state_key] = state_value elif strict: - raise RuntimeError("Unexpected key: {} in state_dict[model][{}]".format(state_key, precision)) + raise RuntimeError(f"Unexpected key: {state_key} in state_dict[model][{precision}]") # update onnx model from loaded initializers self._update_onnx_model_initializers(loaded_initializers) @@ -1280,7 +1282,7 @@ def _check_optimizer_mismatch(state_dict): # optimizer_name can be either a regular string or a byte string. # if it is a byte string, convert to regular string using decode() # if it is a regular string, do nothing to it - try: + try: # noqa: SIM105 optimizer_name = optimizer_name.decode() except AttributeError: pass @@ -1325,9 +1327,9 @@ def _mismatch_keys(keys1, keys2, in_error_str, allow_unexpected=False): missing_keys = list(keys1 - keys2) unexpected_keys = list(keys2 - keys1) if len(missing_keys) > 0: - raise RuntimeError("Missing keys: {} in {}".format(missing_keys, in_error_str)) + raise RuntimeError(f"Missing keys: {missing_keys} in {in_error_str}") if len(unexpected_keys) > 0 and not allow_unexpected: - raise RuntimeError("Unexpected keys: {} in {}".format(unexpected_keys, in_error_str)) + raise RuntimeError(f"Unexpected keys: {unexpected_keys} in {in_error_str}") def _check_model_key_mismatch(current_state_dict, state_dict, allow_unexpected=False): """Check if there is any mismatch in the model sub state dictionary between the two state_dicts""" @@ -1346,7 +1348,7 @@ def _check_model_key_mismatch(current_state_dict, state_dict, allow_unexpected=F _mismatch_keys( current_state_dict[_utils.state_dict_model_key()][precision_key], state_dict[_utils.state_dict_model_key()][precision_key], - "state_dict[model][{}]".format(precision_key), + f"state_dict[model][{precision_key}]", allow_unexpected, ) @@ -1366,7 +1368,7 @@ def _check_optimizer_key_mismatch(current_state_dict, state_dict, allow_unexpect _mismatch_keys( current_state_dict[_utils.state_dict_optimizer_key()][model_state_key], state_dict[_utils.state_dict_optimizer_key()][model_state_key], - "state_dict[optimizer][{}]".format(model_state_key), + f"state_dict[optimizer][{model_state_key}]", allow_unexpected, ) @@ -1394,7 +1396,7 @@ def _check_key_mismatch(current_state_dict, state_dict, allow_unexpected=False): if strict: # for Zero enabled, the current trainer might not have the complete state, and we must allow # extra keys to be present in the state dict - allow_unexpected = True if 
self.options.distributed.deepspeed_zero_optimization.stage > 0 else False + allow_unexpected = self.options.distributed.deepspeed_zero_optimization.stage > 0 _check_key_mismatch(current_state_dict, state_dict, allow_unexpected) # load the model states from the input state dictionary into the onnx graph @@ -1458,7 +1460,7 @@ def load_state_dict(self, state_dict, strict=True): provider_options=self.options._validated_opts["provider_options"], ) - def save_checkpoint(self, path, user_dict={}, include_optimizer_states=True): + def save_checkpoint(self, path, user_dict={}, include_optimizer_states=True): # noqa: B006 """Persists ORTTrainer state dictionary on disk along with user_dict. Saves the state_dict along with the user_dict to a file specified by path. @@ -1521,7 +1523,7 @@ def load_checkpoint(self, *paths, strict=True): state_dict = checkpoint.aggregate_checkpoints(paths, pytorch_format=False) else: # if aggregation is not required, there must only be a single file that needs to be loaded - assert len(paths) == 1, "Expected number of files to load: 1, got {}".format(len(paths)) + assert len(paths) == 1, f"Expected number of files to load: 1, got {len(paths)}" state_dict = _checkpoint_storage.load(paths[0]) # extract user dict from the saved checkpoint diff --git a/orttraining/orttraining/python/training/orttrainer_options.py b/orttraining/orttraining/python/training/orttrainer_options.py index 9e7a2bde4dfa0..fc8322855ddb4 100644 --- a/orttraining/orttraining/python/training/orttrainer_options.py +++ b/orttraining/orttraining/python/training/orttrainer_options.py @@ -1,14 +1,13 @@ import cerberus -import torch import onnxruntime as ort +from onnxruntime.capi._pybind_state import PropagateCastOpsStrategy -from . import PropagateCastOpsStrategy from .amp import loss_scaler from .optim import lr_scheduler -class ORTTrainerOptions(object): +class ORTTrainerOptions: r"""Settings used by ONNX Runtime training backend The parameters are hierarchically organized to facilitate configuration through semantic groups @@ -462,7 +461,7 @@ class ORTTrainerOptions(object): fp16_enabled = opts.mixed_precision.enabled """ - def __init__(self, options={}): + def __init__(self, options={}): # noqa: B006 # Keep a copy of original input for debug self._original_opts = dict(options) @@ -483,7 +482,7 @@ def __init__(self, options={}): def __repr__(self): return "{%s}" % str( ", ".join( - "'%s': %s" % (k, repr(v)) + f"'{k}': {repr(v)}" for (k, v) in self.__dict__.items() if k not in ["_original_opts", "_validated_opts", "_main_class_name"] ) @@ -536,9 +535,9 @@ def _check_is_callable(field, value, error): try: # Python 3 result = value is None or callable(value) - except: + except Exception: # Python 3 but < 3.2 - if hasattr(value, "__call__"): + if hasattr(value, "__call__"): # noqa: B004 result = True if not result: error(field, "Must be callable or None") diff --git a/orttraining/orttraining/python/training/postprocess.py b/orttraining/orttraining/python/training/postprocess.py index ff77a05e41e31..b2da6186b62cf 100644 --- a/orttraining/orttraining/python/training/postprocess.py +++ b/orttraining/orttraining/python/training/postprocess.py @@ -1,12 +1,11 @@ -import sys -import os.path -from onnx import * -import onnx -import numpy as np +import os.path # noqa: F401 import struct +import sys # noqa: F401 -from onnx import helper -from onnx import numpy_helper +import numpy as np # noqa: F401 +import onnx +from onnx import * # noqa: F403 +from onnx import helper, numpy_helper # noqa: F401 def 
run_postprocess(model): @@ -169,7 +168,7 @@ def fix_expand_shape_pt_1_5(model): if n_shape.op_type != "Shape" or n_constant_g.op_type != "Constant": break n_input = n_shape.input[0] - if not n_input in model_inputs_names: + if n_input not in model_inputs_names: break n_input_candidates.append(n_input) @@ -264,7 +263,7 @@ def layer_norm_transform(model): # output graph = model.graph - nodes_ReduceMean = find_nodes(graph, "ReduceMean") + nodes_ReduceMean = find_nodes(graph, "ReduceMean") # noqa: N806 id = 0 layer_norm_nodes = [] @@ -396,7 +395,7 @@ def layer_norm_transform(model): # Fuse SoftmaxCrossEntropy -def fuse_softmaxNLL_to_softmaxCE(onnx_model): +def fuse_softmaxNLL_to_softmaxCE(onnx_model): # noqa: N802 # Converting below subgraph # # (subgraph) @@ -420,7 +419,7 @@ def fuse_softmaxNLL_to_softmaxCE(onnx_model): nll_count = nll_count + 1 nll_loss_node = None nll_loss_node_index = 0 - for nll_loss_node_index, node in enumerate(onnx_model.graph.node): + for nll_loss_node_index, node in enumerate(onnx_model.graph.node): # noqa: B007 if node.op_type == "nll_loss" or node.op_type == "NegativeLogLikelihoodLoss": nll_loss_node = node break @@ -432,7 +431,7 @@ def fuse_softmaxNLL_to_softmaxCE(onnx_model): softmax_node_index = 0 label_input_name = None weight_input_name = None - for softmax_node_index, node in enumerate(onnx_model.graph.node): + for softmax_node_index, node in enumerate(onnx_model.graph.node): # noqa: B007 if node.op_type == "LogSoftmax": # has to be connected to nll_loss if len(nll_loss_node.input) > 2: diff --git a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py index bfc3144317d8a..d224e2ca54306 100644 --- a/orttraining/orttraining/python/training/torchdynamo/ort_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/ort_backend.py @@ -18,13 +18,13 @@ import torch.onnx import torch.onnx._onnx_supported_ops from torch._decomp import decomposition_table +from torch._subclasses.fake_tensor import FakeTensor from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.passes.fake_tensor_prop import FakeTensorProp from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner from torch.fx.passes.operator_support import OperatorSupport from torch.fx.passes.tools_common import CALLABLE_NODE_OPS from torch.onnx._globals import GLOBALS as ONNX_GLOBALS -from torch._subclasses.fake_tensor import FakeTensor import onnxruntime # type: ignore from onnxruntime.capi import _pybind_state as ORTC @@ -103,12 +103,14 @@ def _get_onnx_supported_table() -> Set[str]: return onnx_supported_ops -def _get_support_dictionaries_and_decomposition_tables() -> Tuple[ - Dict[torch._ops.OpOverload, Any], - Dict[str, Any], - Dict[torch._ops.OpOverload, Callable], - Dict[torch._ops.OpOverload, Callable], -]: +def _get_support_dictionaries_and_decomposition_tables() -> ( + Tuple[ + Dict[torch._ops.OpOverload, Any], + Dict[str, Any], + Dict[torch._ops.OpOverload, Callable], + Dict[torch._ops.OpOverload, Callable], + ] +): # The keys of this dictionary are OpOverload's which can be # exported by ONNX exporter. Type of key is torch._ops.OpOverload. # For example, if torch.ops.aten.add.default is a key in support_dict, @@ -265,7 +267,6 @@ def _move_placeholder_to_front(graph_module: torch.fx.GraphModule) -> None: def _replace_to_copy_with_to(fx_module: torch.fx.GraphModule) -> None: # aten._to_copy doesn't have exporter so we replace it with aten.to. 
for node in fx_module.graph.nodes: - if ( isinstance(node.target, torch._ops.OpOverload) and node.target.overloadpacket == torch.ops.aten._to_copy # type: ignore @@ -443,7 +444,7 @@ def _assert_allclose_with_detailed_error_message( max_value = torch.max(torch.abs(actual), torch.abs(expected)) max_value[max_value == 0.0] = 1.0 real_rtol = torch.max(diff / max_value) - allclose = True if real_atol <= atol or real_rtol <= rtol else False + allclose = bool(real_atol <= atol or real_rtol <= rtol) if not allclose: raise RuntimeError( "ONNX output doesn't match baseline output with " diff --git a/orttraining/orttraining/python/training/torchdynamo/register_backend.py b/orttraining/orttraining/python/training/torchdynamo/register_backend.py index 2830a10b4feb7..6f6c0f6575b0b 100644 --- a/orttraining/orttraining/python/training/torchdynamo/register_backend.py +++ b/orttraining/orttraining/python/training/torchdynamo/register_backend.py @@ -5,8 +5,8 @@ from functorch.compile import min_cut_rematerialization_partition from torch._dynamo.backends.common import aot_autograd -from .ort_backend import OrtBackend +from .ort_backend import OrtBackend # This should be the underlying compiler for ALL graphs if # the user uses ORT to accelerate PyTorch via Dynamo. diff --git a/orttraining/orttraining/python/training/utils/data/__init__.py b/orttraining/orttraining/python/training/utils/data/__init__.py index 91207012216d3..7eb9901cf5979 100644 --- a/orttraining/orttraining/python/training/utils/data/__init__.py +++ b/orttraining/orttraining/python/training/utils/data/__init__.py @@ -2,4 +2,4 @@ # Licensed under the MIT License. # __init__.py -from .sampler import LoadBalancingDistributedSampler, LoadBalancingDistributedBatchSampler +from .sampler import LoadBalancingDistributedBatchSampler, LoadBalancingDistributedSampler # noqa: F401 diff --git a/orttraining/orttraining/python/training/utils/data/sampler.py b/orttraining/orttraining/python/training/utils/data/sampler.py index 932f9e76dc13c..4c8852f07c39c 100644 --- a/orttraining/orttraining/python/training/utils/data/sampler.py +++ b/orttraining/orttraining/python/training/utils/data/sampler.py @@ -2,13 +2,14 @@ # Licensed under the MIT License. # sampler.py -import torch import math +from typing import Callable, Iterator, Optional + +import numpy as np +import torch import torch.distributed as dist -from torch.utils.data.sampler import Sampler from torch.utils.data.dataset import Dataset -from typing import Optional, Iterator, Callable -import numpy as np +from torch.utils.data.sampler import Sampler def _shard_wrapped_indices_across_workers(dataset_index_list, num_shards, num_samples_per_shard): @@ -149,7 +150,7 @@ def __init__( self.ordered_sample_complexities = None if random_level < 0.0 or random_level > 1.0: - raise ValueError("Invalid random level {}, shoule be in the range [0.0, 1.0]".format(random_level)) + raise ValueError(f"Invalid random level {random_level}, shoule be in the range [0.0, 1.0]") self.random_level = random_level self.random_number = None diff --git a/orttraining/orttraining/python/training/utils/hooks/__init__.py b/orttraining/orttraining/python/training/utils/hooks/__init__.py index 0944db1067593..34e4a5bd04b81 100644 --- a/orttraining/orttraining/python/training/utils/hooks/__init__.py +++ b/orttraining/orttraining/python/training/utils/hooks/__init__.py @@ -3,7 +3,10 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- - -from ._subscriber_manager import SubscriberManager +__all__ = [ + "StatisticsSubscriber", + "SubscriberManager", +] from ._statistics_subscriber import StatisticsSubscriber +from ._subscriber_manager import SubscriberManager diff --git a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py index d5ecef60301b3..60f6586f1f3b6 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py +++ b/orttraining/orttraining/python/training/utils/hooks/_statistics_subscriber.py @@ -3,12 +3,12 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------- +import os +import shutil +import warnings from pathlib import Path from typing import Union -import os -import warnings -import shutil import torch from ._subscriber_base import SubscriberBase @@ -53,7 +53,6 @@ def __init__( self._output_dir = output_dir if os.path.exists(self._output_dir): if override_output_dir: - warnings.warn(f"Output directory {self._output_dir} already exists, overriding it.") shutil.rmtree(self._output_dir) else: diff --git a/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py b/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py index 55a55d4ce34bb..572f2fb99b3b9 100644 --- a/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py +++ b/orttraining/orttraining/python/training/utils/hooks/_subscriber_base.py @@ -4,9 +4,9 @@ # -------------------------------------------------------------------------- +import sys from typing import Union -import sys import torch diff --git a/orttraining/orttraining/python/training/utils/hooks/merge_activation_summary.py b/orttraining/orttraining/python/training/utils/hooks/merge_activation_summary.py index ba174712d4e1e..2527b81b7e640 100644 --- a/orttraining/orttraining/python/training/utils/hooks/merge_activation_summary.py +++ b/orttraining/orttraining/python/training/utils/hooks/merge_activation_summary.py @@ -19,12 +19,11 @@ """ -from pathlib import Path - import argparse import logging import os import shutil +from pathlib import Path logger = logging.getLogger(__name__) diff --git a/orttraining/orttraining/test/external_custom_ops/setup.py b/orttraining/orttraining/test/external_custom_ops/setup.py index 57ba10b91ad2d..435b83b818380 100644 --- a/orttraining/orttraining/test/external_custom_ops/setup.py +++ b/orttraining/orttraining/test/external_custom_ops/setup.py @@ -1,13 +1,15 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
-import sys import os import subprocess -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext -from subprocess import CalledProcessError -import pybind11 +import sys +from subprocess import CalledProcessError # noqa: F401 + import onnx +import pybind11 +from setuptools import Extension, setup +from setuptools.command.build_ext import build_ext + import onnxruntime @@ -29,12 +31,12 @@ def build_extension(self, ext): "-DPYBIND11_PYTHON_VERSION={}.{}.{}".format( sys.version_info.major, sys.version_info.minor, sys.version_info.micro ), - "-Dpybind11_DIR={}".format(pybind11.get_cmake_dir()), - "-DONNX_INCLUDE={}".format(os.path.dirname(os.path.dirname(onnx.__file__))), + f"-Dpybind11_DIR={pybind11.get_cmake_dir()}", + f"-DONNX_INCLUDE={os.path.dirname(os.path.dirname(onnx.__file__))}", "-DONNXRUNTIME_EXTERNAL_INCLUDE={}".format( os.path.join(os.path.join(os.path.dirname(onnxruntime.__file__), "external"), "include") ), - "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={}".format(extdir), + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", ext.sourcedir, ], cwd=self.build_temp, diff --git a/orttraining/orttraining/test/external_custom_ops/test.py b/orttraining/orttraining/test/external_custom_ops/test.py index 7d3e4edf48bd8..7af665acd4d9c 100644 --- a/orttraining/orttraining/test/external_custom_ops/test.py +++ b/orttraining/orttraining/test/external_custom_ops/test.py @@ -1,18 +1,19 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -import os -import sys +import os # noqa: F401 +import sys # noqa: F401 + import numpy as np +# Restore dlopen flags. +import orttraining_external_custom_ops # noqa: F401 + # Expose available (onnx::* and protobuf::*) symbols from onnxruntime to resolve references in # the custom ops shared library. Deepbind flag is required to avoid conflicts with other # instances of onnx/protobuf libraries. import onnxruntime -# Restore dlopen flags. -import orttraining_external_custom_ops - so = onnxruntime.SessionOptions() sess = onnxruntime.InferenceSession("testdata/model.onnx", so) input = np.random.rand(2, 2).astype(np.float32) diff --git a/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py b/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py index a1377d2448bfd..f57f55d14eb1b 100644 --- a/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py +++ b/orttraining/orttraining/test/external_transformer/test/external_transformers_test.py @@ -3,7 +3,7 @@ import time -class OutputGrabber(object): +class OutputGrabber: """ Class used to grab standard output or another stream. 
""" @@ -78,14 +78,16 @@ def readOutput(self): self.capturedtext += char -import torch -from onnxruntime.capi import _pybind_state as torch_ort_eager -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -import os -from onnxruntime.training import optim, orttrainer, orttrainer_options -import unittest +import os # noqa: E402 +import unittest # noqa: E402 + +import numpy as np # noqa: E402, F401 +import torch # noqa: E402 +import torch.nn as nn # noqa: E402 +import torch.nn.functional as F # noqa: E402 + +from onnxruntime.capi import _pybind_state as torch_ort_eager # noqa: E402, F401 +from onnxruntime.training import optim, orttrainer, orttrainer_options # noqa: E402, F401 def my_loss(x, target): @@ -94,7 +96,7 @@ def my_loss(x, target): class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -134,7 +136,7 @@ def test_external_graph_transformer_triggering(self): target = torch.randint(0, 10, (batch_size,)) with OutputGrabber() as out: - loss = model.train_step(data, target) + model.train_step(data, target) assert "******************Trigger Customized Graph Transformer: MyGraphTransformer!" in out.capturedtext diff --git a/orttraining/orttraining/test/python/_orttraining_ortmodule_models.py b/orttraining/orttraining/test/python/_orttraining_ortmodule_models.py index 12fa318c545ad..b31a31298328b 100644 --- a/orttraining/orttraining/test/python/_orttraining_ortmodule_models.py +++ b/orttraining/orttraining/test/python/_orttraining_ortmodule_models.py @@ -14,7 +14,7 @@ def forward(self, x, custom_class_obj): class NeuralNetSinglePositionalArgument(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetSinglePositionalArgument, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -28,14 +28,14 @@ def forward(self, input1): class NeuralNetCustomClassOutput(torch.nn.Module): - class CustomClass(object): + class CustomClass: def __init__(self, out1, out2, out3): self.out1 = out1 self.out2 = out2 self.out3 = out3 def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetCustomClassOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() diff --git a/orttraining/orttraining/test/python/_test_commons.py b/orttraining/orttraining/test/python/_test_commons.py index 10dfab8834351..1413d59096832 100644 --- a/orttraining/orttraining/test/python/_test_commons.py +++ b/orttraining/orttraining/test/python/_test_commons.py @@ -26,9 +26,9 @@ def is_windows(): return sys.platform.startswith("win") -def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False, env={}, log=None): +def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False, env={}, log=None): # noqa: B006 if log: - log.info("Running subprocess in '{0}'\n{1}".format(cwd or os.getcwd(), args)) + log.info(f"Running subprocess in '{cwd or os.getcwd()}'\n{args}") my_env = os.environ.copy() if dll_path: if is_windows(): @@ -93,7 +93,7 @@ def legacy_poly_lr_scheduler(global_step, initial_lr, total_steps, warmup, power def generate_dummy_optim_state(model, optimizer): np.random.seed(0) - if not (isinstance(optimizer, optim.AdamConfig) or isinstance(optimizer, optim.LambConfig)): + if not 
(isinstance(optimizer, (optim.AdamConfig, optim.LambConfig))): return dict() moment_keys = ["Moment_1", "Moment_2"] @@ -167,7 +167,7 @@ def generate_random_input_from_bart_model_desc(desc, seed=1, device="cuda:0"): dtype = torch.int64 vocab_size = 30528 sample_input = [] - for index, input in enumerate(desc["inputs"]): + for _index, input in enumerate(desc["inputs"]): size = [] for s in input[1]: if isinstance(s, (int)): diff --git a/orttraining/orttraining/test/python/_test_helpers.py b/orttraining/orttraining/test/python/_test_helpers.py index 73f2940467f57..33dbe66b96069 100644 --- a/orttraining/orttraining/test/python/_test_helpers.py +++ b/orttraining/orttraining/test/python/_test_helpers.py @@ -11,7 +11,9 @@ try: from onnxruntime.training.ortmodule import ORTModule from onnxruntime.training.ortmodule._fallback import ORTModuleInitException - from onnxruntime.training.ortmodule._graph_execution_manager_factory import GraphExecutionManagerFactory + from onnxruntime.training.ortmodule._graph_execution_manager_factory import ( # noqa: F401 + GraphExecutionManagerFactory, + ) except ImportError: # Some pipelines do not contain ORTModule pass @@ -57,11 +59,11 @@ def assert_model_outputs(output_a, output_b, verbose=False, rtol=1e-7, atol=0): assert isinstance(output_a, list) and isinstance(output_b, list), "output_a and output_b must be list of numbers" if len(output_a) != len(output_b): raise AssertionError( - "output_a and output_b must have the same length (%r != %r)." % (len(output_a), len(output_b)) + f"output_a and output_b must have the same length ({len(output_a)!r} != {len(output_b)!r})." ) # for idx in range(len(output_a)): - assert_allclose(output_a, output_b, rtol=rtol, atol=atol, err_msg=f"Model output value mismatch") + assert_allclose(output_a, output_b, rtol=rtol, atol=atol, err_msg="Model output value mismatch") def assert_onnx_weights(model_a, model_b, verbose=False, rtol=1e-7, atol=0): @@ -115,7 +117,7 @@ def _assert_state_dict_weights(state_dict_a, state_dict_b, verbose, rtol, atol): atol (float, default is 1e-4): Max absolute difference """ - for (a_name, a_val), (b_name, b_val) in zip(state_dict_a.items(), state_dict_b.items()): + for (a_name, a_val), (_b_name, b_val) in zip(state_dict_a.items(), state_dict_b.items()): np_a_vals = np.array(a_val).flatten() np_b_vals = np.array(b_val).flatten() assert np_a_vals.shape == np_b_vals.shape @@ -192,13 +194,13 @@ def _get_name(name): res = os.path.join(data, name) if os.path.exists(res): return res - raise FileNotFoundError("Unable to find '{0}' or '{1}' or '{2}'".format(name, rel, res)) + raise FileNotFoundError(f"Unable to find '{name}' or '{rel}' or '{res}'") # Depending on calling backward() from which outputs, it's possible that grad of some weights are not calculated. # none_pt_params is to tell what these weights are, so we will not compare the tensors. 
def assert_gradients_match_and_reset_gradient( - ort_model, pt_model, none_pt_params=[], reset_gradient=True, rtol=1e-04, atol=1e-05 + ort_model, pt_model, none_pt_params=[], reset_gradient=True, rtol=1e-04, atol=1e-05 # noqa: B006 ): ort_named_params = list(ort_model.named_parameters()) pt_named_params = list(pt_model.named_parameters()) @@ -225,10 +227,10 @@ def assert_values_are_close(input, other, rtol=1e-04, atol=1e-05): if not are_close: abs_diff = torch.abs(input - other) abs_other = torch.abs(other) - max_atol = torch.max((abs_diff - rtol * abs_other)) + max_atol = torch.max(abs_diff - rtol * abs_other) max_rtol = torch.max((abs_diff - atol) / abs_other) - err_msg = "The maximum atol is {}, maximum rtol is {}".format(max_atol, max_rtol) - assert False, err_msg + err_msg = f"The maximum atol is {max_atol}, maximum rtol is {max_rtol}" + raise AssertionError(err_msg) def _run_model_on_device(device, model, input_list, label_input, is_eval_mode=False, run_forward_twice=False): @@ -266,7 +268,7 @@ def generate_inputs(input_list_, label_input_): loss += criterion(output2, target2) loss.backward() - for name, param in model.named_parameters(): + for _name, param in model.named_parameters(): if param.requires_grad: grad_outputs.append(param.grad) return forward_outputs, grad_outputs @@ -299,8 +301,8 @@ def run_training_test_and_compare( pt_model_label_input, run_forward_twice=False, ignore_grad_compare=False, - expected_outputs=[], - expected_grads=[], + expected_outputs=[], # noqa: B006 + expected_grads=[], # noqa: B006 ): cpu = torch.device("cpu") @@ -344,11 +346,11 @@ def run_training_test_on_device_and_compare( barrier_func, run_forward_twice=False, ignore_grad_compare=False, - expected_outputs=[], - expected_grads=[], + expected_outputs=[], # noqa: B006 + expected_grads=[], # noqa: B006 ): repeats = 16 - for i in range(repeats): + for _i in range(repeats): m = pt_model_builder_func() x = pt_model_inputs_generator() @@ -424,7 +426,7 @@ def run_evaluate_test_on_device_and_compare( run_forward_twice=False, ): repeats = 16 - for i in range(repeats): + for _i in range(repeats): m = pt_model_builder_func() x = pt_model_inputs_generator() diff --git a/orttraining/orttraining/test/python/launch_test.py b/orttraining/orttraining/test/python/launch_test.py index d183f3189511c..4c796c00863fc 100755 --- a/orttraining/orttraining/test/python/launch_test.py +++ b/orttraining/orttraining/test/python/launch_test.py @@ -2,14 +2,13 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-import os -import sys import argparse +import logging +import os # noqa: F401 +import sys from _test_commons import run_subprocess -import logging - logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger("Build") diff --git a/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py b/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py index 83bd524e7d6f3..d5298cf8e860e 100644 --- a/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py +++ b/orttraining/orttraining/test/python/onnxruntime_test_postprocess.py @@ -1,27 +1,27 @@ -import copy import os -import sys import unittest -import onnx -import pytest import torch import torch.nn as nn -import torch.nn.functional as F -from numpy.testing import assert_allclose, assert_array_equal from orttraining_test_bert_postprocess import postprocess_model from orttraining_test_data_loader import create_ort_test_dataloader from orttraining_test_transformers import BertForPreTraining, BertModelTest from orttraining_test_utils import map_optimizer_attributes import onnxruntime -from onnxruntime.capi.ort_trainer import IODescription, LossScaler, ModelDescription, ORTTrainer, generate_sample +from onnxruntime.capi.ort_trainer import ( # noqa: F401 + IODescription, + LossScaler, + ModelDescription, + ORTTrainer, + generate_sample, +) torch.manual_seed(1) onnxruntime.set_seed(1) -class Test_PostPasses(unittest.TestCase): +class Test_PostPasses(unittest.TestCase): # noqa: N801 def get_onnx_model( self, model, model_desc, inputs, device, _enable_internal_postprocess=True, _extra_postprocess=None ): @@ -47,7 +47,7 @@ def get_onnx_model( _extra_postprocess=_extra_postprocess, ) - train_output = model.train_step(*inputs) + model.train_step(*inputs) return model.onnx_model_ def count_all_nodes(self, model): @@ -78,12 +78,12 @@ def get_name(self, name): res = os.path.join(data, name) if os.path.exists(res): return res - raise FileNotFoundError("Unable to find '{0}' or '{1}' or '{2}'".format(name, rel, res)) + raise FileNotFoundError(f"Unable to find '{name}' or '{rel}' or '{res}'") def test_layer_norm(self): class LayerNormNet(nn.Module): def __init__(self, target): - super(LayerNormNet, self).__init__() + super().__init__() self.ln_1 = nn.LayerNorm(10) self.loss = nn.CrossEntropyLoss() self.target = target @@ -117,7 +117,7 @@ def forward(self, x): def test_expand(self): class ExpandNet(nn.Module): def __init__(self, target): - super(ExpandNet, self).__init__() + super().__init__() self.loss = nn.CrossEntropyLoss() self.target = target self.linear = torch.nn.Linear(2, 2) @@ -213,9 +213,7 @@ def test_bert(self): batch = b break learning_rate = torch.tensor([1.00e00]).to(device) - inputs = batch + [ - learning_rate, - ] + inputs = [*batch, learning_rate] onnx_model = self.get_onnx_model(model, model_desc, inputs, device, _extra_postprocess=postprocess_model) @@ -261,7 +259,7 @@ def postpass_replace_first_add_with_sub(model): class MultiAdd(nn.Module): def __init__(self, target): - super(MultiAdd, self).__init__() + super().__init__() self.loss = nn.CrossEntropyLoss() self.target = target self.linear = torch.nn.Linear(2, 2, bias=False) diff --git a/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py b/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py index 5f71125cff413..b4595b10c053b 100644 --- a/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py +++ 
b/orttraining/orttraining/test/python/onnxruntime_test_register_ep.py @@ -1,6 +1,7 @@ +import os import unittest + import onnxruntime_pybind11_state as C -import os class EPRegistrationTests(unittest.TestCase): @@ -17,7 +18,7 @@ def test_register_custom_eps(self): this = os.path.dirname(__file__) custom_op_model = os.path.join(this, "testdata", "custom_execution_provider_library", "test_model.onnx") if not os.path.exists(custom_op_model): - raise FileNotFoundError("Unable to find '{0}'".format(custom_op_model)) + raise FileNotFoundError(f"Unable to find '{custom_op_model}'") session_options = C.get_default_session_options() sess = C.InferenceSession(session_options, custom_op_model, True, True) diff --git a/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py b/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py index 4f778444b88f0..08b304cb0e3b2 100644 --- a/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py +++ b/orttraining/orttraining/test/python/orttraining_ortmodule_distributed_tests.py @@ -2,13 +2,12 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -import sys import argparse +import logging +import sys from _test_commons import run_subprocess -import logging - logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.DEBUG) log = logging.getLogger("ORTModuleDistributedTests") diff --git a/orttraining/orttraining/test/python/orttraining_ortmodule_tests.py b/orttraining/orttraining/test/python/orttraining_ortmodule_tests.py index 901015c09dfe3..d97b3b5e2b6a2 100644 --- a/orttraining/orttraining/test/python/orttraining_ortmodule_tests.py +++ b/orttraining/orttraining/test/python/orttraining_ortmodule_tests.py @@ -58,7 +58,7 @@ def run_ortmodule_fallback_tests(cwd, log, transformers_cache): def run_ortmodule_poc_net(cwd, log, no_cuda, data_dir): - log.debug("Running: ORTModule POCNet for MNIST with --no-cuda arg {}.".format(no_cuda)) + log.debug(f"Running: ORTModule POCNet for MNIST with --no-cuda arg {no_cuda}.") command = [sys.executable, "orttraining_test_ortmodule_poc.py"] if no_cuda: @@ -88,7 +88,7 @@ def run_ortmodule_torch_lightning(cwd, log, data_dir): def run_ortmodule_hf_bert_for_sequence_classification_from_pretrained(cwd, log, no_cuda, data_dir, transformers_cache): - log.debug("Running: ORTModule HuggingFace BERT for sequence classification with --no-cuda arg {}.".format(no_cuda)) + log.debug(f"Running: ORTModule HuggingFace BERT for sequence classification with --no-cuda arg {no_cuda}.") env = get_env_with_transformers_cache(transformers_cache) diff --git a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py index a087a97da5a54..dcb480030d41d 100644 --- a/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py +++ b/orttraining/orttraining/test/python/orttraining_run_bert_pretrain.py @@ -1,37 +1,27 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - # ================== -import os -import shutil -import logging -import random -import h5py -from tqdm import tqdm -import datetime -import numpy as np import dataclasses -from dataclasses import dataclass, field -from typing import Optional, Any, Dict -import json +import datetime import glob - +import json +import logging +import os +import random +import shutil import unittest +from 
concurrent.futures import ProcessPoolExecutor +from dataclasses import dataclass, field +from typing import Any, Dict, Optional +import h5py +import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, Dataset import torch.distributed as dist +from torch.utils.data import DataLoader, Dataset, RandomSampler from torch.utils.tensorboard import SummaryWriter - -from transformers import BertForPreTraining, BertConfig, HfArgumentParser - -from concurrent.futures import ProcessPoolExecutor +from tqdm import tqdm +from transformers import BertConfig, BertForPreTraining, HfArgumentParser import onnxruntime as ort -from onnxruntime.training import amp, optim, orttrainer -from onnxruntime.training.optim import PolyWarmupLRScheduler, LinearWarmupLRScheduler -from onnxruntime.training.checkpoint import aggregate_checkpoints # need to override torch.onnx.symbolic_opset12.nll_loss to handle ignore_index == -100 cases. # the fix for ignore_index == -100 cases is already in pytorch master. @@ -39,6 +29,9 @@ # eventually we will use pytorch with fixed nll_loss once computation # issues are understood and solved. import onnxruntime.capi.pt_patch +from onnxruntime.training import amp, optim, orttrainer +from onnxruntime.training.checkpoint import aggregate_checkpoints +from onnxruntime.training.optim import LinearWarmupLRScheduler, PolyWarmupLRScheduler # noqa: F401 # we cannot make full convergence run in nightly pipeling because of its timeout limit, # max_steps is still needed to calculate learning rate. force_to_stop_max_steps is used to @@ -109,7 +102,6 @@ def bert_model_description(config): def create_pretraining_dataset(input_file, max_pred_length, args): - train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( @@ -118,7 +110,7 @@ def create_pretraining_dataset(input_file, max_pred_length, args): return train_dataloader, input_file -class pretraining_dataset(Dataset): +class pretraining_dataset(Dataset): # noqa: N801 def __init__(self, input_file, max_pred_length): logger.info("pretraining_dataset: %s, max_pred_length: %d", input_file, max_pred_length) self.input_file = input_file @@ -158,11 +150,10 @@ def __getitem__(self, index): return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels] -import argparse +import argparse # noqa: E402 def parse_arguments(): - parser = argparse.ArgumentParser() # batch size test config parameters @@ -340,7 +331,7 @@ def to_json_string(self): def to_sanitized_dict(self) -> Dict[str, Any]: """ - Sanitized serialization to use with TensorBoard’s hparams + Sanitized serialization to use with TensorBoard`s hparams """ d = dataclasses.asdict(self) valid_types = [bool, int, float, str, torch.Tensor] @@ -348,7 +339,6 @@ def to_sanitized_dict(self) -> Dict[str, Any]: def setup_training(args): - assert torch.cuda.is_available() if args.local_rank == -1: @@ -362,7 +352,7 @@ def setup_training(args): if args.gradient_accumulation_steps < 1: raise ValueError( - "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps) + f"Invalid gradient_accumulation_steps parameter: {args.gradient_accumulation_steps}, should be >= 1" ) if args.train_batch_size % args.gradient_accumulation_steps != 0: raise ValueError( @@ -384,7 +374,7 @@ def setup_torch_distributed(world_rank, world_size): os.environ["RANK"] = str(world_rank) os.environ["WORLD_SIZE"] = str(world_size) 
os.environ["MASTER_ADDR"] = "localhost" - os.environ["MASTER_PORT"] = str("12345") + os.environ["MASTER_PORT"] = "12345" torch.distributed.init_process_group(backend="nccl", world_size=world_size, rank=world_rank) return @@ -493,7 +483,6 @@ def do_pretrain(args): logger.info("Running training: Batch size = %d, initial LR = %f", args.train_batch_size, args.learning_rate) - most_recent_ckpts_paths = [] average_loss = 0.0 epoch = 0 training_steps = 0 @@ -524,7 +513,7 @@ def do_pretrain(args): ) train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process(args) else train_dataloader - for step, batch in enumerate(train_iter): + for _step, batch in enumerate(train_iter): training_steps += 1 batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch @@ -547,7 +536,7 @@ def do_pretrain(args): # tb_writer.add_scalar('train/summary/scalar/all_fp16_gradients_finite_859', all_finite, global_step) tb_writer.add_scalar("train/summary/total_loss", average_loss / divisor, global_step) - print("Step:{} Average Loss = {}".format(global_step, average_loss / divisor)) + print(f"Step:{global_step} Average Loss = {average_loss / divisor}") if global_step >= args.max_steps or global_step >= force_to_stop_max_steps: if tb_writer: @@ -555,9 +544,7 @@ def do_pretrain(args): if global_step >= args.max_steps: if args.save_checkpoint: - model.save_checkpoint( - os.path.join(args.output_dir, "checkpoint-{}.ortcp".format(args.world_rank)) - ) + model.save_checkpoint(os.path.join(args.output_dir, f"checkpoint-{args.world_rank}.ortcp")) final_loss = average_loss / (args.log_freq * args.gradient_accumulation_steps) return final_loss @@ -696,7 +683,7 @@ def test_pretrain_zero(self): deepspeed_zero_stage=self.deepspeed_zero_stage, save_checkpoint=True, ) - train_loss = do_pretrain(args) + do_pretrain(args) # ensure all workers reach this point before loading the checkpointed state torch.distributed.barrier() @@ -733,12 +720,9 @@ def test_pretrain_zero(self): # calling unpublished get_mpi_context_xxx to get rank/size numbers. try: # In case ORT is not built with MPI/NCCL, there are no get_mpi_context_xxx internal apis. 
- from onnxruntime.capi._pybind_state import ( - get_mpi_context_local_rank, - get_mpi_context_local_size, - get_mpi_context_world_rank, - get_mpi_context_world_size, - ) + from onnxruntime.capi._pybind_state import get_mpi_context_local_size # noqa: F401 + from onnxruntime.capi._pybind_state import get_mpi_context_world_rank # noqa: F401 + from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_world_size has_get_mpi_context_internal_api = True except ImportError: diff --git a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py index db03a636d046e..e96b90138c3d5 100644 --- a/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py +++ b/orttraining/orttraining/test/python/orttraining_run_frontend_batch_size_test.py @@ -1,6 +1,6 @@ -import sys import collections import subprocess +import sys Config = collections.namedtuple( "Config", diff --git a/orttraining/orttraining/test/python/orttraining_run_glue.py b/orttraining/orttraining/test/python/orttraining_run_glue.py index a9b514599fb78..794e2f8cc7240 100644 --- a/orttraining/orttraining/test/python/orttraining_run_glue.py +++ b/orttraining/orttraining/test/python/orttraining_run_glue.py @@ -1,14 +1,14 @@ # adapted from run_glue.py of huggingface transformers -import dataclasses +import dataclasses # noqa: F401 import logging import os +import unittest from dataclasses import dataclass, field from typing import Dict, Optional -import unittest + import numpy as np from numpy.testing import assert_allclose - from transformers import ( AutoConfig, AutoModelForSequenceClassification, @@ -24,15 +24,12 @@ ) import onnxruntime -from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription +from onnxruntime.capi.ort_trainer import IODescription, LossScaler, ModelDescription, ORTTrainer # noqa: F401 try: - from onnxruntime.capi._pybind_state import ( - get_mpi_context_local_rank, - get_mpi_context_local_size, - get_mpi_context_world_rank, - get_mpi_context_world_size, - ) + from onnxruntime.capi._pybind_state import get_mpi_context_local_size # noqa: F401 + from onnxruntime.capi._pybind_state import get_mpi_context_world_rank # noqa: F401 + from onnxruntime.capi._pybind_state import get_mpi_context_local_rank, get_mpi_context_world_size has_get_mpi_context_internal_api = True except ImportError: @@ -40,17 +37,16 @@ pass +import torch # noqa: F401 from orttraining_transformer_trainer import ORTTransformerTrainer -import torch - logger = logging.getLogger(__name__) def verify_old_and_new_api_are_equal(results_per_api): new_api_results = results_per_api[True] old_api_results = results_per_api[False] - for key in new_api_results.keys(): + for key in new_api_results: assert_allclose(new_api_results[key], old_api_results[key]) @@ -183,7 +179,7 @@ def model_to_desc(self, model_name, model): "outputs": [("loss", [], True), ("logits", ["batch", 2])], } else: - raise RuntimeError("unsupported base model name {}.".format(model_name)) + raise RuntimeError(f"unsupported base model name {model_name}.") return model_desc @@ -230,7 +226,7 @@ def run_glue(self, model_name, task_name, fp16): num_labels = glue_tasks_num_labels[data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: - raise ValueError("Task not found: %s" % (data_args.task_name)) + raise ValueError("Task not found: %s" % (data_args.task_name)) # noqa: B904 config = 
AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, @@ -285,7 +281,7 @@ def compute_metrics(p: EvalPrediction) -> Dict: result = trainer.evaluate() - logger.info("***** Eval results {} *****".format(data_args.task_name)) + logger.info(f"***** Eval results {data_args.task_name} *****") for key, value in result.items(): logger.info(" %s = %s", key, value) diff --git a/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py b/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py index a4c069c683e1c..92db204593bcd 100644 --- a/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py +++ b/orttraining/orttraining/test/python/orttraining_run_multiple_choice.py @@ -1,35 +1,32 @@ # adapted from run_multiple_choice.py of huggingface transformers # https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/run_multiple_choice.py -import dataclasses +import dataclasses # noqa: F401 import logging import os +import unittest from dataclasses import dataclass, field from typing import Dict, Optional -import unittest -import numpy as np -from numpy.testing import assert_allclose +import numpy as np +import torch # noqa: F401 +from numpy.testing import assert_allclose # noqa: F401 +from orttraining_run_glue import verify_old_and_new_api_are_equal # noqa: F401 +from orttraining_transformer_trainer import ORTTransformerTrainer +from transformers import HfArgumentParser # noqa: F401 +from transformers import Trainer # noqa: F401 from transformers import ( AutoConfig, AutoModelForMultipleChoice, AutoTokenizer, EvalPrediction, - HfArgumentParser, - Trainer, TrainingArguments, set_seed, ) +from utils_multiple_choice import MultipleChoiceDataset, Split, SwagProcessor import onnxruntime -from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription - -from orttraining_transformer_trainer import ORTTransformerTrainer - -import torch - -from utils_multiple_choice import MultipleChoiceDataset, Split, SwagProcessor -from orttraining_run_glue import verify_old_and_new_api_are_equal +from onnxruntime.capi.ort_trainer import IODescription, LossScaler, ModelDescription, ORTTrainer # noqa: F401 logger = logging.getLogger(__name__) @@ -155,7 +152,7 @@ def run_multiple_choice(self, model_name, task_name, fp16): label_list = processor.get_labels() num_labels = len(label_list) except KeyError: - raise ValueError("Task not found: %s" % (data_args.task_name)) + raise ValueError("Task not found: %s" % (data_args.task_name)) # noqa: B904 config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, @@ -271,7 +268,7 @@ def compute_metrics(p: EvalPrediction) -> Dict: result = trainer.evaluate() - logger.info("***** Eval results {} *****".format(data_args.task_name)) + logger.info(f"***** Eval results {data_args.task_name} *****") for key, value in result.items(): logger.info(" %s = %s", key, value) diff --git a/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py b/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py index 66de14dce6852..71e6bb8e4d2f2 100644 --- a/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py +++ b/orttraining/orttraining/test/python/orttraining_test_bert_postprocess.py @@ -1,5 +1,5 @@ -from orttraining_test_model_transform import add_name, fix_transpose, add_expand_shape -from orttraining_test_layer_norm_transform import 
layer_norm_transform +from orttraining_test_layer_norm_transform import layer_norm_transform # noqa: F401 +from orttraining_test_model_transform import add_expand_shape, add_name, fix_transpose # noqa: F401 def postprocess_model(model): diff --git a/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py b/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py index 2ef8322bd9cfd..77ba7c41c1268 100644 --- a/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py +++ b/orttraining/orttraining/test/python/orttraining_test_checkpoint_storage.py @@ -2,12 +2,13 @@ # Licensed under the MIT License. # orttraining_test_checkpoint_storage.py -import pytest -import torch -import numpy as np import os -import shutil import pickle +import shutil + +import numpy as np +import pytest +import torch from onnxruntime.training import _checkpoint_storage @@ -17,10 +18,7 @@ def _equals(a, b): """Checks recursively if two dictionaries are equal""" if isinstance(a, dict): - for key in a: - if key not in b or not _equals(a[key], b[key]): - return False - return True + return all(not (key not in b or not _equals(a[key], b[key])) for key in a) else: if isinstance(a, bytes): a = a.decode() @@ -42,12 +40,9 @@ def _numpy_types(obj_value): False if any other type """ if not isinstance(obj_value, dict): - return type(obj_value).__module__ == np.__name__ or isinstance(obj_value, str) or isinstance(obj_value, bytes) + return isinstance(obj_value, (str, bytes)) or type(obj_value).__module__ == np.__name__ - for _, value in obj_value.items(): - if not _numpy_types(value): - return False - return True + return all(_numpy_types(value) for _, value in obj_value.items()) def _get_dict(separated_key): @@ -98,7 +93,7 @@ def _get_dict(separated_key): return test_dict, {"key": key} if len(separated_key) > 0 else dict(), expected_val -class _CustomClass(object): +class _CustomClass: """Custom object that encpsulates dummy values for loss, epoch and train_step""" def __init__(self): @@ -252,7 +247,7 @@ def test_checkpoint_storage_for_custom_user_dict_succeeds(checkpoint_storage_tes loaded_dict = _checkpoint_storage.load(pytest.checkpoint_path) assert (loaded_dict["a"] == to_save["a"].numpy()).all() - try: + try: # noqa: SIM105 loaded_dict["user_dict"] = loaded_dict["user_dict"].decode() except AttributeError: pass diff --git a/orttraining/orttraining/test/python/orttraining_test_data_loader.py b/orttraining/orttraining/test/python/orttraining_test_data_loader.py index 2df5a3964bc94..aa15b44ae0d66 100644 --- a/orttraining/orttraining/test/python/orttraining_test_data_loader.py +++ b/orttraining/orttraining/test/python/orttraining_test_data_loader.py @@ -1,7 +1,9 @@ -from enum import Enum import random +from enum import Enum + import torch -from torch.utils.data import Dataset, DataLoader +from torch.utils.data import DataLoader, Dataset + from onnxruntime.capi.ort_trainer import generate_sample global_rng = random.Random() @@ -48,9 +50,7 @@ def __init__(self, input_desc, seq_len, dataset_len, device): shape_ = [] for i, axis in enumerate(input_desc.shape_): if axis == "max_seq_len_in_batch": - shape_ = shape_ + [ - seq_len, - ] + shape_ = [*shape_, seq_len] elif axis != "batch": shape_ = input_desc.shape_[i] input_desc.shape_ = shape_ @@ -85,7 +85,7 @@ def split_batch(batch, input_desc, args_count): args = [] # (input_ids[batch, seglen], attention_mask[batch, seglen]) kwargs = {} # {'token_type_ids': token_type_ids[batch,seglen], 'position_ids': token_type_ids[batch, seglen]} 
for i in range(args_count): - args = args + [batch[i]] + args = [*args, batch[i]] for i in range(args_count, total_argument_count): kwargs[input_desc[i].name_] = batch[i] diff --git a/orttraining/orttraining/test/python/orttraining_test_debuggability.py b/orttraining/orttraining/test/python/orttraining_test_debuggability.py index d3d6987f47c2a..44c06287ffa88 100644 --- a/orttraining/orttraining/test/python/orttraining_test_debuggability.py +++ b/orttraining/orttraining/test/python/orttraining_test_debuggability.py @@ -1,33 +1,10 @@ -import inspect -import onnx -import os import pytest import torch -import torchvision - -from numpy.testing import assert_allclose - -from onnxruntime import set_seed -from onnxruntime.capi.ort_trainer import ( - IODescription as Legacy_IODescription, - ModelDescription as Legacy_ModelDescription, - LossScaler as Legacy_LossScaler, - ORTTrainer as Legacy_ORTTrainer, -) -from onnxruntime.training import ( - _utils, - amp, - optim, - orttrainer, - TrainStepInfo, - model_desc_validation as md_val, - orttrainer_options as orttrainer_options, -) - from _test_commons import _load_pytorch_transformer_model -import _test_helpers - +from onnxruntime import set_seed +from onnxruntime.training import optim, orttrainer +from onnxruntime.training import orttrainer_options as orttrainer_options ############################################################################### # Testing starts here ######################################################### diff --git a/orttraining/orttraining/test/python/orttraining_test_dort.py b/orttraining/orttraining/test/python/orttraining_test_dort.py index 3c2166a735e8f..ae6d1ac3c46f4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_dort.py +++ b/orttraining/orttraining/test/python/orttraining_test_dort.py @@ -7,7 +7,7 @@ from torch import nn from torch.nn import functional as F -from onnxruntime.training.torchdynamo.register_backend import ort, aot_ort +from onnxruntime.training.torchdynamo.register_backend import aot_ort, ort class TestTorchDynamoOrt(unittest.TestCase): diff --git a/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py b/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py index c67de052753ad..dd26448f0c596 100644 --- a/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py +++ b/orttraining/orttraining/test/python/orttraining_test_experimental_gradient_graph.py @@ -4,8 +4,9 @@ import numpy as np import onnx -import onnxruntime import torch + +import onnxruntime from onnxruntime.training.experimental import export_gradient_graph @@ -15,7 +16,7 @@ class NeuralNet(torch.nn.Module): """ def __init__(self, input_size: int, embedding_size: int, hidden_size: int, num_classes: int): - super(NeuralNet, self).__init__() + super().__init__() self.frozen_layer = torch.nn.Linear(input_size, embedding_size, bias=False) # Freeze a layer (mainly to test that gradients don't get output for it). 
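The hunks above and below apply the same ruff-driven cleanups across these training tests: Python 2-era super(Class, self).__init__() calls are collapsed to the zero-argument super().__init__(), and percent / str.format() interpolation is rewritten as f-strings. A minimal, self-contained sketch of the before/after idiom follows; TinyNet, describe, and the tensor shapes are made up for illustration only and are not taken from the test suite or this patch.

    import torch

    class TinyNet(torch.nn.Module):
        def __init__(self, input_size: int, hidden_size: int):
            # Zero-argument form; the patch replaces the older
            # super(TinyNet, self).__init__() spelling throughout the tests.
            super().__init__()
            self.fc1 = torch.nn.Linear(input_size, hidden_size)

        def forward(self, x):
            return self.fc1(x)

    def describe(step: int, loss: float) -> str:
        # f-string form; the patch rewrites "Step:{} ...".format(step, loss)
        # and "%s" % (...) style formatting into this spelling.
        return f"Step:{step} Average Loss = {loss}"

    if __name__ == "__main__":
        model = TinyNet(4, 2)
        out = model(torch.randn(3, 4))
        print(describe(1, float(out.sum())))
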
diff --git a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py index 42daff79bd7d1..8afbafccb8241 100644 --- a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py @@ -8,14 +8,14 @@ import torch.nn.functional as F from torch.utils.checkpoint import checkpoint -from onnxruntime.training.ortmodule import ORTModule +from onnxruntime.training.ortmodule import ORTModule # noqa: F401 from onnxruntime.training.ortmodule.experimental.hierarchical_ortmodule import HierarchicalORTModule class A(nn.Module): # A supported module. def __init__(self): - super(A, self).__init__() + super().__init__() self.l1 = nn.Linear(2, 2) def forward(self, x): @@ -27,7 +27,7 @@ class B(nn.Module): # uses gradient-checkpointing. However, its two sub-module's # are exportable, so ORTModule should be used to compute them. def __init__(self): - super(B, self).__init__() + super().__init__() self.l1 = nn.Linear(2, 2) self.a = A() @@ -45,7 +45,7 @@ def custom_forward(x_): class C(nn.Module): # A supported module. def __init__(self): - super(C, self).__init__() + super().__init__() self.l1 = nn.Linear(2, 2) def forward(self, x): @@ -57,7 +57,7 @@ class D(nn.Module): # This module is not exportable to ONNX because it # inner module self.b uses gradient-checkpointing. def __init__(self): - super(D, self).__init__() + super().__init__() self.b = B() def forward(self, x): @@ -68,7 +68,7 @@ def forward(self, x): class Main(nn.Module): # Main module. def __init__(self): - super(Main, self).__init__() + super().__init__() self.alpha = nn.Parameter(torch.tensor(0.941736), requires_grad=True) self.a = A() self.b = B() @@ -83,7 +83,7 @@ def forward(self, x): class MainWithNonTensorInput(nn.Module): # Module for testing non-tensor input. def __init__(self): - super(MainWithNonTensorInput, self).__init__() + super().__init__() self.alpha = nn.Parameter(torch.tensor(0.941736), requires_grad=True) self.a = A() self.b = B() @@ -101,7 +101,7 @@ def forward(self, x, case): class E(nn.Module): # Sub-modules are stored in nn.ModuleList. def __init__(self): - super(E, self).__init__() + super().__init__() self.my_layers = nn.ModuleList([A(), B(), C(), D()]) def forward(self, x): @@ -114,7 +114,7 @@ def forward(self, x): class MainWithModuleList(nn.Module): # Sub-modules are stored in nn.ModuleList. def __init__(self): - super(MainWithModuleList, self).__init__() + super().__init__() self.my_layers = nn.ModuleList([E(), E()]) def forward(self, x): @@ -128,7 +128,7 @@ class MainWithMultiModuleOutputs(nn.Module): # Module with repeated sub-modules and producing # multiple outputs. def __init__(self): - super(MainWithMultiModuleOutputs, self).__init__() + super().__init__() self.layer_list1 = nn.ModuleList([D(), A(), B()]) self.layer_list2 = nn.ModuleList([C(), B(), D()]) @@ -144,7 +144,7 @@ def forward(self, x): class G(nn.Module): def __init__(self): - super(G, self).__init__() + super().__init__() self.l1 = nn.Linear(2, 2) def forward(self, x): @@ -161,7 +161,7 @@ def forward_fp16(self, x): class MainWithModuleMultipleCalls(nn.Module): # Module with mixed precision. def __init__(self): - super(MainWithModuleMultipleCalls, self).__init__() + super().__init__() self.b = B() self.g = G() @@ -174,7 +174,7 @@ def forward(self, x): class MainWithNonForwardCall(nn.Module): # Module with mixed precision. 
def __init__(self): - super(MainWithNonForwardCall, self).__init__() + super().__init__() self.b = B() self.g = G() diff --git a/orttraining/orttraining/test/python/orttraining_test_hooks.py b/orttraining/orttraining/test/python/orttraining_test_hooks.py index 4a889d07d662b..80f29ad88105f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_hooks.py +++ b/orttraining/orttraining/test/python/orttraining_test_hooks.py @@ -3,11 +3,12 @@ import os import tempfile + import pytest import torch from onnxruntime.training.ortmodule import ORTModule -from onnxruntime.training.utils.hooks import SubscriberManager, StatisticsSubscriber +from onnxruntime.training.utils.hooks import StatisticsSubscriber, SubscriberManager class NeuralNetSingleOutput(torch.nn.Module): diff --git a/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py b/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py index 241a963e28498..35d59c1750de4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_layer_norm_transform.py @@ -14,10 +14,10 @@ def find_node(graph_proto, op_type): def gen_attribute(key, value): - attr = AttributeProto() + attr = AttributeProto() # noqa: F821 attr.name = key attr.ints.extend(int(v) for v in value) - attr.type = AttributeProto.INTS + attr.type = AttributeProto.INTS # noqa: F821 return attr @@ -48,21 +48,21 @@ def layer_norm_transform(model_proto): graph_proto = model_proto.graph - _, map_input_Div = find_node(graph_proto, "Div") + _, map_input_Div = find_node(graph_proto, "Div") # noqa: N806 - _, map_input_Sqrt = find_node(graph_proto, "Sqrt") + _, map_input_Sqrt = find_node(graph_proto, "Sqrt") # noqa: N806 - _, map_input_Add = find_node(graph_proto, "Add") + _, map_input_Add = find_node(graph_proto, "Add") # noqa: N806 - nodes_ReduceMean, map_input_ReduceMean = find_node(graph_proto, "ReduceMean") + nodes_ReduceMean, map_input_ReduceMean = find_node(graph_proto, "ReduceMean") # noqa: N806 - _, map_input_Pow = find_node(graph_proto, "Pow") + _, map_input_Pow = find_node(graph_proto, "Pow") # noqa: N806 - _, map_input_Mul = find_node(graph_proto, "Mul") + _, map_input_Mul = find_node(graph_proto, "Mul") # noqa: N806 # find right side Sub (see the layer norm subgrapg) - nodes_Sub = [] - map_input_Sub = {} + nodes_Sub = [] # noqa: N806 + map_input_Sub = {} # noqa: N806 for node in graph_proto.node: if node.op_type == "Sub": if node.output[0] in map_input_Pow: @@ -70,16 +70,16 @@ def layer_norm_transform(model_proto): map_input_Sub[node.input[1]] = node # find first ReduceMean - first_ReduceMean = [] - first_ReduceMean_outputs = [] + first_ReduceMean = [] # noqa: N806 + first_ReduceMean_outputs = [] # noqa: N806 for node in nodes_ReduceMean: if node.output[0] in map_input_Sub: first_ReduceMean.append(node) first_ReduceMean_outputs.append(node.output[0]) # find constant node - nodes_Constant = [] - map_output_Constant = {} + nodes_Constant = [] # noqa: N806 + map_output_Constant = {} # noqa: N806 for node in graph_proto.node: if node.op_type == "Constant": nodes_Constant.append(node) @@ -111,19 +111,19 @@ def layer_norm_transform(model_proto): if node_reduce.output[0] not in map_input_Add: continue - node_Add = map_input_Add[node_reduce.output[0]] + node_Add = map_input_Add[node_reduce.output[0]] # noqa: N806 if node_Add.output[0] not in map_input_Sqrt: continue - node_Sqrt = map_input_Sqrt[node_Add.output[0]] + node_Sqrt = 
map_input_Sqrt[node_Add.output[0]] # noqa: N806 if node_Sqrt.output[0] not in map_input_Div: continue - node_Div = map_input_Div[node_Sqrt.output[0]] + node_Div = map_input_Div[node_Sqrt.output[0]] # noqa: N806 if node_Div.output[0] not in map_input_Mul: continue - node_Mul = map_input_Mul[node_Div.output[0]] + node_Mul = map_input_Mul[node_Div.output[0]] # noqa: N806 if node_Mul.input[0] != node_Div.output[0]: layer_norm_input.append(node_Mul.input[0]) @@ -133,7 +133,7 @@ def layer_norm_transform(model_proto): if node_Mul.output[0] not in map_input_Add: continue - node_Add1 = map_input_Add[node_Mul.output[0]] + node_Add1 = map_input_Add[node_Mul.output[0]] # noqa: N806 layer_norm_input.append(node_Add1.input[1]) removed_nodes.append(node) diff --git a/orttraining/orttraining/test/python/orttraining_test_lort.py b/orttraining/orttraining/test/python/orttraining_test_lort.py index 8202ceef7445c..ccd06e1a3ab62 100644 --- a/orttraining/orttraining/test/python/orttraining_test_lort.py +++ b/orttraining/orttraining/test/python/orttraining_test_lort.py @@ -14,8 +14,8 @@ init_ts_backend() # Handle ORT dependencies. -import onnxruntime as ort -from onnxruntime.capi import _pybind_state as C +import onnxruntime as ort # noqa: E402 +from onnxruntime.capi import _pybind_state as C # noqa: E402 # Set up ORT as torch.jit's sub-executor. C.register_ort_as_torch_jit_executor() diff --git a/orttraining/orttraining/test/python/orttraining_test_model_transform.py b/orttraining/orttraining/test/python/orttraining_test_model_transform.py index d6984dcf08425..af287d163a97a 100644 --- a/orttraining/orttraining/test/python/orttraining_test_model_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_model_transform.py @@ -107,7 +107,7 @@ def add_expand_shape(model): expand_node = [n for n in model.graph.node if n.op_type == "Expand"] if len(expand_node) != 1: - raise "cannot find the single expand node in the BERT model." + raise "cannot find the single expand node in the BERT model." # noqa: B016 return expand_out = model.graph.value_info.add() expand_out.name = expand_node[0].output[0] # base: '421' # tiny: '85' diff --git a/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py index 4c84d7ffb601c..4f0925c5c855b 100644 --- a/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_onnx_ops_ortmodule.py @@ -16,9 +16,9 @@ def assert_values_are_close(self, tensor, other, rtol=1e-05, atol=1e-06): if not are_close: abs_diff = torch.abs(tensor - other) abs_other = torch.abs(other) - max_atol = torch.max((abs_diff - rtol * abs_other)) + max_atol = torch.max(abs_diff - rtol * abs_other) max_rtol = torch.max((abs_diff - atol) / abs_other) - raise AssertionError("The maximum atol is %r, maximum rtol is %r." 
% (max_atol, max_rtol)) + raise AssertionError(f"The maximum atol is {max_atol!r}, maximum rtol is {max_rtol!r}.") def assert_gradients_match_and_reset_gradient( self, ort_model, pt_model, none_pt_params=None, reset_gradient=True, rtol=1e-05, atol=1e-06 @@ -49,9 +49,9 @@ def gradient_correctness(self, name, device, debug=False): pt_model_cls, op_grad_type, kwargs = self.get_torch_model_name(name, device) if kwargs is None: kwargs = {} - N = 32 + N = 32 # noqa: N806 pt_model = pt_model_cls().to(device) - D_in = pt_model.fc1.in_features + D_in = pt_model.fc1.in_features # noqa: N806 ort_model = ORTModule(copy.deepcopy(pt_model)) def run_step(model, x): @@ -84,7 +84,7 @@ def run_step(model, x): if isinstance(op_grad_type, tuple): text = str(onnx_graph_train) if all(map(lambda op: ('op_type: "%s"' % op) not in text, op_grad_type)): - raise AssertionError("Operator %s not found in %s." % (" or ".join(op_grad_type), text)) + raise AssertionError("Operator {} not found in {}.".format(" or ".join(op_grad_type), text)) else: self.assertIn('op_type: "%s"' % op_grad_type, str(onnx_graph_train)) diff --git a/orttraining/orttraining/test/python/orttraining_test_onnxblock.py b/orttraining/orttraining/test/python/orttraining_test_onnxblock.py index 2dafe7e8f1338..8878b820abbcf 100644 --- a/orttraining/orttraining/test/python/orttraining_test_onnxblock.py +++ b/orttraining/orttraining/test/python/orttraining_test_onnxblock.py @@ -18,7 +18,7 @@ class SimpleNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(SimpleNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -36,7 +36,7 @@ def forward(self, model_input): class SimpleModelWithMSELoss(onnxblock.Model): def __init__(self): - super(SimpleModelWithMSELoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.MSELoss() def build(self, output_name): @@ -45,7 +45,7 @@ def build(self, output_name): class SimpleModelWithCrossEntropyLoss(onnxblock.Model): def __init__(self): - super(SimpleModelWithCrossEntropyLoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.CrossEntropyLoss() def build(self, output_name): @@ -54,7 +54,7 @@ def build(self, output_name): class SimpleTrainingModelWithMSELoss(onnxblock.TrainingModel): def __init__(self): - super(SimpleTrainingModelWithMSELoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.MSELoss() def build(self, output_name): @@ -63,7 +63,7 @@ def build(self, output_name): class SimpleTrainingModelWithCrossEntropyLoss(onnxblock.TrainingModel): def __init__(self): - super(SimpleTrainingModelWithCrossEntropyLoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.CrossEntropyLoss() def build(self, output_name): @@ -72,7 +72,7 @@ def build(self, output_name): class SimpleModelWithBCEWithLogitsLoss(onnxblock.Model): def __init__(self): - super(SimpleModelWithBCEWithLogitsLoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.BCEWithLogitsLoss() def build(self, output_name): @@ -81,7 +81,7 @@ def build(self, output_name): class SimpleTrainingModelWithBCEWithLogitsLoss(onnxblock.TrainingModel): def __init__(self): - super(SimpleTrainingModelWithBCEWithLogitsLoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.BCEWithLogitsLoss() def build(self, output_name): @@ -642,7 +642,7 @@ def test_weighted_average_model_composition(model_type): # Given class TwoOutputNet(torch.nn.Module): def __init__(self, input_size, hidden_size, 
num_classes): - super(TwoOutputNet, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -659,7 +659,7 @@ def forward(self, model_input1, model_input2): class WeightedAvg(model_type): def __init__(self, w1, w2): - super(WeightedAvg, self).__init__() + super().__init__() self.loss1 = onnxblock.loss.CrossEntropyLoss() self.loss2 = onnxblock.loss.CrossEntropyLoss() diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index b291056f0efdb..fcf9203e0a12d 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -48,7 +48,7 @@ class NeuralNetSinglePositionalArgument(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetSinglePositionalArgument, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -63,7 +63,7 @@ def forward(self, input1): class NeuralNetMultiplePositionalArgumentsMultiOutputsWithoutDependency(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetMultiplePositionalArgumentsMultiOutputsWithoutDependency, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.fc2 = torch.nn.Linear(input_size, hidden_size) @@ -85,7 +85,7 @@ def forward(self, input1, input2): class NeuralNetMultiplePositionalArgumentsMultiOutputsWithDependency(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetMultiplePositionalArgumentsMultiOutputsWithDependency, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.softmax = torch.nn.Softmax(dim=1) @@ -104,7 +104,7 @@ def forward(self, input1, input2): class NeuralNetMultiplePositionalArguments(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetMultiplePositionalArguments, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -120,7 +120,7 @@ def forward(self, input1, input2): class NeuralNetMultiplePositionalArgumentsVarKeyword(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetMultiplePositionalArgumentsVarKeyword, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -136,7 +136,7 @@ def forward(self, input1, input2, **kwargs): class NeuralNetPositionalArguments(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetPositionalArguments, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -152,7 +152,7 @@ def forward(self, *model_inputs): class NeuralNetKeywordArguments(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetKeywordArguments, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -168,7 +168,7 @@ def forward(self, x=None, y=None, z=None): class NeuralNetPositionalAndKeywordArguments(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetPositionalAndKeywordArguments, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) 
self.relu = torch.nn.ReLU() @@ -184,7 +184,7 @@ def forward(self, model_input, x=None, y=None, z=None): class NeuralNetSimplePositionalAndKeywordArguments(torch.nn.Module): def __init__(self): - super(NeuralNetSimplePositionalAndKeywordArguments, self).__init__() + super().__init__() self.a = torch.nn.Parameter(torch.FloatTensor([-1.0, 1.0])) def forward(self, x, y=None, z=None): @@ -197,7 +197,7 @@ def forward(self, x, y=None, z=None): class NeuralNetNonDifferentiableOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetNonDifferentiableOutput, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() self.fc2 = torch.nn.Linear(hidden_size, num_classes) @@ -218,7 +218,7 @@ def forward(self, input1): class NeuralNetChainedLayersWithNonDifferentiableOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetChainedLayersWithNonDifferentiableOutput, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() self.fc2 = torch.nn.Linear(hidden_size, num_classes) @@ -235,7 +235,7 @@ def forward(self, input1, mask1): class NeuralNetPartialNoGradModel(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetPartialNoGradModel, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size).requires_grad_(False) self.relu = torch.nn.ReLU() @@ -249,7 +249,7 @@ def forward(self, model_input): class UnusedEndParameterNet(torch.nn.Module): def __init__(self, input_size, hidden_size1, hidden_size2, num_classes): - super(UnusedEndParameterNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size1) self.relu = torch.nn.ReLU() @@ -267,7 +267,7 @@ def forward(self, input1): class UnusedBeginParameterNet(torch.nn.Module): def __init__(self, input_size, hidden_size1, hidden_size2, num_classes): - super(UnusedBeginParameterNet, self).__init__() + super().__init__() # fc1 is an unused initializer (which is in the begining of initializer list) # which will be dropped after export @@ -285,7 +285,7 @@ def forward(self, input1): class UnusedMiddleParameterNet(torch.nn.Module): def __init__(self, input_size, hidden_size1, hidden_size2, num_classes): - super(UnusedMiddleParameterNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size1) self.relu = torch.nn.ReLU() @@ -305,21 +305,21 @@ def forward(self, input1): class StatelessModel(torch.nn.Module): def __init__(self): - super(StatelessModel, self).__init__() + super().__init__() def forward(self, x): return x class NeuralNetCustomClassOutput(torch.nn.Module): - class CustomClass(object): + class CustomClass: def __init__(self, out1, out2, out3): self.out1 = out1 self.out2 = out2 self.out3 = out3 def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetCustomClassOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -349,7 +349,7 @@ def forward(self, x, my_str): class SerializationNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(SerializationNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -451,7 +451,7 @@ def _get_bert_for_sequence_classification_sample_data_with_random_shapes(device) def 
test_forward_call_single_positional_argument(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(model) # Check that the original forward signature is preserved. @@ -467,7 +467,7 @@ def test_forward_call_single_positional_argument(): def test_forward_call_multiple_positional_arguments(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetMultiplePositionalArguments(input_size=D_in, hidden_size=H, num_classes=D_out).to(device) ort_model = ORTModule(model) # Check that the original forward signature is preserved. @@ -485,7 +485,7 @@ def test_forward_call_multiple_positional_arguments(): def test_forward_call_positional_arguments(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetPositionalArguments(input_size=D_in, hidden_size=H, num_classes=D_out).to(device) model = ORTModule(model) args = [ @@ -504,7 +504,7 @@ def test_forward_call_positional_arguments(): def test_forward_call_keyword_arguments(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetKeywordArguments(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -521,7 +521,7 @@ def test_forward_call_keyword_arguments(): def test_forward_call_positional_and_keyword_arguments(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetPositionalAndKeywordArguments(D_in, H, D_out).to(device) model = ORTModule(model) a = torch.randn(N, D_in, device=device) @@ -579,7 +579,7 @@ def test_torch_nn_module_cuda_method(): original_device = "cpu" to_device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out) model = ORTModule(model) for _, parameter_value in model.named_parameters(): @@ -598,7 +598,7 @@ def test_torch_nn_module_cpu_method(set_gpu_on_original_module): original_device = "cuda" to_device = "cpu" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 if set_gpu_on_original_module: model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(original_device) model = ORTModule(model) @@ -618,7 +618,7 @@ def test_torch_nn_module_cpu_method(set_gpu_on_original_module): @pytest.mark.parametrize("original_device", ["cpu", "cuda"]) @pytest.mark.parametrize("to_argument", ["cpu", "cuda", "cuda:0", torch.device("cpu"), torch.device("cuda")]) def test_torch_nn_module_to_api(original_device, to_argument): - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(original_device) model = ORTModule(model) x = torch.randn(N, D_in, device=original_device) @@ -635,7 +635,7 @@ def test_torch_nn_module_to_api(original_device, to_argument): def test_model_without_device(): # Model doesn't have device (CPU is assumed) - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out) model = ORTModule(model) @@ -656,20 +656,20 @@ def test_model_without_device(): def test_model_and_input_without_device(): - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 
784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out) model = ORTModule(model) x = torch.randn(N, D_in) # CPU is assumed for both model and user input out = model(x) - out is not None + out is not None # noqa: B015 def test_model_with_different_devices_same_session(): os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out) model = ORTModule(model) @@ -681,14 +681,14 @@ def test_model_with_different_devices_same_session(): model.to(device) x = torch.randn(N, D_in, device=device) - y = model(x) + model(x) del os.environ["ORTMODULE_SKIPCHECK_POLICY"] @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_input_requires_grad_saved(device): - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device, requires_grad=True) + 1 @@ -698,7 +698,7 @@ def test_input_requires_grad_saved(device): @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_input_requires_grad_backward_creates_input_grad(device): - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device, requires_grad=True) @@ -711,7 +711,7 @@ def test_input_requires_grad_backward_creates_input_grad(device): def test_gradient_correctness(): device = "cuda" - N, D_in, H, D_out = 32, 128, 500, 10 + N, D_in, H, D_out = 32, 128, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -721,7 +721,7 @@ def run_step(model, x): loss.backward() return prediction - for step in range(10): + for _step in range(10): x = torch.randn(N, D_in, device=device) pt_prediction = run_step(pt_model, x) ort_prediction = run_step(ort_model, x) @@ -735,7 +735,7 @@ def run_step(model, x): def test_scatternd_correctness(device, indices): class NeuralNetScatterND(torch.nn.Module): def __init__(self): - super(NeuralNetScatterND, self).__init__() + super().__init__() def forward(self, rerouted_output, dispatch_mask, expert_output): rerouted_output[dispatch_mask] = expert_output @@ -765,7 +765,7 @@ def run_step(model, rerouted_output, dispatch_mask, expert_output): def test_gradient_correctness_conv1d(use_fp16, input_requires_grad, conv_algo_search): class NeuralNetConv1D(torch.nn.Module): def __init__(self, in_channels, out_channels, kernel_size, padding=0, groups=1): - super(NeuralNetConv1D, self).__init__() + super().__init__() self.conv1 = torch.nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, groups=groups) self.conv2 = torch.nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, groups=groups) @@ -782,7 +782,7 @@ def forward(self, input): os.environ["ORTMODULE_CONV_ALGO_SEARCH"] = conv_algo_search device = "cuda" - N, seq_len, C_in, C_out, kernel_size = 32, 128, 1536, 1536, 3 + N, seq_len, C_in, C_out, kernel_size = 32, 128, 1536, 1536, 3 # noqa: N806 pt_model = NeuralNetConv1D(C_in, C_out, kernel_size, padding=1).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -826,7 +826,7 @@ def run_step(model, x): def _run_gradient_correctness_transpose(perm, shape): class NeuralNetTranspose(torch.nn.Module): def __init__(self, perm): 
- super(NeuralNetTranspose, self).__init__() + super().__init__() self.perm = perm def forward(self, input): @@ -978,14 +978,14 @@ def test_gradient_correctness_transpose4d(perm, shape): def test_gradient_correctness_embedding(device, padding_idx): class NeuralNetEmbedding(torch.nn.Module): def __init__(self, num_embeddings, embedding_dim, hidden_size): - super(NeuralNetEmbedding, self).__init__() + super().__init__() self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) self.linear = torch.nn.Linear(embedding_dim, hidden_size) def forward(self, input): return self.linear(self.embedding(input)) - N, num_embeddings, embedding_dim, hidden_size = 64, 32, 128, 128 + N, num_embeddings, embedding_dim, hidden_size = 64, 32, 128, 128 # noqa: N806 pt_model = NeuralNetEmbedding(num_embeddings, embedding_dim, hidden_size).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1066,7 +1066,7 @@ def _run_step(model, input, target): def test_gradient_correctness_cross_entropy_loss(use_fp16): class NeuralNetCrossEntropyLoss(torch.nn.Module): def __init__(self, num_embeddings, embedding_dim): - super(NeuralNetCrossEntropyLoss, self).__init__() + super().__init__() self.embedding = torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx=1) def forward(self, input, positions): @@ -1087,7 +1087,7 @@ def run_step(model, input, positions): return loss for _ in range(10): - N = random.randint(16, 32) + N = random.randint(16, 32) # noqa: N806 input = torch.randint(high=num_embeddings, size=(N,), dtype=torch.int64, device=device) positions = torch.randint(high=N, size=(embedding_dim,), dtype=torch.int64, device=device) pt_prediction = run_step(pt_model, input, positions) @@ -1101,7 +1101,7 @@ def run_step(model, input, positions): def test_gradient_correctness_pool2d(pool_type): class NeuralNetPool2d(torch.nn.Module): def __init__(self): - super(NeuralNetPool2d, self).__init__() + super().__init__() self.conv = torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) if pool_type == "MaxPool": self.pool = torch.nn.MaxPool2d(kernel_size=3, stride=2, padding=1) @@ -1113,7 +1113,7 @@ def __init__(self): def forward(self, input): return self.pool(self.conv(input)) - N, C, H, W = 8, 3, 224, 224 + N, C, H, W = 8, 3, 224, 224 # noqa: N806 device = "cuda" pt_model = NeuralNetPool2d().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1138,7 +1138,7 @@ def run_step(model, input): def test_export_correctness_pool2d(pool_type, stride): class NeuralNetPool2d(torch.nn.Module): def __init__(self): - super(NeuralNetPool2d, self).__init__() + super().__init__() self.conv = torch.nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.pool_type = pool_type @@ -1150,7 +1150,7 @@ def forward(self, input): output = torch.nn.functional.avg_pool2d(x, kernel_size=3, stride=stride) return output - N, C, H, W = 8, 3, 224, 224 + N, C, H, W = 8, 3, 224, 224 # noqa: N806 device = "cuda" pt_model = NeuralNetPool2d().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1198,7 +1198,7 @@ def forward(self, input): # torch.max(input, dim, keepdim) returns (max_values, max_indices) return func(input, dim=dim, keepdim=keepdim) - N, C, D = 16, 256, 128 + N, C, D = 16, 256, 128 # noqa: N806 device = "cuda" pt_model = NeuralNetMax().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1234,7 +1234,7 @@ class NeuralNetMaxTwoTensors(torch.nn.Module): def forward(self, input, other): return func(input, other) - N, C, D = 16, 256, 128 + N, C, D = 
16, 256, 128 # noqa: N806 device = "cuda" pt_model = NeuralNetMaxTwoTensors().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1273,7 +1273,7 @@ def run_step(model, *input): def test_gradient_correctness_argmax_unfold(): class NeuralNetUnfold(torch.nn.Module): def __init__(self, input_size, hidden_size, unfold_dim, unfold_size, unfold_step): - super(NeuralNetUnfold, self).__init__() + super().__init__() self.linear = torch.nn.Linear(input_size, hidden_size) self.unfold_dim = unfold_dim self.unfold_size = unfold_size @@ -1284,7 +1284,7 @@ def forward(self, input): dimension=self.unfold_dim, size=self.unfold_size, step=self.unfold_step ) - N, D, H = 16, 256, 128 + N, D, H = 16, 256, 128 # noqa: N806 device = "cuda" pt_model = NeuralNetUnfold(D, H, 1, 50, 30).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1306,12 +1306,12 @@ def run_step(model, input): @pytest.mark.parametrize("high", [1, 2, 10]) def test_correctness_argmax_bitwise_or(high): - N, D, H, M = 16, 256, 128, 4 + N, D, H, M = 16, 256, 128, 4 # noqa: N806 device = "cuda" class NeuralNetBitwiseOr(torch.nn.Module): def __init__(self, high): - super(NeuralNetBitwiseOr, self).__init__() + super().__init__() self.other = torch.randint(0, high, (N, D, H), device=device) def forward(self, input): @@ -1339,7 +1339,7 @@ def run_step(model, input): def test_gradient_correctness_argmax_diagonal(offset, dim1, dim2): class NeuralNetDiagonal(torch.nn.Module): def __init__(self, offset=0, dim1=0, dim2=1): - super(NeuralNetDiagonal, self).__init__() + super().__init__() self.offset = offset self.dim1 = dim1 self.dim2 = dim2 @@ -1347,7 +1347,7 @@ def __init__(self, offset=0, dim1=0, dim2=1): def forward(self, input): return torch.diagonal(input, self.offset, self.dim1, self.dim2) - N, D, H = 16, 256, 128 + N, D, H = 16, 256, 128 # noqa: N806 device = "cuda" pt_model = NeuralNetDiagonal(offset, dim1, dim2).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1373,7 +1373,7 @@ def run_step(model, input): def test_gradient_correctness_reducesum(dim, keepdim): class NeuralNetReduceSum(torch.nn.Module): def __init__(self, input_size, hidden_size, dim, keepdim): - super(NeuralNetReduceSum, self).__init__() + super().__init__() self.linear = torch.nn.Linear(input_size, hidden_size) self.dim = dim self.keepdim = keepdim @@ -1385,7 +1385,7 @@ def forward(self, input): else: return torch.sum(t, self.dim, keepdim=self.keepdim) - N, D, H, W = 16, 256, 128, 64 + N, D, H, W = 16, 256, 128, 64 # noqa: N806 device = "cuda" pt_model = NeuralNetReduceSum(H, W, dim, keepdim).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1414,7 +1414,7 @@ def run_step(model, input): def test_gradient_correctness_chunk(dim, chunks): class NeuralNetChunk(torch.nn.Module): def __init__(self, dim): - super(NeuralNetChunk, self).__init__() + super().__init__() self.dim = dim def forward(self, input): @@ -1432,7 +1432,7 @@ def run_step(model, input): loss.backward() return results - N, D, H = 16, 17, 18 + N, D, H = 16, 17, 18 # noqa: N806 for _ in range(10): pt_input = torch.rand((N, D, H), device=device, requires_grad=True) ort_input = copy.deepcopy(pt_input) @@ -1487,7 +1487,7 @@ def run_step(model, input): def test_gradient_correctness_einsum(equation): class NeuralNetEinsum(torch.nn.Module): def __init__(self, bias_size): - super(NeuralNetEinsum, self).__init__() + super().__init__() self.register_parameter(name="bias", param=torch.nn.Parameter(torch.randn(bias_size))) def forward(self, left, right): @@ -1495,10 +1495,10 @@ def 
forward(self, left, right): return torch.einsum(equation, left, right) device = "cuda" - K, S, M, E = 16, 1024, 768, 64 - C = int(S / E * 2) + K, S, M, E = 16, 1024, 768, 64 # noqa: N806 + C = int(S / E * 2) # noqa: N806 - SIZE_MAP = {"K": K, "S": S, "E": E, "C": C, "M": M} + SIZE_MAP = {"K": K, "S": S, "E": E, "C": C, "M": M} # noqa: N806 pos1 = equation.find(",") pos2 = equation.find("->") @@ -1536,7 +1536,7 @@ def run_step(model, input_left, input_right): def test_gradient_correctness_einsum_2(): class NeuralNetEinsum(torch.nn.Module): def __init__(self, bias_size): - super(NeuralNetEinsum, self).__init__() + super().__init__() self.register_parameter(name="bias", param=torch.nn.Parameter(torch.randn(bias_size))) def forward(self, left, right): @@ -1544,9 +1544,9 @@ def forward(self, left, right): return torch.einsum(equation, left, right) device = "cuda" - A, B, C, D = 16, 32, 8, 64 + A, B, C, D = 16, 32, 8, 64 # noqa: N806 - SIZE_MAP = {"A": A, "B": B, "C": C, "D": D} + SIZE_MAP = {"A": A, "B": B, "C": C, "D": D} # noqa: N806 def to_string(perm): result = "" @@ -1631,7 +1631,7 @@ def run_step(model, input_left, input_right): def test_aten_multinomial(input_shape, num_samples, replacement): class NeuralNetDiagonal(torch.nn.Module): def __init__(self, num_samples, replacement): - super(NeuralNetDiagonal, self).__init__() + super().__init__() self.num_samples = num_samples self.replacement = replacement @@ -1688,7 +1688,7 @@ def forward(self, input: torch.Tensor): def test_numpy_T(input_shape): class NeuralNet(torch.nn.Module): def __init__(self): - super(NeuralNet, self).__init__() + super().__init__() def forward(self, input): return input.T @@ -1715,7 +1715,7 @@ def run_step(model, input): def test_aten_group_norm(): class NeuralNetGroupNorm(torch.nn.Module): def __init__(self, num_groups, num_channels): - super(NeuralNetGroupNorm, self).__init__() + super().__init__() self.group_norm = torch.nn.GroupNorm( num_groups=num_groups, num_channels=num_channels, eps=1e-5, affine=True ) @@ -1752,7 +1752,7 @@ def run_step(model, x, y): def test_aten_upsample_nearest(input_rank, use_factor): class _NeuralNetUpsampleNearest(torch.nn.Module): def __init__(self): - super(_NeuralNetUpsampleNearest, self).__init__() + super().__init__() def forward(self, input): return ( @@ -1785,7 +1785,7 @@ def run_step(model, input): def test_aten_upsample_bilinear(): class _NeuralNetUpsampleBilinear(torch.nn.Module): def __init__(self): - super(_NeuralNetUpsampleBilinear, self).__init__() + super().__init__() def forward(self, input): return torch.nn.functional.interpolate(input, size=(8, 12), mode="bilinear") @@ -1813,7 +1813,7 @@ def run_step(model, input): def test_gradient_correctness_cast_chain(): class NeuralNetCast(torch.nn.Module): def __init__(self, D): - super(NeuralNetCast, self).__init__() + super().__init__() self.a = torch.nn.parameter.Parameter(torch.rand(D)) def forward(self, b): @@ -1821,7 +1821,7 @@ def forward(self, b): output = self.a + b + mask return output - D = 16 + D = 16 # noqa: N806 device = "cuda" pt_model = NeuralNetCast(D).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1845,7 +1845,7 @@ def run_step(model, input): def test_module_with_non_differential_output(): device = "cuda" - N, D_in, H, D_out = 32, 128, 64, 10 + N, D_in, H, D_out = 32, 128, 64, 10 # noqa: N806 pt_model = NeuralNetNonDifferentiableOutput(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1855,7 +1855,7 @@ def run_step(model, x): loss.backward() return prediction1, mask1, 
prediction2, mask2 - for step in range(10): + for _step in range(10): x = torch.randn(N, D_in, device=device) pt_prediction1, pt_mask1, pt_prediction2, pt_mask2 = run_step(pt_model, x) ort_prediction1, ort_mask1, ort_prediction2, ort_mask2 = run_step(ort_model, x) @@ -1871,7 +1871,7 @@ def run_step(model, x): def test_multiple_chained_ortmodules_with_non_differential_output(): device = "cuda" - N, D_in, H, D_out = 32, 128, 64, 10 + N, D_in, H, D_out = 32, 128, 64, 10 # noqa: N806 pt_model = NeuralNetChainedLayersWithNonDifferentiableOutput(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1902,14 +1902,14 @@ def run_step(layer1, layer2, x, mask1): def test_duplicated_output(loss_with_duplicated_output): class NeuralNet(torch.nn.Module): def __init__(self): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(128, 16) def forward(self, input): out = self.fc1(input) return out, out # duplicated output - N, C, H = 8, 4, 128 + N, C, H = 8, 4, 128 # noqa: N806 device = "cuda" pt_model = NeuralNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1934,11 +1934,11 @@ def run_step(model, input): def test_multiple_forward_only_calls(): device = "cuda" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) - for step in range(10): + for _step in range(10): x = torch.randn(N, D_in, device=device, requires_grad=False) pt_prediction = pt_model(x) ort_prediction = ort_model(x) @@ -1948,7 +1948,7 @@ def test_multiple_forward_only_calls(): def test_nesting_forward_backward_calls(): device = "cuda" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1982,7 +1982,7 @@ def test_nesting_forward_backward_calls(): def test_multiple_overlapping_forward_backward_calls(): device = "cuda" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -1997,7 +1997,7 @@ def run_step(model, x1, x2): loss2.backward() return prediction1, prediction2 - for step in range(10): + for _step in range(10): pt_x1 = torch.randn(N, D_in, device=device, requires_grad=True) pt_x2 = torch.randn(N, D_in, device=device, requires_grad=True) @@ -2018,7 +2018,7 @@ def run_step(model, x1, x2): def test_multiple_ortmodules_training(): device = "cuda" - N, D_in, H, D_out = 32, 784, 128, 10 + N, D_in, H, D_out = 32, 784, 128, 10 # noqa: N806 pt_model1 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model2 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model1 = ORTModule(copy.deepcopy(pt_model1)) @@ -2034,7 +2034,7 @@ def run_step(model1, model2, x1, x2): loss2.backward() return prediction1, prediction2 - for step in range(10): + for _step in range(10): x1 = torch.randn(N, D_in, device=device) x2 = torch.randn(N, D_in, device=device) pt_prediction1, pt_prediction2 = run_step(pt_model1, pt_model2, x1, x2) @@ -2048,7 +2048,7 @@ def run_step(model1, model2, x1, x2): def test_multiple_ortmodules_common_backbone_training(): device = "cuda" - N, D_in, H, D_out = 32, 64, 128, 64 + N, D_in, H, D_out = 32, 64, 128, 64 # noqa: N806 pt_model0 = NeuralNetSinglePositionalArgument(D_in, 
H, D_out).to(device) pt_model1 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model2 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -2063,7 +2063,7 @@ def run_step(backbone_layers, task_layers, x): loss.backward() return prediction - for step in range(10): + for _step in range(10): # Run task 1 x1 = torch.randn(N, D_in, device=device) pt_prediction = run_step(pt_model0, pt_model1, x1) @@ -2074,7 +2074,7 @@ def run_step(backbone_layers, task_layers, x): _test_helpers.assert_gradients_match_and_reset_gradient(ort_model1, pt_model1) # Run task 2 - x2 = torch.randn(N, D_in, device=device) + torch.randn(N, D_in, device=device) pt_prediction = run_step(pt_model0, pt_model2, x1) ort_prediction = run_step(ort_model0, ort_model2, x1) @@ -2085,7 +2085,7 @@ def run_step(backbone_layers, task_layers, x): def test_multiple_chained_ortmodules_training(): device = "cuda" - N, D_in, H, D_out = 32, 128, 500, 128 + N, D_in, H, D_out = 32, 128, 500, 128 # noqa: N806 pt_model1 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model2 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model1 = ORTModule(copy.deepcopy(pt_model1)) @@ -2097,7 +2097,7 @@ def run_step(layers1, layers2, x): loss.backward() return prediction - for step in range(10): + for _step in range(10): x = torch.randn(N, D_in, device=device, requires_grad=True) pt_prediction = run_step(pt_model1, pt_model2, x) ort_prediction = run_step(ort_model1, ort_model2, x) @@ -2109,7 +2109,7 @@ def run_step(layers1, layers2, x): def test_mixed_nnmodule_ortmodules_training(): device = "cuda" - N, D_in, H, D_out = 32, 128, 500, 128 + N, D_in, H, D_out = 32, 128, 500, 128 # noqa: N806 pt_model1 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model2 = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model3 = NeuralNetMultiplePositionalArguments(D_in, H, D_out).to(device) @@ -2125,7 +2125,7 @@ def run_step(model1, model2, model3, x1, x2): loss.backward() return a1, a2, a3 - for step in range(10): + for _step in range(10): x1 = torch.randn(N, D_in, device=device) x2 = torch.randn(N, D_in, device=device) pt_p1, pt_p2, pt_p3 = run_step(pt_model1, pt_model2, pt_model3, x1, x2) @@ -2142,7 +2142,7 @@ def run_step(model1, model2, model3, x1, x2): def test_identity_elimination(): class NeuralNetSimpleIdentity(torch.nn.Module): def __init__(self, input_size, num_classes): - super(NeuralNetSimpleIdentity, self).__init__() + super().__init__() self.fc = torch.nn.Linear(input_size, num_classes) @@ -2154,7 +2154,7 @@ def forward(self, x): return z device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 model = NeuralNetSimpleIdentity(D_in, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2165,7 +2165,7 @@ def forward(self, x): def test_ortmodule_inputs_with_dynamic_shape(): - D_in, H, D_out = 784, 500, 10 + D_in, H, D_out = 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to("cuda") ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -2176,8 +2176,8 @@ def run_step(model, x): loss.backward() return p - for step in range(10): - N = random.randint(1, 100) + for _step in range(10): + N = random.randint(1, 100) # noqa: N806 x = torch.randn(N, D_in, device="cuda", requires_grad=True) assert x.grad is None @@ -2201,7 +2201,7 @@ def run_step(model, x, y, z): loss.backward() return outputs[0] - for step in range(10): + for _step in range(10): x, y, z = 
_get_bert_for_sequence_classification_sample_data_with_random_shapes("cuda") pt_p = run_step(pt_model, x, y, z) @@ -2217,7 +2217,7 @@ def run_step(model, x, y, z): def test_changes_input_requires_grad_reinitializes_module_gradient_graph_builder(device): os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2240,7 +2240,7 @@ def test_changes_input_requires_grad_reinitializes_module_gradient_graph_builder @pytest.mark.parametrize("device", ["cuda"]) def test_input_requires_grad_backward_creates_input_grad_as_required0(device): - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = NeuralNetMultiplePositionalArgumentsMultiOutputsWithoutDependency(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) pt_x1 = torch.randn(N, D_in, device=device, requires_grad=True) @@ -2289,7 +2289,7 @@ def run_step1(model, x1, x2): def test_model_output_with_inplace_update(device): class NeuralNetWithGradNeedOutput(torch.nn.Module): def __init__(self, input_size, hidden_size): - super(NeuralNetWithGradNeedOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) # Softmax's gradient is depending on its output self.act = torch.nn.Softmax(dim=1) @@ -2305,7 +2305,7 @@ def run_step(model, x1): y1.backward() return y1 - N, D_in, H = 32, 784, 500 + N, D_in, H = 32, 784, 500 # noqa: N806 pt_model = NeuralNetWithGradNeedOutput(D_in, H).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -2313,11 +2313,11 @@ def run_step(model, x1): ort_x1 = pt_x1.clone() with pytest.raises(Exception) as ex_info: - pt_y1 = run_step(pt_model, pt_x1) + run_step(pt_model, pt_x1) assert "modified by an inplace operation" in str(ex_info.value) with pytest.raises(Exception) as ex_info: - ort_y1 = run_step(ort_model, ort_x1) + run_step(ort_model, ort_x1) assert "modified by an inplace operation" in str(ex_info.value) @@ -2329,7 +2329,7 @@ def run_step(model, x1, x2): loss.backward() return y1, y2 - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = NeuralNetMultiplePositionalArgumentsMultiOutputsWithDependency(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -2356,7 +2356,7 @@ def run_step(model, x1, x2): s.backward() return y1, y2 - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 device = "cuda" pt_model = NeuralNetMultiplePositionalArgumentsMultiOutputsWithDependency(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -2384,7 +2384,7 @@ def run_step(model, x1, x2): def test_model_with_bypass_input(device): class NeuralNetWithBypassInput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetWithBypassInput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -2402,7 +2402,7 @@ def run_step(model, x1, x2): loss.backward() return y1, y2 - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = NeuralNetWithBypassInput(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -2451,7 +2451,7 @@ def test_gpu_reserved_memory_with_torch_no_grad(): def 
test_dict_return_value_module(return_type, device): class NeuralNetDictOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetDictOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -2471,7 +2471,7 @@ def forward(self, input1, input2, input3): out3 = self.fc3_2(self.relu3(self.fc3_1(input3))) return return_type([("loss", out1), ("logits", out2), ("hidden_states", out3)]) - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetDictOutput(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2487,7 +2487,7 @@ def forward(self, input1, input2, input3): def test_dict_of_tuple_return_value_module(device): class NeuralNetDictOfTuplesOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetDictOfTuplesOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -2507,7 +2507,7 @@ def forward(self, input1, input2, input3): out3 = self.fc3_2(self.relu3(self.fc3_1(input3))) return {"loss": (out1, out2, out3)} - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetDictOfTuplesOutput(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2523,7 +2523,7 @@ def forward(self, input1, input2, input3): def test_tuple_of_tuple_return_value_module(device): class NeuralNetTupleOfTuplesOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetTupleOfTuplesOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -2543,7 +2543,7 @@ def forward(self, input1, input2, input3): out3 = self.fc3_2(self.relu3(self.fc3_1(input3))) return ((out1, out2), out3) - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetTupleOfTuplesOutput(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2563,7 +2563,7 @@ def test_named_tuple_return_value_module(device): class NeuralNetNamedTupleOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetNamedTupleOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -2584,7 +2584,7 @@ def forward(self, input1, input2, input3): return ReturnValue(out1, out2, out3) - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetNamedTupleOutput(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2600,7 +2600,7 @@ def forward(self, input1, input2, input3): def test_exception_raised_for_custom_class_return_value_module(device): os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetCustomClassOutput(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) @@ -2628,7 +2628,7 @@ def test_dynamic_axes_config(): device = "cuda" # Model 1 - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, 
D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2666,7 +2666,7 @@ def forward(self, x): ort_model = ORTModule(copy.deepcopy(pt_model)) with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2696,7 +2696,7 @@ def forward(self, x): with pytest.raises(RuntimeError) as runtime_error: ort_model = ORTModule(copy.deepcopy(pt_model)) ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2726,7 +2726,7 @@ def forward(self, x): ort_model = ORTModule(copy.deepcopy(pt_model)) with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2756,7 +2756,7 @@ def forward(self, x): ort_model = ORTModule(copy.deepcopy(pt_model)) with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2771,10 +2771,9 @@ def test_model_with_different_cuda_devices(device): # Trick to run this test in single GPU machines device_id = _utils.get_device_index(device) if device_id >= torch.cuda.device_count(): - warnings.warn("Skipping test_model_with_different_cuda_devices(cuda:{})".format(device_id)) - return + pytest.skip(f"Skipping test_model_with_different_cuda_devices(cuda:{device_id})") - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) model = ORTModule(model) model.to(device) @@ -2840,7 +2839,7 @@ def test_wrap_ortmodule_and_change_device(): # Quick train loss_fn = torch.nn.MSELoss(reduction="sum") learning_rate = 1e-6 - for t in range(2000): + for _t in range(2000): y_pred = model(xx) loss = loss_fn(y_pred, y) model.zero_grad() @@ -2884,7 +2883,7 @@ def test_hf_model_output_with_tuples(return_dict): def test_nested_return_value_module(device): class NeuralNetNestedOutput(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetNestedOutput, self).__init__() + super().__init__() self.fc1_1 = torch.nn.Linear(input_size, hidden_size) self.relu1 = torch.nn.ReLU() @@ -2905,7 +2904,7 @@ def forward(self, input1, input2, input3): out3 = self.fc3_2(self.relu(self.relu3(self.fc3_1(input3)))) return {"a": {"b": {"c": out1}, "d": (out2, out3)}} - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetNestedOutput(D_in, H, D_out).to(device) model = ORTModule(model) @@ -2926,7 +2925,7 @@ def forward(self, input1, input2, input3): def test_forward_data_and_model_on_different_devices(data_device, model_device): os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(model_device) 
ort_model = ORTModule(model) # When exporting the model, ensure device is same between input data and model (else pytorch will raise while exporting) @@ -2940,7 +2939,7 @@ def test_forward_data_and_model_on_different_devices(data_device, model_device): # Fallback with pytest.raises(RuntimeError) as runtime_error: ort_model(x) - assert f"Expected all tensors to be on the same device, but found at least two devices" in str( + assert "Expected all tensors to be on the same device, but found at least two devices" in str( runtime_error.value ) else: @@ -2958,7 +2957,7 @@ def test_forward_data_and_model_on_different_devices(data_device, model_device): def test_forward_returns_none_type_as_output(): class NeuralNetNoneTypeOutput(torch.nn.Module): def __init__(self, input_size, num_classes): - super(NeuralNetNoneTypeOutput, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, num_classes) self.relu1 = torch.nn.ReLU() @@ -2969,7 +2968,7 @@ def forward(self, input1): return {"out": out1, "none_output": None} device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 model = NeuralNetNoneTypeOutput(D_in, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -2982,7 +2981,7 @@ def forward(self, input1): def test_bool_input_and_output(): class NeuralNetBoolInputOutput(torch.nn.Module): def __init__(self, input_size, num_classes): - super(NeuralNetBoolInputOutput, self).__init__() + super().__init__() self.fc = torch.nn.Linear(input_size, num_classes) self.relu = torch.nn.ReLU() @@ -2992,7 +2991,7 @@ def forward(self, condition, x1, x2): return out1, out2 device = "cuda" - N, D_in, D_out = 64, 784, 10 + N, D_in, D_out = 64, 784, 10 # noqa: N806 model = NeuralNetBoolInputOutput(D_in, D_out).to(device) model = ORTModule(model) condition = torch.randint(2, (N, D_in), dtype=torch.bool, device=device) @@ -3007,7 +3006,7 @@ def forward(self, condition, x1, x2): def test_uint8_input_and_output(): class NeuralNetUInt8InputOutput(torch.nn.Module): def __init__(self, input_size, num_classes): - super(NeuralNetUInt8InputOutput, self).__init__() + super().__init__() self.fc = torch.nn.Linear(input_size, num_classes) self.relu = torch.nn.ReLU() @@ -3017,7 +3016,7 @@ def forward(self, mask, x1, x2): return out1, out2 device = "cuda" - N, D_in, D_out = 64, 784, 10 + N, D_in, D_out = 64, 784, 10 # noqa: N806 model = NeuralNetUInt8InputOutput(D_in, D_out).to(device) model = ORTModule(model) condition = torch.randint(2, (N, D_in), dtype=torch.uint8, device=device) @@ -3031,7 +3030,7 @@ def forward(self, mask, x1, x2): def test_model_partially_requires_grad(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetPartialNoGradModel(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -3045,21 +3044,21 @@ def test_model_partially_requires_grad(): def test_model_wrapped_inside_torch_no_grad(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) model = ORTModule(model) x = torch.randn(N, D_in, device=device) # Make sure no exception is raised with torch.no_grad(): - output = model(x) + model(x) def test_model_initializer_requires_grad_changes_from_one_forward_to_next(): os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 
10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetPartialNoGradModel(D_in, H, D_out).to(device) model.fc1.requires_grad_(True) model = ORTModule(model) @@ -3095,7 +3094,7 @@ def test_model_initializer_requires_grad_changes_from_one_forward_to_next(): def test_model_with_registered_buffers(): class NeuralNetWithRegisteredBuffer(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetWithRegisteredBuffer, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3113,7 +3112,7 @@ def forward(self, input1): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetWithRegisteredBuffer(D_in, H, D_out).to(device) ort_model = ORTModule(model) # Check that the original forward signature is preserved. @@ -3127,7 +3126,7 @@ def forward(self, input1): def test_model_with_unused_registered_buffers(): class UnusedBufferNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(UnusedBufferNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3145,7 +3144,7 @@ def forward(self, input1): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = UnusedBufferNet(D_in, H, D_out).to(device) ort_model = ORTModule(model) # Check that the original forward signature is preserved. @@ -3159,7 +3158,7 @@ def forward(self, input1): def test_model_with_constant_and_registered_parameters(): class NeuralNetWithRegisteredParamsWithConstant(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetWithRegisteredParamsWithConstant, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3178,7 +3177,7 @@ def forward(self, input1): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetWithRegisteredParamsWithConstant(D_in, H, D_out).to(device) ort_model = ORTModule(model) # Check that the original forward signature is preserved. 
@@ -3191,11 +3190,11 @@ def forward(self, input1): def test_state_dict(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) - y = x.clone() + x.clone() state_dict_ort = ort_model.state_dict() state_dict_pt = pt_model.state_dict() @@ -3220,11 +3219,11 @@ def test_state_dict(): def test_load_state_dict(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) - y = x.clone() + x.clone() state_dict_pt = pt_model.state_dict() list(next(iter(state_dict_pt.items())))[1] += 10 @@ -3254,7 +3253,7 @@ def test_load_state_dict(): def test_named_parameters(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) named_parameters_pt = [name for name, _ in pt_model.named_parameters()] @@ -3266,7 +3265,7 @@ def test_named_parameters(): def test_parameters(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) parameters_pt = [param for param in pt_model.parameters()] @@ -3279,7 +3278,7 @@ def test_parameters(): def test_named_buffers(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model.register_buffer("sample_buffer_pt", torch.tensor(torch.randn(N, D_in, device=device))) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -3296,7 +3295,7 @@ def test_named_buffers(): def test_buffers(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) pt_model.register_buffer("sample_buffer_pt", torch.tensor(torch.randn(N, D_in, device=device))) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -3317,7 +3316,7 @@ def test_buffers(): def test_eval_with_dropout(): class NeuralNetDropout(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetDropout, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3333,7 +3332,7 @@ def forward(self, input1): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetDropout(D_in, H, D_out).to(device) model.eval() ort_model = ORTModule(copy.deepcopy(model)) @@ -3355,7 +3354,7 @@ def forward(self, input1): def test_with_torch_no_grad_context(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(model)) @@ -3379,7 +3378,7 @@ def test_with_torch_no_grad_context(): def test_unused_layer(): class Net(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(Net, self).__init__() + super().__init__() self.fc1 = 
torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3391,7 +3390,7 @@ def forward(self, input1): return out device = torch.device("cuda") - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = Net(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -3404,7 +3403,7 @@ def forward(self, input1): def test_train_eval_with_various_outputs(): class Net(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(Net, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3424,7 +3423,7 @@ def train_step(model, x): return out1, out2 device = torch.device("cuda") - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = Net(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -3452,7 +3451,7 @@ def test_forward_dynamic_args(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetPositionalArguments(input_size=D_in, hidden_size=H, num_classes=D_out).to(device) model = ORTModule(model) args_size1 = [torch.randn(N, D_in, device=device)] * 4 @@ -3590,7 +3589,7 @@ def test_forward_dynamic_kwargs(): def test_forward_call_kwargs_input(forward_function): class KwargsNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(KwargsNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3617,7 +3616,7 @@ def forward(self, pos_0, pos_1, *args, kw_0=None, kw_1=None, **kwargs): # Modeling device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = KwargsNet(input_size=D_in, hidden_size=H, num_classes=D_out).to(device) model = ORTModule(model) @@ -3639,7 +3638,7 @@ def forward(self, pos_0, pos_1, *args, kw_0=None, kw_1=None, **kwargs): def test_repro_iscontiguous(): class SimpleNet(torch.nn.Module): def __init__(self): - super(SimpleNet, self).__init__() + super().__init__() self.a = torch.nn.Parameter(torch.FloatTensor([-1.0, 1.0])) def forward(self, x): @@ -3730,7 +3729,7 @@ def forward(self, a, b, c, d, *args, kw_0=None, **kwargs): def test_forward_call_kwargs_input_unexpected_order(): class OrderlyNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(OrderlyNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() self.fc2 = torch.nn.Linear(hidden_size, num_classes) @@ -3747,7 +3746,7 @@ def forward(self, input1=None, input2=None): return out1, out2 device = "cuda" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 model = OrderlyNet(D_in, H, D_out).to(device) model = ORTModule(model) @@ -3892,7 +3891,7 @@ def run_step(expected, a, b, c, d, e, f, y, z): def test_primitive_inputs(bool_argument, int_argument, float_argument): class PrimitiveTypesInputNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(PrimitiveTypesInputNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3915,7 +3914,7 @@ def forward(self, input1, bool_argument, int_argument, float_argument): assert type(float_argument) is float device = "cuda" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: 
N806 pt_model = PrimitiveTypesInputNet(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -3931,7 +3930,7 @@ def test_changing_bool_input_re_exports_model(bool_arguments): class PrimitiveTypesInputNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(PrimitiveTypesInputNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3952,7 +3951,7 @@ def forward(self, input1, bool_argument): assert type(bool_arguments[1]) is bool device = "cuda" - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 pt_model = PrimitiveTypesInputNet(D_in, H, D_out).to(device) ort_model = ORTModule(pt_model) @@ -3971,7 +3970,7 @@ def forward(self, input1, bool_argument): def test_model_with_registered_buffer_and_dropped_parameters(): class ModelWithBufferAndDroppedParameter(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(ModelWithBufferAndDroppedParameter, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -3992,7 +3991,7 @@ def forward(self, bool_argument, input1): return out device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = ModelWithBufferAndDroppedParameter(D_in, H, D_out).to(device) model = ORTModule(model) @@ -4000,7 +3999,7 @@ def forward(self, bool_argument, input1): x = torch.randn(N, D_in, device=device) # Ensure that no exceptions are raised - out = model(bool_argument, x) + model(bool_argument, x) @pytest.mark.parametrize( @@ -4014,7 +4013,7 @@ def forward(self, bool_argument, input1): def test_unused_parameters(model, none_pt_params): device = "cuda" - N, D_in, H1, H2, D_out = 64, 784, 500, 400, 10 + N, D_in, H1, H2, D_out = 64, 784, 500, 400, 10 # noqa: F841, N806 model = model.to(device) ort_model = ORTModule(copy.deepcopy(model)) @@ -4048,7 +4047,7 @@ def test_unused_parameters(model, none_pt_params): def test_output_order(): class OutputOrderNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(OutputOrderNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.fc2 = torch.nn.Linear(input_size, hidden_size) @@ -4082,7 +4081,7 @@ def forward( ) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = OutputOrderNet(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(model)) @@ -4099,7 +4098,7 @@ def forward( @pytest.mark.parametrize("device", ["cuda", "cpu", None]) def test_stateless_model_specified_device(device): - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: F841, N806 pt_model = StatelessModel().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -4113,7 +4112,7 @@ def test_stateless_model_specified_device(device): def test_stateless_model_unspecified_device(): - N, D_in, H, D_out = 32, 784, 500, 10 + N, D_in, H, D_out = 32, 784, 500, 10 # noqa: F841, N806 pt_model = StatelessModel() ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -4137,7 +4136,7 @@ def test_stateless_model_unspecified_device(): def test_unused_parameters_does_not_unnecessarily_reinitialize(model): device = "cuda" - N, D_in, H1, H2, D_out = 64, 784, 500, 400, 10 + N, D_in, H1, H2, D_out = 64, 784, 500, 400, 10 # noqa: F841, N806 model = model.to(device) ort_model = ORTModule(copy.deepcopy(model)) 
training_manager = ort_model._torch_module._execution_manager(ort_model._is_training()) @@ -4159,14 +4158,14 @@ def test_unused_parameters_does_not_unnecessarily_reinitialize(model): def test_load_state_dict_for_wrapped_ortmodule(): class WrapperModule(torch.nn.Module): def __init__(self, ortmodule): - super(WrapperModule, self).__init__() + super().__init__() self._ortmodule = ortmodule def forward(self, x): return self._ortmodule(x) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) model = ORTModule(copy.deepcopy(model)) wrapper_module = WrapperModule(model) @@ -4236,7 +4235,7 @@ def test_ortmodule_string_inputs_are_ignored(): def test_ortmodule_list_input(): class ListNet(torch.nn.Module): def __init__(self): - super(ListNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): @@ -4245,7 +4244,7 @@ def forward(self, batch): return self.dummy + a + b device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = ListNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = [torch.randn(N, D_in, device=device), torch.randn(N, D_in, device=device)] @@ -4257,16 +4256,16 @@ def forward(self, batch): def test_ortmodule_list_input_with_unused_values(): class ListNet(torch.nn.Module): def __init__(self): - super(ListNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): - a = batch[0] + batch[0] b = batch[1] return self.dummy + b device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = ListNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = [torch.randn(N, D_in, device=device), torch.randn(N, D_in, device=device)] @@ -4278,7 +4277,7 @@ def forward(self, batch): def test_ortmodule_list_input_with_none_values(): class ListNet(torch.nn.Module): def __init__(self): - super(ListNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): @@ -4287,7 +4286,7 @@ def forward(self, batch): return self.dummy + a + b device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = ListNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = [None, torch.randn(N, D_in, device=device)] @@ -4299,7 +4298,7 @@ def forward(self, batch): def test_ortmodule_nested_list_input(): class ListNet(torch.nn.Module): def __init__(self): - super(ListNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): @@ -4311,7 +4310,7 @@ def forward(self, batch): return self.dummy + a + b + c + d + e device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = ListNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = [ @@ -4327,7 +4326,7 @@ def forward(self, batch): @pytest.mark.parametrize("mode", ["training", "inference"]) def test_debug_options_save_onnx_models_os_environment(mode): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 # Create a temporary directory for the onnx_models with tempfile.TemporaryDirectory() as temporary_dir: os.environ["ORTMODULE_SAVE_ONNX_PATH"] = temporary_dir @@ 
-4350,7 +4349,7 @@ def test_debug_options_save_onnx_models_os_environment(mode): @pytest.mark.parametrize("mode", ["training", "inference"]) def test_debug_options_save_onnx_models_cwd(mode): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(model, DebugOptions(save_onnx=True, onnx_prefix="my_cwd_model")) if mode == "inference": @@ -4390,14 +4389,14 @@ def test_debug_options_save_onnx_models_validate_fail_on_non_str_prefix(): def test_debug_options_save_onnx_models_validate_fail_on_no_prefix(): with pytest.raises(Exception) as ex_info: _ = DebugOptions(save_onnx=True) - assert f"onnx_prefix must be provided when save_onnx is set." in str(ex_info.value) + assert "onnx_prefix must be provided when save_onnx is set." in str(ex_info.value) def test_debug_options_log_level(): # NOTE: This test will output verbose logging device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(model, DebugOptions(log_level=LogLevel.VERBOSE)) x = torch.randn(N, D_in, device=device) @@ -4412,7 +4411,7 @@ def test_debug_options_log_level_os_environment(): os.environ["ORTMODULE_LOG_LEVEL"] = "INFO" device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(model) x = torch.randn(N, D_in, device=device) @@ -4433,7 +4432,7 @@ def test_debug_options_log_level_validation_fails_on_type_mismatch(): def test_ortmodule_gradient_accumulation_optimization_correctness(): class NeuralNetWithCast(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetWithCast, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -4446,7 +4445,7 @@ def forward(self, input1): return out device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetWithCast(D_in, H, D_out).to(device) # baseline model with optimization disabled @@ -4469,7 +4468,7 @@ def run_optim_step(optimizer): optimizer.step() optimizer.zero_grad() - GA_steps = 2 + GA_steps = 2 # noqa: N806 tgt_model.zero_grad() opt_model.zero_grad() @@ -4488,7 +4487,7 @@ def run_optim_step(optimizer): def test_ortmodule_dict_input(): class DictNet(torch.nn.Module): def __init__(self): - super(DictNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): @@ -4497,7 +4496,7 @@ def forward(self, batch): return self.dummy + a + b device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = DictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = {"one_value": torch.randn(N, D_in, device=device), "two_value": torch.randn(N, D_in, device=device)} @@ -4509,16 +4508,16 @@ def forward(self, batch): def test_ortmodule_dict_input_with_unused_values(): class DictNet(torch.nn.Module): def __init__(self): - super(DictNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): - b = batch["b"] + batch["b"] a = batch["a"] return self.dummy + a device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out 
= 64, 784, 500, 10 # noqa: F841, N806 pt_model = DictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = {"a": torch.randn(N, D_in, device=device), "b": torch.randn(N, D_in, device=device)} @@ -4530,7 +4529,7 @@ def forward(self, batch): def test_ortmodule_dict_input_with_none_values(): class DictNet(torch.nn.Module): def __init__(self): - super(DictNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): @@ -4539,7 +4538,7 @@ def forward(self, batch): return self.dummy + a + b device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = DictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = {"a": None, "b": torch.randn(N, D_in, device=device)} @@ -4551,7 +4550,7 @@ def forward(self, batch): def test_ortmodule_dict_input_with_nested_values(): class DictNet(torch.nn.Module): def __init__(self): - super(DictNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, batch): @@ -4563,7 +4562,7 @@ def forward(self, batch): return self.dummy + a + b + c + d + e device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = DictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = { @@ -4585,7 +4584,7 @@ def forward(self, batch): def test_ortmodule_list_dict_input_with_nested_values(): class ListDictNet(torch.nn.Module): def __init__(self): - super(ListDictNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([3])) def forward(self, batch): @@ -4597,7 +4596,7 @@ def forward(self, batch): return self.dummy + a + b + c + d + e device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = ListDictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = { @@ -4613,7 +4612,7 @@ def forward(self, batch): def test_ortmodule_list_dict_input_with_kwargs_and_registered_buffer(): class ListDictKwargsNet(torch.nn.Module): def __init__(self, N, D_in): - super(ListDictKwargsNet, self).__init__() + super().__init__() self.register_buffer("buffer", torch.ones(N, D_in, device="cuda")) self.dummy = torch.nn.Parameter(torch.FloatTensor([3])) @@ -4633,7 +4632,7 @@ def forward(self, batch, **kwargs): return out device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = ListDictKwargsNet(N, D_in).to(device) ort_model = ORTModule(copy.deepcopy(pt_model), DebugOptions(save_onnx=True, onnx_prefix="kwargsanddict")) x = { @@ -4651,7 +4650,7 @@ def forward(self, batch, **kwargs): def test_ortmodule_user_defined_method(): class UserDefinedMethodsNet(torch.nn.Module): def __init__(self): - super(UserDefinedMethodsNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([12])) def forward(self, a): @@ -4661,7 +4660,7 @@ def custom_method_returns_input(self, user_input): return user_input device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = UserDefinedMethodsNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) @@ -4678,7 +4677,7 @@ def custom_method_returns_input(self, user_input): def test_ortmodule_user_getattr_gets_successfully(): class UserDefinedMethodsNet(torch.nn.Module): def __init__(self): - 
super(UserDefinedMethodsNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([12])) def forward(self, a): @@ -4688,7 +4687,7 @@ def custom_method_returns_input(self, user_input): return user_input device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = UserDefinedMethodsNet().to(device) ort_model = ORTModule(pt_model) @@ -4701,7 +4700,7 @@ def custom_method_returns_input(self, user_input): def test_ortmodule_setattr_new_attribute(attribute): class UserNet(torch.nn.Module): def __init__(self): - super(UserNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, a): @@ -4720,7 +4719,7 @@ def forward(self, a): def test_ortmodule_setattr_on_ortmodule_copied_user_model_attribute(): class UserNet(torch.nn.Module): def __init__(self): - super(UserNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, a): @@ -4753,7 +4752,7 @@ def my_new_custom_method(self, a, b, c): def test_ortmodule_setattr_ortmodule_attribute(): class UserNet(torch.nn.Module): def __init__(self): - super(UserNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, a): @@ -4766,7 +4765,7 @@ def forward(self, a): assert not hasattr(pt_model, "_torch_module") assert "_torch_module" in ort_model.__dict__ - assert ort_model._torch_module == True + assert ort_model._torch_module is True def test_ortmodule_setattr_signals_model_changed(): @@ -4774,7 +4773,7 @@ def test_ortmodule_setattr_signals_model_changed(): class UserNet(torch.nn.Module): def __init__(self, input_flag): - super(UserNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([10])) self.input_flag = input_flag @@ -4785,7 +4784,7 @@ def forward(self, a): return a device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = UserNet(True).to(device) ort_model = ORTModule(pt_model) @@ -4793,11 +4792,11 @@ def forward(self, a): exported_model1 = ort_model._torch_module._execution_manager(True)._onnx_models.exported_model for training_mode in [False, True]: - assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed == False + assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed is False ort_model.input_flag = False for training_mode in [False, True]: - assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed == True + assert ort_model._torch_module._execution_manager(training_mode)._original_model_has_changed is True _ = ort_model(torch.randn(N, D_in, device=device)) exported_model2 = ort_model._torch_module._execution_manager(True)._onnx_models.exported_model @@ -4810,7 +4809,7 @@ def forward(self, a): def test_ortmodule_attribute_name_collision_warning(): class UserNet(torch.nn.Module): def __init__(self): - super(UserNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) self._torch_module = True @@ -4823,7 +4822,7 @@ def load_state_dict(self): device = "cuda" pt_model = UserNet().to(device) with pytest.warns(UserWarning) as warning_record: - ort_model = ORTModule(pt_model) + ORTModule(pt_model) # FutureWarning('The first argument to symbolic functions is deprecated in 1.13 and will be removed in the future. 
# Please annotate treat the first argument (g) as GraphContext and use context information from the object @@ -4838,7 +4837,7 @@ def load_state_dict(self): def test_ortmodule_ortmodule_method_attribute_copy(): class UserNetWithSelfCallingForward(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(UserNetWithSelfCallingForward, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -4854,7 +4853,7 @@ def run_forward(self, *args, **kwargs): return self(*args, **kwargs) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = UserNetWithSelfCallingForward(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -4891,7 +4890,7 @@ def run_forward(self, *args, **kwargs): ) def test_ortmodule_skip_check_load_from_os_env(policy_str, policy): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 os.environ["ORTMODULE_SKIPCHECK_POLICY"] = policy_str model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModule(model) @@ -4906,7 +4905,7 @@ def test_ortmodule_skip_check_load_from_os_env(policy_str, policy): def test_ortmodule_determinism_flag(is_training, deterministic): torch.use_deterministic_algorithms(deterministic) - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 model = NeuralNetSinglePositionalArgument(D_in, H, D_out) model = ORTModule(model) model.train(is_training) @@ -4921,7 +4920,7 @@ def test_ortmodule_determinism_flag(is_training, deterministic): def test_ortmodule_gradient_builder(): class Model(torch.nn.Module): def __init__(self): - super(Model, self).__init__() + super().__init__() def forward(self, x): return torch.cos(x) @@ -4956,7 +4955,7 @@ def run_step(model, x): def test_override_pytorch_exporter_kwargs(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -4973,7 +4972,7 @@ def test_override_pytorch_exporter_kwargs(): def test_override_pytorch_exporter_kwargs__invalid(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -4993,7 +4992,7 @@ def __init__(self, module, debug_options=None): for training_mode in [False, True]: self._torch_module._execution_manager(training_mode)._export_extra_kwargs = {"verbose": None} - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModuleExtension(model) @@ -5013,7 +5012,7 @@ def __init__(self, module, debug_options=None): for training_mode in [False, True]: self._torch_module._execution_manager(training_mode)._export_extra_kwargs = {"custom_opsets": None} - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) ort_model = ORTModuleExtension(model) @@ -5029,7 +5028,7 @@ def test_ortmodule_fused_adam_optimizer_correctness(): torch.manual_seed(8888) device = "cuda" - N, D_in, H, D_out = 32, 128, 500, 10 + N, 
D_in, H, D_out = 32, 128, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) transformers_adamw_optimizer = AdamW(pt_model.parameters(), lr=1) @@ -5077,7 +5076,7 @@ def test_ortmodule_fused_adam_optimizer_correctness_torch(): torch.manual_seed(8888) device = "cuda" - N, D_in, H, D_out = 4, 4, 8, 4 + N, D_in, H, D_out = 4, 4, 8, 4 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) adamw_optimizer = torch.optim.AdamW(pt_model.parameters(), lr=1e-3) @@ -5128,7 +5127,7 @@ def run_optim_step(optimizer): def test_sigmoid_grad(): class NeuralNetSigmoid(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetSigmoid, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.sigmoid = torch.nn.Sigmoid() @@ -5146,11 +5145,11 @@ def run_step(model, x): device = "cuda" - N, D_in, H, D_out = 120, 15360, 500, 15360 + N, D_in, H, D_out = 120, 15360, 500, 15360 # noqa: N806 pt_model = NeuralNetSigmoid(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) - for step in range(1000): + for _step in range(1000): pt_x = torch.randn(N, D_in, device=device, requires_grad=True) ort_x = copy.deepcopy(pt_x) ort_prediction, ort_loss = run_step(ort_model, ort_x) @@ -5163,7 +5162,7 @@ def run_step(model, x): def test_tanh_grad(): class NeuralNetTanh(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetTanh, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.tanh = torch.nn.Tanh() @@ -5181,11 +5180,11 @@ def run_step(model, x): device = "cuda" - N, D_in, H, D_out = 120, 1536, 500, 1536 + N, D_in, H, D_out = 120, 1536, 500, 1536 # noqa: N806 pt_model = NeuralNetTanh(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) - for step in range(10): + for _step in range(10): pt_x = torch.randn(N, D_in, device=device, requires_grad=True) ort_x = copy.deepcopy(pt_x) ort_prediction, ort_loss = run_step(ort_model, ort_x) @@ -5210,7 +5209,7 @@ def test__defined_from_envvar(): def test_sigmoid_grad_opset13(): class NeuralNetSigmoid(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetSigmoid, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.sigmoid = torch.nn.Sigmoid() @@ -5228,7 +5227,7 @@ def run_step(model, x): device = "cuda" - N, D_in, H, D_out = 120, 15360, 500, 15360 + N, D_in, H, D_out = 120, 15360, 500, 15360 # noqa: N806 pt_model = NeuralNetSigmoid(D_in, H, D_out).to(device) old_opst_cst = ortmodule_module.ONNX_OPSET_VERSION @@ -5268,7 +5267,7 @@ def run_step(model, x): def test_opset_version_change(opset_version): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -5289,7 +5288,7 @@ def test_opset_version_change(opset_version): def test_serialize_ortmodule(): device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = SerializationNet(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -5322,7 +5321,7 @@ def test_serialize_ortmodule(): def test_trilu_grad(batch_size, M, N, k, has_upper, upper): class NeuralNetTrilu(torch.nn.Module): def __init__(self, has_upper, upper): - super(NeuralNetTrilu, self).__init__() + 
super().__init__() self.upper = upper self.has_upper = has_upper @@ -5359,7 +5358,7 @@ def run_step(model, x, k): def test_softmax(M, N): class NeuralNetSoftmax(torch.nn.Module): def __init__(self): - super(NeuralNetSoftmax, self).__init__() + super().__init__() self.m = torch.nn.Softmax(dim=1) def forward(self, x): @@ -5386,11 +5385,11 @@ def run_step(model, x): def test_check_opset_is_default_opset_after_training(): - M, N = 24, 6 + M, N = 24, 6 # noqa: N806 class NeuralNetSoftmax(torch.nn.Module): def __init__(self): - super(NeuralNetSoftmax, self).__init__() + super().__init__() self.m = torch.nn.Softmax(dim=1) def forward(self, x): @@ -5422,7 +5421,7 @@ def test_random_states_unchanged_for_ortmodule(): class NeuralNetSlice(torch.nn.Module): def __init__(self): - super(NeuralNetSlice, self).__init__() + super().__init__() self.dim = 32 def forward(self, x): @@ -5458,7 +5457,7 @@ def random_state_equal(a, b): def test_squeeze_custom_symbolic_registry(): class SqueezeModel(torch.nn.Module): def __init__(self): - super(SqueezeModel, self).__init__() + super().__init__() self.conv = torch.nn.Conv2d(in_channels=3, out_channels=32, kernel_size=14, stride=14, bias=False) def forward(self, x): @@ -5494,7 +5493,7 @@ def test_eval_model_mode(): for initial_mode in (True, False): model = copy.deepcopy(ort_model) model.train(initial_mode) - for step in range(10): + for _step in range(10): for new_mode in (True, False): model.train(new_mode) model(x) @@ -5506,7 +5505,7 @@ def test_eval_model_mode(): def test_eval_onnx_models(): class NeuralNetBatchNorm(torch.nn.Module): def __init__(self, num_features): - super(NeuralNetBatchNorm, self).__init__() + super().__init__() self.bn = torch.nn.BatchNorm1d(num_features) def forward(self, input): @@ -5514,7 +5513,7 @@ def forward(self, input): device = "cuda" - N, H = 64, 128 + N, H = 64, 128 # noqa: N806 model = ORTModule(NeuralNetBatchNorm(H).to(device)) x1 = torch.randn(N, H, device=device, requires_grad=True) @@ -5535,7 +5534,7 @@ def forward(self, input): def test_kwargs_dict_input(): class DictNet(torch.nn.Module): def __init__(self): - super(DictNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, *args, **kwargs): @@ -5548,7 +5547,7 @@ def forward(self, *args, **kwargs): return self.dummy + a + b + c + d + e device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: F841, N806 pt_model = DictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) @@ -5590,7 +5589,7 @@ def forward(self, *args, named_kwarg, **kwargs): return self.dummy + a + b + c + d + e + f + g + h + i + j device = "cuda" - N, D_in = 64, 784 + N, D_in = 64, 784 # noqa: N806 pt_model = DictNet().to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) @@ -5628,7 +5627,7 @@ def forward(self, *args, named_kwarg, **kwargs): def test_non_contiguous_tensors_as_inputs(training_mode): class NonContigousNet(torch.nn.Module): def __init__(self): - super(NonContigousNet, self).__init__() + super().__init__() self.dummy = torch.nn.Parameter(torch.FloatTensor([0])) def forward(self, non_contiguous_tensor): @@ -5648,14 +5647,14 @@ def forward(self, non_contiguous_tensor): def test_gradient_correctness_bce_with_logits(): class NeuralNetBCEWithLogitsLoss(torch.nn.Module): def __init__(self, input_size, hidden_size): - super(NeuralNetBCEWithLogitsLoss, self).__init__() + super().__init__() self.linear = 
torch.nn.Linear(input_size, hidden_size) def forward(self, input, target): loss_fct = torch.nn.BCEWithLogitsLoss() return loss_fct(self.linear(input), target) - N, D, H = 16, 256, 128 + N, D, H = 16, 256, 128 # noqa: N806 device = "cuda" pt_model = NeuralNetBCEWithLogitsLoss(D, H).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py index 52b838df7b9cb..c6bf327be06a7 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd.py @@ -8,8 +8,8 @@ import pytest import torch -# Import ORT modules. -from _test_helpers import * +# FIXME: Remove star imports +from _test_helpers import * # noqa: F403 from packaging.version import Version from torch.nn.parameter import Parameter @@ -28,10 +28,10 @@ def torch_version_lower_than(v): @pytest.fixture(scope="session", autouse=True) def run_before_test_session(request): def insert_disable_fallback_in_env(): - os.environ["ORTMODULE_FALLBACK_POLICY"] = "FALLBACK_DISABLE" + os.environ["ORTMODULE_FALLBACK_POLICY"] = "FALLBACK_DISABLE" # noqa: F405 def remove_disable_fallback_from_env(): - del os.environ["ORTMODULE_FALLBACK_POLICY"] + del os.environ["ORTMODULE_FALLBACK_POLICY"] # noqa: F405 insert_disable_fallback_in_env() request.addfinalizer(remove_disable_fallback_from_env) @@ -64,7 +64,7 @@ def backward(ctx, grad_output): class GeLUModel(torch.nn.Module): def __init__(self, output_size): - super(GeLUModel, self).__init__() + super().__init__() self.relu = GeLUFunction1.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -86,7 +86,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_gelu_custom_func_rets_not_as_module_output(): @@ -116,7 +116,7 @@ def backward(ctx, grad_output): class GeLUModel(torch.nn.Module): def __init__(self, output_size): - super(GeLUModel, self).__init__() + super().__init__() self.relu = GeLUFunction2.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -144,7 +144,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_gelu_multiple_forward_runs(): @@ -174,7 +174,7 @@ def backward(ctx, grad_output): class GeLUModel(torch.nn.Module): def __init__(self, output_size): - super(GeLUModel, self).__init__() + super().__init__() self.relu = GeLUFunction3.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -196,7 +196,7 @@ def input_generator(): # generate a label that have same shape as forward output. 
label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input, run_forward_twice=True) + run_training_test_and_compare(model_builder, input_generator, label_input, run_forward_twice=True) # noqa: F405 def test_megatronf(): @@ -213,7 +213,7 @@ def backward(ctx, grad_output): class MegatronFModel(torch.nn.Module): def __init__(self, output_size): - super(MegatronFModel, self).__init__() + super().__init__() self.copy_ = MegatronFFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -236,7 +236,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_scalar_and_tuple(): @@ -261,7 +261,7 @@ def backward(ctx, grad_output): class ScalarAndTupleModel(torch.nn.Module): def __init__(self, output_size): - super(ScalarAndTupleModel, self).__init__() + super().__init__() self.activation = ScalarAndTupleFunction.apply self.linear_a = torch.nn.Linear(output_size, output_size) self.linear_b = torch.nn.Linear(output_size, output_size) @@ -283,7 +283,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_scalar_and_tuple_reordered(): @@ -308,7 +308,7 @@ def backward(ctx, grad_output): class ScalarAndTupleReorderedModel(torch.nn.Module): def __init__(self, output_size): - super(ScalarAndTupleReorderedModel, self).__init__() + super().__init__() self.activation = ScalarAndTupleReorderedFunction.apply self.linear_a = torch.nn.Linear(output_size, output_size) self.linear_b = torch.nn.Linear(output_size, output_size) @@ -330,7 +330,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_pointer_type(): @@ -365,7 +365,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 @pytest.mark.skip( @@ -389,7 +389,7 @@ def backward(ctx, grad_output): class InplaceUpdateInputAsOutputNotRequireGradModel(torch.nn.Module): def __init__(self, output_size): - super(InplaceUpdateInputAsOutputNotRequireGradModel, self).__init__() + super().__init__() self.inplace_op = InplaceUpdateInputAsOutputNotRequireGradFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -415,7 +415,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test when input is in-place updated, but does not require gradient. 
- run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) # noqa: F405 @pytest.mark.skip( @@ -435,7 +435,7 @@ def backward(ctx, grad_output): class InplaceUpdateInputNotAsOutputNotRequireGradModel(torch.nn.Module): def __init__(self, output_size): - super(InplaceUpdateInputNotAsOutputNotRequireGradModel, self).__init__() + super().__init__() self.inplace_op = InplaceUpdateInputNotAsOutputNotRequireGradFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -464,7 +464,7 @@ def input_generator(): # which is a duplicated computation with the PythonOp. # So for the weights that are used twice BUT SHOULD only used once, the gradients are almost 2x than PyTorch's grad, # this is the reason we ignore the gradient compare here. - run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) # noqa: F405 @pytest.mark.skip(reason="disable due to exporter bug https://github.com/microsoft/onnx-converters-private/issues/37.") @@ -486,7 +486,7 @@ def backward(ctx, grad_output): class InplaceUpdateInputAsOutputNotRequireGradWithMarkDirtyModel(torch.nn.Module): def __init__(self, output_size): - super(InplaceUpdateInputAsOutputNotRequireGradWithMarkDirtyModel, self).__init__() + super().__init__() self.inplace_op = InplaceUpdateInputAsOutputNotRequireGradWithMarkDirtyFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -511,7 +511,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 @pytest.mark.skip( @@ -532,7 +532,7 @@ def backward(ctx, grad_output): class InplaceUpdateInputAsOutputRequireGradModel(torch.nn.Module): def __init__(self, output_size): - super(InplaceUpdateInputAsOutputRequireGradModel, self).__init__() + super().__init__() self.inplace_op = InplaceUpdateInputAsOutputRequireGradFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -563,7 +563,7 @@ def input_generator(): # duplicated computation with the PythonOp. Thus, for the weights that are used twice BUT SHOULD # only used once, the gradients are almost 2x than PyTorch's grad, this is the reason we # ignore the gradient compare here. 
- run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) # noqa: F405 @pytest.mark.skip( @@ -586,7 +586,7 @@ def backward(ctx, grad_output): class InplaceUpdateInputNotAsOutputRequireGradModel(torch.nn.Module): def __init__(self, output_size): - super(InplaceUpdateInputNotAsOutputRequireGradModel, self).__init__() + super().__init__() self.inplace_op = InplaceUpdateInputNotAsOutputRequireGradFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -615,7 +615,7 @@ def input_generator(): # should reuse the input torch tensor @140214095996104, 140212816617984 but actually not." It seems # if we don't have mark_dirty() in auto grad forward, the result is not using the input_, # (maybe a view of it, because data address is same) - run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) + run_training_test_and_compare(model_builder, input_generator, label_input, ignore_grad_compare=True) # noqa: F405 ########################################################################################## @@ -640,7 +640,7 @@ def backward(ctx, grad_output): class InplaceUpdateInputAsOutputRequireGradWithMarkDirtyModel(torch.nn.Module): def __init__(self, output_size): - super(InplaceUpdateInputAsOutputRequireGradWithMarkDirtyModel, self).__init__() + super().__init__() self.inplace_op = InplaceUpdateInputAsOutputRequireGradWithMarkDirtyFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -665,7 +665,7 @@ def input_generator(): # generate a label that have same shape as forward output. label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_evaluation(): @@ -678,12 +678,11 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_output): - x = ctx.saved_tensors return None class EvalTestModel(torch.nn.Module): def __init__(self, output_size): - super(EvalTestModel, self).__init__() + super().__init__() self.custom_fn = EvalTestFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -707,7 +706,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test pure inferencing scenarios, when inputs don't requires_grad. - run_evaluate_test_and_compare(model_builder, input_generator, label_input) + run_evaluate_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 @pytest.mark.skipif( @@ -748,7 +747,7 @@ def backward(ctx, dw, dz): class TwoOutputModel(torch.nn.Module): def __init__(self, output_size): - super(TwoOutputModel, self).__init__() + super().__init__() self.fun = TwoOutputFunction1.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -771,13 +770,13 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. 
- run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 def test_inner_module_call(): class InnerModel(torch.nn.Module): def __init__(self, dim, device): - super(InnerModel, self).__init__() + super().__init__() self.bias = Parameter(torch.FloatTensor([1.0] * dim).to(device)) def forward(self, x): @@ -809,7 +808,7 @@ def backward(ctx, dv): class OuterModel(torch.nn.Module): def __init__(self, dim, device, use_ort): - super(OuterModel, self).__init__() + super().__init__() self.fun = OuterFunction.apply self.dim = dim self.device = device @@ -837,12 +836,12 @@ def get_inner_module_call_result(x, device, use_ort): # Test indirect ORTModule call from custom function result_pth = get_inner_module_call_result(x.detach(), "cuda:0", False) result_ort = get_inner_module_call_result(x.detach(), "cuda:0", True) - compare_tensor_list(result_ort, result_pth) + compare_tensor_list(result_ort, result_pth) # noqa: F405 # Test indirect ORTModule call from custom function result_ort = get_inner_module_call_result(x.detach(), "cpu", True) result_pth = get_inner_module_call_result(x.detach(), "cpu", False) - compare_tensor_list(result_ort, result_pth) + compare_tensor_list(result_ort, result_pth) # noqa: F405 @pytest.mark.skipif( @@ -868,7 +867,7 @@ def backward(ctx, dw, dz): class TwoOutputModel(torch.nn.Module): def __init__(self, output_size): - super(TwoOutputModel, self).__init__() + super().__init__() self.fun = TwoOutputFunction2.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -895,9 +894,9 @@ def input_generator_with_requires_grad(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 - run_training_test_and_compare(model_builder, input_generator_with_requires_grad, label_input) + run_training_test_and_compare(model_builder, input_generator_with_requires_grad, label_input) # noqa: F405 def test_multiple_stream_in_forward_function(): @@ -923,7 +922,7 @@ def backward(ctx, grad_output): class MultipleStreamModel(torch.nn.Module): def __init__(self, output_size): - super(MultipleStreamModel, self).__init__() + super().__init__() self.relu = MultipleStreamFunction1.apply def forward(self, model_input): @@ -943,7 +942,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare( + run_training_test_and_compare( # noqa: F405 model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -970,7 +969,7 @@ def backward(ctx, grad_output): class MultipleStreamModel(torch.nn.Module): def __init__(self, output_size): - super(MultipleStreamModel, self).__init__() + super().__init__() self.relu = MultipleStreamFunction2.apply def forward(self, model_input): @@ -991,7 +990,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. 
- run_training_test_and_compare( + run_training_test_and_compare( # noqa: F405 model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -1012,7 +1011,7 @@ def backward(ctx, grad_output): class MultipleStreamModel(torch.nn.Module): def __init__(self, output_size): - super(MultipleStreamModel, self).__init__() + super().__init__() self.relu = MultipleStreamFunction3.apply def forward(self, model_input): @@ -1038,7 +1037,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare( + run_training_test_and_compare( # noqa: F405 model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -1066,7 +1065,7 @@ def backward(ctx, grad_output): class MultipleStreamModel(torch.nn.Module): def __init__(self, output_size): - super(MultipleStreamModel, self).__init__() + super().__init__() self.relu = MultipleStreamFunction4.apply def forward(self, model_input): @@ -1087,7 +1086,7 @@ def input_generator(): label_input = torch.ones([output_size]) # Test multi-input and multi-output custom function. - run_training_test_and_compare( + run_training_test_and_compare( # noqa: F405 model_builder, input_generator, label_input, expected_outputs=[torch.tensor([0.224, 0.272])] ) @@ -1108,7 +1107,7 @@ def backward(ctx, dy): class Foo(torch.nn.Module): # Module calling non-differentiable function. def __init__(self): - super(Foo, self).__init__() + super().__init__() self._linear = torch.nn.Linear(2, 3) def forward(self, x): @@ -1147,7 +1146,7 @@ def test_checkpoint_function(): class A(torch.nn.Module): # A supported module. def __init__(self): - super(A, self).__init__() + super().__init__() self.l1 = torch.nn.Linear(2, 2) def forward(self, x): @@ -1158,7 +1157,7 @@ class B(torch.nn.Module): # uses gradient-checkpointing. However, its two sub-module's # are exportable, so ORTModule should be used to compute them. 
def __init__(self): - super(B, self).__init__() + super().__init__() self.l1 = torch.nn.Linear(2, 2) self.a = A() @@ -1181,7 +1180,7 @@ def run(): print("Ref:") print(y_ref) - os.environ["ORTMODULE_ALLOW_AUTOGRAD_CHECKPOINT"] = "1" + os.environ["ORTMODULE_ALLOW_AUTOGRAD_CHECKPOINT"] = "1" # noqa: F405 m = ORTModule(m) @@ -1196,7 +1195,7 @@ def run(): print("Train:") assert torch.allclose(y_ref, y_train) - del os.environ["ORTMODULE_ALLOW_AUTOGRAD_CHECKPOINT"] + del os.environ["ORTMODULE_ALLOW_AUTOGRAD_CHECKPOINT"] # noqa: F405 run() @@ -1211,12 +1210,11 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_output): - x = ctx.saved_tensors return None class TestSkippedModel(torch.nn.Module): def __init__(self, output_size): - super(TestSkippedModel, self).__init__() + super().__init__() self.custom_fn = TestSkippedFunction.apply self.bias = Parameter(torch.empty(output_size, device=torch.cuda.current_device(), dtype=torch.float)) @@ -1230,7 +1228,7 @@ def forward(self, model_input): output_size = 1024 - os.environ[ + os.environ[ # noqa: F405 "ORTMODULE_SKIPPED_AUTOGRAD_FUNCTIONS" ] = "orttraining_test_ortmodule_autograd.test_skipped_autograd_function..TestSkippedFunction" @@ -1244,7 +1242,7 @@ def forward(self, model_input): assert not can_run - del os.environ["ORTMODULE_SKIPPED_AUTOGRAD_FUNCTIONS"] + del os.environ["ORTMODULE_SKIPPED_AUTOGRAD_FUNCTIONS"] # noqa: F405 def test_pythonop_training_mode(): @@ -1286,7 +1284,7 @@ def backward(ctx, grad_output): class TestModel(torch.nn.Module): def __init__(self, output_size): - super(TestModel, self).__init__() + super().__init__() self.custom_fn = TestFunction.apply self.bias = Parameter(torch.empty(output_size, dtype=torch.float)) @@ -1368,4 +1366,4 @@ def input_generator(): return torch.randn(output_size, output_size, dtype=torch.float).requires_grad_() label_input = torch.ones([output_size]) - run_training_test_and_compare(model_builder, input_generator, label_input) + run_training_test_and_compare(model_builder, input_generator, label_input) # noqa: F405 diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py index 188e053e4711e..50016515a69e1 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_autograd_dist.py @@ -3,18 +3,19 @@ import copy -import onnxruntime import os import sys + +import _test_helpers import torch import torch.distributed as dist import torch.multiprocessing as mp -from onnxruntime.training.ortmodule import ORTModule -from onnxruntime.training.ortmodule._graph_execution_manager_factory import GraphExecutionManagerFactory -from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torch.nn.parameter import Parameter -import _test_helpers +import onnxruntime +from onnxruntime.training.ortmodule import ORTModule +from onnxruntime.training.ortmodule._graph_execution_manager_factory import GraphExecutionManagerFactory # noqa: F401 torch.manual_seed(1) onnxruntime.set_seed(1) @@ -44,7 +45,7 @@ def backward(ctx, grad_output): class ReduceWithMarkDirtyModel(torch.nn.Module): def __init__(self, dim): - super(ReduceWithMarkDirtyModel, self).__init__() + super().__init__() self.reduce_op_ = ReduceWithMarkDirtyFunction.apply self.bias = Parameter(torch.empty(dim, device=torch.cuda.current_device(), dtype=torch.float)) @@ 
-123,8 +124,8 @@ def run_with_ort_on_gpu(model, args, rank, device): size = 2 try: mp.spawn(test_Distributed_ReduceWithMarkDirtyModel, nprocs=size, args=(size,)) - except: - import sys + except Exception: + import sys # noqa: F811 sys.stdout.flush() sys.stderr.flush() diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py index 8f1d57ff138a8..6b21249317590 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py @@ -1,23 +1,28 @@ -import logging import argparse -import torch -import wget +import datetime +import logging import os -import pandas as pd -import zipfile -from transformers import BertTokenizer, AutoConfig -from sklearn.model_selection import train_test_split -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from transformers import BertForSequenceClassification, AdamW, BertConfig -from transformers import get_linear_schedule_with_warmup -import numpy as np import random import time -import datetime +import zipfile +import numpy as np +import pandas as pd +import torch +import wget +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from transformers import BertConfig # noqa: F401 +from transformers import ( + AdamW, + AutoConfig, + BertForSequenceClassification, + BertTokenizer, + get_linear_schedule_with_warmup, +) import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions +from onnxruntime.training.ortmodule import DebugOptions, ORTModule def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): @@ -28,7 +33,7 @@ def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 # Perform one full pass over the training set. - print("\n======== Epoch {:} / {:} with batch size {:} ========".format(epoch + 1, args.epochs, args.batch_size)) + print(f"\n======== Epoch {epoch + 1} / {args.epochs} with batch size {args.batch_size} ========") # Measure how long the training epoch takes. t0 = time.time() @@ -45,7 +50,6 @@ def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): # For each batch of training data... for step, batch in enumerate(train_dataloader): - if step == args.train_steps: break @@ -80,7 +84,7 @@ def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): loss = outputs[0] # Progress update every 40 batches. - if step % args.log_interval == 0 and not step == 0: + if step % args.log_interval == 0 and step != 0: # Calculate elapsed time in minutes. 
curr_time = time.time() elapsed_time = curr_time - start_time @@ -122,8 +126,8 @@ def train(model, optimizer, scheduler, train_dataloader, epoch, device, args): avg_train_loss = total_loss / len(train_dataloader) epoch_time = time.time() - t0 - print("\n Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:.4f}s".format(epoch_time)) + print(f"\n Average training loss: {avg_train_loss:.2f}") + print(f" Training epoch took: {epoch_time:.4f}s") return epoch_time @@ -133,7 +137,7 @@ def test(model, validation_dataloader, device, args): # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. - print("\nRunning Validation with batch size {:} ...".format(args.test_batch_size)) + print(f"\nRunning Validation with batch size {args.test_batch_size} ...") # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. @@ -142,8 +146,8 @@ def test(model, validation_dataloader, device, args): t0 = time.time() # Tracking variables - eval_loss, eval_accuracy = 0, 0 - nb_eval_steps, nb_eval_examples = 0, 0 + eval_loss, eval_accuracy = 0, 0 # noqa: F841 + nb_eval_steps, nb_eval_examples = 0, 0 # noqa: F841 # Evaluate data for one epoch for batch in validation_dataloader: @@ -155,7 +159,6 @@ def test(model, validation_dataloader, device, args): # Telling the model not to compute or store gradients, saving memory and # speeding up validation with torch.no_grad(): - # Forward pass, calculate logit predictions. # This will return the logits rather than the loss because we have # not provided labels. @@ -189,8 +192,8 @@ def test(model, validation_dataloader, device, args): # Report the final accuracy for this validation run. epoch_time = time.time() - t0 accuracy = eval_accuracy / nb_eval_steps - print(" Accuracy: {0:.2f}".format(accuracy)) - print(" Validation took: {:.4f}s".format(epoch_time)) + print(f" Accuracy: {accuracy:.2f}") + print(f" Validation took: {epoch_time:.4f}s") return epoch_time, accuracy @@ -234,7 +237,7 @@ def _download_dataset(download_dir): # Set the max length of encoded sentence. # 64 is slightly larger than the maximum training sentence length of 47... - MAX_LEN = 64 + MAX_LEN = 64 # noqa: N806 # Tokenize all of the sentences and map the tokens to their word IDs. input_ids = [] @@ -316,7 +319,7 @@ def flat_accuracy(preds, labels): def format_time(elapsed): """Takes a time in seconds and returns a string hh:mm:ss""" # Round to the nearest second. 
- elapsed_rounded = int(round((elapsed))) + elapsed_rounded = int(round(elapsed)) # Format as hh:mm:ss return str(datetime.timedelta(seconds=elapsed_rounded)) @@ -452,12 +455,12 @@ def main(): estimated_export = 0 if args.epochs > 1: estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1) - print(" Estimated ONNX export took: {:.4f}s".format(estimated_export)) + print(f" Estimated ONNX export took: {estimated_export:.4f}s") else: print(" Estimated ONNX export took: Estimate available when epochs > 1 only") - print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export)) - print(" Accumulated training took: {:.4f}s".format(total_training_time)) - print(" Accumulated validation took: {:.4f}s".format(total_test_time)) + print(f" Accumulated training without export took: {total_training_time - estimated_export:.4f}s") + print(f" Accumulated training took: {total_training_time:.4f}s") + print(f" Accumulated validation took: {total_test_time:.4f}s") if __name__ == "__main__": diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py index 42697766c9815..5994047a6d082 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py @@ -1,23 +1,23 @@ -import logging import argparse -import torch -import wget +import datetime +import logging import os -import pandas as pd -import zipfile -from transformers import BertTokenizer, AutoConfig -from sklearn.model_selection import train_test_split -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from transformers import BertForSequenceClassification, AdamW, BertConfig -from transformers import get_linear_schedule_with_warmup -import numpy as np import random import time -import datetime +import zipfile +import numpy as np +import pandas as pd +import torch +import wget +from sklearn.model_selection import train_test_split +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset +from transformers import AdamW # noqa: F401 +from transformers import BertConfig # noqa: F401 +from transformers import AutoConfig, BertForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions +from onnxruntime.training.ortmodule import DebugOptions, ORTModule def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, args): @@ -28,7 +28,7 @@ def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 # Perform one full pass over the training set. - print("\n======== Epoch {:} / {:} with batch size {:} ========".format(epoch + 1, args.epochs, args.batch_size)) + print(f"\n======== Epoch {epoch + 1} / {args.epochs} with batch size {args.batch_size} ========") # Measure how long the training epoch takes. t0 = time.time() @@ -51,7 +51,6 @@ def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, # For each batch of training data... 
for step, batch in enumerate(train_dataloader): - if step == args.train_steps: break @@ -80,7 +79,7 @@ def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, loss = outputs[0] # Progress update every 40 batches. - if step % args.log_interval == 0 and not step == 0: + if step % args.log_interval == 0 and step != 0: # Calculate elapsed time in minutes. curr_time = time.time() elapsed_time = curr_time - start_time @@ -125,8 +124,8 @@ def train(model, optimizer, scaler, scheduler, train_dataloader, epoch, device, avg_train_loss = total_loss / len(train_dataloader) epoch_time = time.time() - t0 - print("\n Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:.4f}s".format(epoch_time)) + print(f"\n Average training loss: {avg_train_loss:.2f}") + print(f" Training epoch took: {epoch_time:.4f}s") return epoch_time @@ -136,7 +135,7 @@ def test(model, validation_dataloader, device, args): # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. - print("\nRunning Validation with batch size {:} ...".format(args.test_batch_size)) + print(f"\nRunning Validation with batch size {args.test_batch_size} ...") # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. @@ -145,8 +144,8 @@ def test(model, validation_dataloader, device, args): t0 = time.time() # Tracking variables - eval_loss, eval_accuracy = 0, 0 - nb_eval_steps, nb_eval_examples = 0, 0 + eval_loss, eval_accuracy = 0, 0 # noqa: F841 + nb_eval_steps, nb_eval_examples = 0, 0 # noqa: F841 # Evaluate data for one epoch for batch in validation_dataloader: @@ -158,7 +157,6 @@ def test(model, validation_dataloader, device, args): # Telling the model not to compute or store gradients, saving memory and # speeding up validation with torch.no_grad(): - # Forward pass, calculate logit predictions. # This will return the logits rather than the loss because we have # not provided labels. @@ -192,8 +190,8 @@ def test(model, validation_dataloader, device, args): # Report the final accuracy for this validation run. epoch_time = time.time() - t0 accuracy = eval_accuracy / nb_eval_steps - print(" Accuracy: {0:.2f}".format(accuracy)) - print(" Validation took: {:.4f}s".format(epoch_time)) + print(f" Accuracy: {accuracy:.2f}") + print(f" Validation took: {epoch_time:.4f}s") return epoch_time, accuracy @@ -237,7 +235,7 @@ def _download_dataset(download_dir): # Set the max length of encoded sentence. # 64 is slightly larger than the maximum training sentence length of 47... - MAX_LEN = 64 + MAX_LEN = 64 # noqa: N806 # Tokenize all of the sentences and map the tokens to their word IDs. input_ids = [] @@ -319,7 +317,7 @@ def flat_accuracy(preds, labels): def format_time(elapsed): """Takes a time in seconds and returns a string hh:mm:ss""" # Round to the nearest second. 
- elapsed_rounded = int(round((elapsed))) + elapsed_rounded = int(round(elapsed)) # Format as hh:mm:ss return str(datetime.timedelta(seconds=elapsed_rounded)) @@ -464,12 +462,12 @@ def main(): estimated_export = 0 if args.epochs > 1: estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1) - print(" Estimated ONNX export took: {:.4f}s".format(estimated_export)) + print(f" Estimated ONNX export took: {estimated_export:.4f}s") else: print(" Estimated ONNX export took: Estimate available when epochs > 1 only") - print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export)) - print(" Accumulated training took: {:.4f}s".format(total_training_time)) - print(" Accumulated validation took: {:.4f}s".format(total_test_time)) + print(f" Accumulated training without export took: {total_training_time - estimated_export:.4f}s") + print(f" Accumulated training took: {total_training_time:.4f}s") + print(f" Accumulated validation took: {total_test_time:.4f}s") if __name__ == "__main__": diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py index 21ebdb52037d4..5d0dcebf88310 100755 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_pipeline_parallel.py @@ -1,13 +1,13 @@ +import argparse + +import deepspeed import torch -from torch import nn, optim import torch.distributed as dist -import deepspeed -from deepspeed.pipe import PipelineModule, LayerSpec -from deepspeed.utils import RepeatingLoader - -from onnxruntime.training.ortmodule import ORTModule, _utils +from deepspeed.pipe import LayerSpec, PipelineModule # noqa: F401 +from deepspeed.utils import RepeatingLoader # noqa: F401 +from torch import nn, optim # noqa: F401 -import argparse +from onnxruntime.training.ortmodule import ORTModule, _utils # noqa: F401 # USAGE: # pip install deepspeed diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py index 037558663b428..d848bd31378e8 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py @@ -9,20 +9,20 @@ ``` """ import argparse -import torch import time -from torchvision import datasets, transforms + +import deepspeed +import torch import torch.distributed as dist +from torchvision import datasets, transforms import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions, LogLevel - -import deepspeed +from onnxruntime.training.ortmodule import DebugOptions, LogLevel, ORTModule class NeuralNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -91,8 +91,8 @@ def train(args, model, device, optimizer, loss_fn, train_loader, epoch): avg_train_loss = total_loss / len(train_loader) epoch_time = time.time() - t0 - print("\n Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:.4f}s".format(epoch_time)) + print(f"\n Average training loss: {avg_train_loss:.2f}") + 
print(f" Training epoch took: {epoch_time:.4f}s") return epoch_time @@ -126,8 +126,8 @@ def test(args, model, device, loss_fn, test_loader): # Report the final accuracy for this validation run. epoch_time = time.time() - t0 - print(" Accuracy: {0:.2f}".format(float(correct) / len(test_loader.dataset))) - print(" Validation took: {:.4f}s".format(epoch_time)) + print(f" Accuracy: {float(correct) / len(test_loader.dataset):.2f}") + print(f" Validation took: {epoch_time:.4f}s") return epoch_time @@ -263,12 +263,12 @@ def main(): estimated_export = 0 if args.epochs > 1: estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1) - print(" Estimated ONNX export took: {:.4f}s".format(estimated_export)) + print(f" Estimated ONNX export took: {estimated_export:.4f}s") else: print(" Estimated ONNX export took: Estimate available when epochs > 1 only") - print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export)) - print(" Accumulated training took: {:.4f}s".format(total_training_time)) - print(" Accumulated validation took: {:.4f}s".format(total_test_time)) + print(f" Accumulated training without export took: {total_training_time - estimated_export:.4f}s") + print(f" Accumulated training took: {total_training_time:.4f}s") + print(f" Accumulated validation took: {total_test_time:.4f}s") if __name__ == "__main__": diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py index 7b2e08dc9ed6c..cc46a2db95003 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_experimental_json_config.py @@ -1,13 +1,15 @@ import os + import torch -from onnxruntime.training import ortmodule + from onnxruntime.capi import _pybind_state as C +from onnxruntime.training import ortmodule from onnxruntime.training.ortmodule.experimental.json_config import load_from_json class Net(torch.nn.Module): def __init__(self, input_size=784, hidden_size=500, num_classes=10): - super(Net, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -41,30 +43,30 @@ def test_load_config_from_json_1(): assert ort_model_attributes._propagate_cast_ops_allow == ["ABC", "DEF"] # test use external gpu allocator - assert ort_model_attributes._use_external_gpu_allocator == False + assert ort_model_attributes._use_external_gpu_allocator is False # test enable custom autograd function - assert ort_model_attributes._enable_custom_autograd_function == True + assert ort_model_attributes._enable_custom_autograd_function is True # test use static shape - assert ort_model_attributes._use_static_shape == True + assert ort_model_attributes._use_static_shape is True # test run symbolic shape inference - assert ort_model_attributes._run_symbolic_shape_infer == False + assert ort_model_attributes._run_symbolic_shape_infer is False # test enable grad acc optimization - assert ort_model_attributes._enable_grad_acc_optimization == True + assert ort_model_attributes._enable_grad_acc_optimization is True # test skip check assert ort_model_attributes._skip_check.value == 14 # test debug options - assert ort_model_attributes._debug_options.save_onnx_models.save == True + assert ort_model_attributes._debug_options.save_onnx_models.save is True assert 
ort_model_attributes._debug_options.save_onnx_models.name_prefix == "my_model" assert ort_model_attributes._debug_options.logging.log_level.name == "VERBOSE" # test use memory aware gradient builder. - assert ort_model_attributes._use_memory_efficient_gradient == False + assert ort_model_attributes._use_memory_efficient_gradient is False # test fallback policy assert ort_model_attributes._fallback_manager.policy.value == 1 @@ -94,30 +96,30 @@ def test_load_config_from_json_2(): assert ort_model_attributes._propagate_cast_ops_allow == ["XYZ", "PQR"] # test use external gpu allocator - assert ort_model_attributes._use_external_gpu_allocator == True + assert ort_model_attributes._use_external_gpu_allocator is True # test enable custom autograd function - assert ort_model_attributes._enable_custom_autograd_function == False + assert ort_model_attributes._enable_custom_autograd_function is False # test use static shape - assert ort_model_attributes._use_static_shape == False + assert ort_model_attributes._use_static_shape is False # test run symbolic shape inference - assert ort_model_attributes._run_symbolic_shape_infer == True + assert ort_model_attributes._run_symbolic_shape_infer is True # test enable grad acc optimization - assert ort_model_attributes._enable_grad_acc_optimization == False + assert ort_model_attributes._enable_grad_acc_optimization is False # test skip check assert ort_model_attributes._skip_check.value == 10 # test debug options - assert ort_model_attributes._debug_options.save_onnx_models.save == True + assert ort_model_attributes._debug_options.save_onnx_models.save is True assert ort_model_attributes._debug_options.save_onnx_models.name_prefix == "my_other_model" assert ort_model_attributes._debug_options.logging.log_level.name == "INFO" # test use memory aware gradient builder. 
- assert ort_model_attributes._use_memory_efficient_gradient == True + assert ort_model_attributes._use_memory_efficient_gradient is True # test fallback policy assert ort_model_attributes._fallback_manager.policy.value == 250 diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py index e1a7dd591ec36..b8b4db45bb8ee 100755 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fairscale_sharded_optimizer.py @@ -1,16 +1,18 @@ import argparse +import os +import time + +import numpy as np import torch import torch.distributed as dist import torch.multiprocessing as mp -from fairscale.optim.oss import OSS -from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP import torchvision +from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP +from fairscale.optim.oss import OSS +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 from torchvision import datasets, transforms -import time -from torch.nn.parallel import DistributedDataParallel as DDP -import os -from onnxruntime.training.ortmodule import ORTModule, DebugOptions -import numpy as np + +from onnxruntime.training.ortmodule import DebugOptions, ORTModule # Usage : # pip install fairscale @@ -27,7 +29,7 @@ def dist_init(rank, world_size): class NeuralNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -41,7 +43,6 @@ def forward(self, input1): def get_dataloader(args, rank, batch_size): - # Data loading code train_dataset = torchvision.datasets.MNIST( root=args.data_dir, train=True, transform=transforms.ToTensor(), download=True @@ -83,7 +84,7 @@ def my_loss(x, target, is_train=True): def train_step(args, model, device, optimizer, loss_fn, train_loader, epoch): - print("\n======== Epoch {:} / {:} with batch size {:} ========".format(epoch + 1, args.epochs, args.batch_size)) + print(f"\n======== Epoch {epoch + 1} / {args.epochs} with batch size {args.batch_size} ========") model.train() # Measure how long the training epoch takes. t0 = time.time() @@ -136,8 +137,8 @@ def train_step(args, model, device, optimizer, loss_fn, train_loader, epoch): avg_train_loss = total_loss / len(train_loader) epoch_time = time.time() - t0 - print("\n Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:.4f}s".format(epoch_time)) + print(f"\n Average training loss: {avg_train_loss:.2f}") + print(f" Training epoch took: {epoch_time:.4f}s") return epoch_time @@ -171,13 +172,12 @@ def test(args, model, device, loss_fn, test_loader): # Report the final accuracy for this validation run. 
epoch_time = time.time() - t0 accuracy = float(correct) / len(test_loader.dataset) - print(" Accuracy: {0:.2f}".format(accuracy)) - print(" Validation took: {:.4f}s".format(epoch_time)) + print(f" Accuracy: {accuracy:.2f}") + print(f" Validation took: {epoch_time:.4f}s") return epoch_time, accuracy def train(rank: int, args, world_size: int, epochs: int): - # DDP init example dist_init(rank, world_size) torch.backends.cudnn.deterministic = True @@ -201,9 +201,6 @@ def train(rank: int, args, world_size: int, epochs: int): train_dataloader, test_dataloader = get_dataloader(args, rank, args.batch_size) loss_fn = my_loss base_optimizer = torch.optim.SGD # pick any pytorch compliant optimizer here - base_optimizer_arguments = ( - {} - ) # pass any optimizer specific arguments here, or directly below when instantiating OSS if args.use_sharded_optimizer: # Wrap the optimizer in its state sharding brethren optimizer = OSS(params=model.parameters(), optim=base_optimizer, lr=args.lr) @@ -231,18 +228,17 @@ def train(rank: int, args, world_size: int, epochs: int): estimated_export = 0 if args.epochs > 1: estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1) - print(" Estimated ONNX export took: {:.4f}s".format(estimated_export)) + print(f" Estimated ONNX export took: {estimated_export:.4f}s") else: print(" Estimated ONNX export took: Estimate available when epochs > 1 only") - print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export)) - print(" Accumulated training took: {:.4f}s".format(total_training_time)) - print(" Accumulated validation took: {:.4f}s".format(total_test_time)) + print(f" Accumulated training without export took: {total_training_time - estimated_export:.4f}s") + print(f" Accumulated training took: {total_training_time:.4f}s") + print(f" Accumulated validation took: {total_test_time:.4f}s") dist.destroy_process_group() if __name__ == "__main__": - parser = argparse.ArgumentParser( description="Benchmark the optimizer state sharding, on a typical computer vision workload" ) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py index 6cde304a6570b..58eb507fbb37a 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_fallback.py @@ -52,7 +52,7 @@ class Point: class UnsupportedInputModel(torch.nn.Module): def __init__(self): - super(UnsupportedInputModel, self).__init__() + super().__init__() def forward(self, point): return point.x * point.y @@ -160,7 +160,7 @@ def test_ortmodule_fallback_device__mismatch(is_training, fallback_enabled, matc os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" data_device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -224,7 +224,7 @@ def test_ortmodule_fallback_output(is_training, fallback_enabled, matching_polic os.environ["ORTMODULE_FALLBACK_RETRY"] = str(not persist_fallback) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetCustomClassOutput(D_in, H, D_out).to(device) ort_model = ORTModule(copy.deepcopy(pt_model)) x = torch.randn(N, D_in, device=device) @@ -336,7 +336,7 @@ def 
test_ortmodule_fallback_torch_model(is_training, fallback_enabled, matching_ os.environ["ORTMODULE_FALLBACK_RETRY"] = str(not persist_fallback) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -379,7 +379,6 @@ def test_ortmodule_fallback_init__torch_version(is_training, fallback_enabled, m runtime_pytorch_version = version.parse(torch.__version__.split("+")[0]) minimum_runtime_pytorch_version = version.parse(MINIMUM_RUNTIME_PYTORCH_VERSION_STR) if runtime_pytorch_version < minimum_runtime_pytorch_version: - if fallback_enabled: if matching_policy: policy = "FALLBACK_BAD_INITIALIZATION" @@ -391,12 +390,12 @@ def test_ortmodule_fallback_init__torch_version(is_training, fallback_enabled, m os.environ["ORTMODULE_FALLBACK_RETRY"] = str(not persist_fallback) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) - for i in range(3): + for _i in range(3): if fallback_enabled: if matching_policy: ort_model = ORTModule(pt_model) @@ -440,10 +439,9 @@ def test_ortmodule_fallback_init__missing_cpp_extensions( if is_torch_cpp_extensions_installed(ORTMODULE_TORCH_CPP_DIR): warnings.warn( "Skipping test_ortmodule_fallback_init__missing_cpp_extensions." - f" It requires PyTorch CPP extensions to be missing" + " It requires PyTorch CPP extensions to be missing" ) else: - if fallback_enabled: if matching_policy: policy = "FALLBACK_BAD_INITIALIZATION" @@ -455,7 +453,7 @@ def test_ortmodule_fallback_init__missing_cpp_extensions( os.environ["ORTMODULE_FALLBACK_RETRY"] = str(not persist_fallback) device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 x = torch.randn(N, D_in, device=device) pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out).to(device) @@ -544,7 +542,7 @@ def test_ortmodule_fallback_warn_message(is_training, persist_fallback): os.environ["ORTMODULE_SKIPCHECK_POLICY"] = "SKIP_CHECK_DISABLED" data_device = "cuda" - N, D_in, H, D_out = 64, 784, 500, 10 + N, D_in, H, D_out = 64, 784, 500, 10 # noqa: N806 pt_model = NeuralNetSinglePositionalArgument(D_in, H, D_out) ort_model = ORTModule(copy.deepcopy(pt_model)) @@ -557,9 +555,8 @@ def test_ortmodule_fallback_warn_message(is_training, persist_fallback): inputs = torch.randn(N, D_in, device=data_device) for _ in range(3): - with pytest.raises(RuntimeError): - with pytest.warns(UserWarning) as warning_record: - ort_model(inputs) + with pytest.raises(RuntimeError), pytest.warns(UserWarning) as warning_record: + ort_model(inputs) assert "Fallback to PyTorch due to exception" in str(warning_record[0].message.args[0]) del os.environ["ORTMODULE_SKIPCHECK_POLICY"] @@ -644,11 +641,11 @@ def get_batch(source, i): optimizer = torch.optim.SGD(model.parameters(), lr=5.0) n_iter = 0 - for epoch in range(1, 2): + for _epoch in range(1, 2): model.train() # turn on train mode - num_batches = len(train_data) // bptt - for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): + len(train_data) // bptt + for _batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)): data, targets = get_batch(train_data, i) batch_size = data.size(0) if batch_size != bptt: # only on last batch diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py 
b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py index bb94a6c514977..e6c8e97fcbd0f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py @@ -1,16 +1,17 @@ import argparse import logging -import torch import time + +import torch from torchvision import datasets, transforms import onnxruntime -from onnxruntime.training.ortmodule import ORTModule, DebugOptions +from onnxruntime.training.ortmodule import DebugOptions, ORTModule class NeuralNet(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -24,7 +25,7 @@ def forward(self, input1): def train(args, model, device, optimizer, loss_fn, train_loader, epoch): - print("\n======== Epoch {:} / {:} with batch size {:} ========".format(epoch + 1, args.epochs, args.batch_size)) + print(f"\n======== Epoch {epoch + 1} / {args.epochs} with batch size {args.batch_size} ========") model.train() # Measure how long the training epoch takes. t0 = time.time() @@ -77,8 +78,8 @@ def train(args, model, device, optimizer, loss_fn, train_loader, epoch): avg_train_loss = total_loss / len(train_loader) epoch_time = time.time() - t0 - print("\n Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:.4f}s".format(epoch_time)) + print(f"\n Average training loss: {avg_train_loss:.2f}") + print(f" Training epoch took: {epoch_time:.4f}s") return epoch_time @@ -113,8 +114,8 @@ def test(args, model, device, loss_fn, test_loader): # Report the final accuracy for this validation run. epoch_time = time.time() - t0 accuracy = float(correct) / len(test_loader.dataset) - print(" Accuracy: {0:.2f}".format(accuracy)) - print(" Validation took: {:.4f}s".format(epoch_time)) + print(f" Accuracy: {accuracy:.2f}") + print(f" Validation took: {epoch_time:.4f}s") return epoch_time, accuracy @@ -235,12 +236,12 @@ def main(): estimated_export = 0 if args.epochs > 1: estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1) - print(" Estimated ONNX export took: {:.4f}s".format(estimated_export)) + print(f" Estimated ONNX export took: {estimated_export:.4f}s") else: print(" Estimated ONNX export took: Estimate available when epochs > 1 only") - print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export)) - print(" Accumulated training took: {:.4f}s".format(total_training_time)) - print(" Accumulated validation took: {:.4f}s".format(total_test_time)) + print(f" Accumulated training without export took: {total_training_time - estimated_export:.4f}s") + print(f" Accumulated training took: {total_training_time:.4f}s") + print(f" Accumulated validation took: {total_test_time:.4f}s") if __name__ == "__main__": diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py index 93426659991fe..85fe86ab93639 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_pytorch_ddp.py @@ -1,19 +1,18 @@ # This test script is a modified version of Pytorch's tutorial. # For details, see https://pytorch.org/tutorials/intermediate/ddp_tutorial.html. 
+import argparse import os -import sys +import sys # noqa: F401 import tempfile -import torch -import argparse +import torch import torch.distributed as dist +import torch.multiprocessing as mp import torch.nn as nn import torch.optim as optim -import torch.multiprocessing as mp - -from torch.nn.parallel import DistributedDataParallel as DDP +from torch.nn.parallel import DistributedDataParallel as DDP # noqa: N817 -import onnxruntime +import onnxruntime # noqa: F401 from onnxruntime.training.ortmodule import ORTModule @@ -31,7 +30,7 @@ def cleanup(): class ToyModel(nn.Module): def __init__(self): - super(ToyModel, self).__init__() + super().__init__() self.net1 = nn.Linear(10, 10) self.relu = nn.ReLU() self.net2 = nn.Linear(10, 5) @@ -102,7 +101,7 @@ def demo_checkpoint(rank, world_size, use_ort_module): loss_fn = nn.MSELoss() optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) - CHECKPOINT_PATH = os.path.join(tempfile.gettempdir(), "model.checkpoint") + CHECKPOINT_PATH = os.path.join(tempfile.gettempdir(), "model.checkpoint") # noqa: N806 if rank == 0: # All processes should see same parameters as they all start from same # random parameters and gradients are synchronized in backward passes. @@ -135,7 +134,7 @@ def demo_checkpoint(rank, world_size, use_ort_module): elif rank == 3: assert torch.allclose(loss.cpu(), torch.FloatTensor([0.825118362903595])) else: - assert False + raise AssertionError() # Not necessary to use a dist.barrier() to guard the file deletion below # as the AllReduce ops in the backward pass of DDP already served as diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py index 558b6049f6327..7e2418a8913b9 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_torch_lightning_basic.py @@ -1,13 +1,13 @@ import argparse from multiprocessing import cpu_count +import pytorch_lightning as pl import torch -from torch import nn import torch.nn.functional as F +from torch import nn +from torch.utils.data import DataLoader from torchvision import transforms from torchvision.datasets import MNIST -from torch.utils.data import DataLoader -import pytorch_lightning as pl import onnxruntime from onnxruntime.training.ortmodule import ORTModule diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py index 531085f21ce61..a376567450c78 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_bert_toy_onnx.py @@ -1,33 +1,23 @@ -import copy +import copy # noqa: F401 +import inspect # noqa: F401 +import math # noqa: F401 +import os from functools import partial -import inspect -import math -import numpy as np -from numpy.testing import assert_allclose + +import _test_commons +import _test_helpers import onnx -import os import pytest import torch +from numpy.testing import assert_allclose import onnxruntime -from onnxruntime.capi.ort_trainer import ( - IODescription as Legacy_IODescription, - ModelDescription as Legacy_ModelDescription, - LossScaler as Legacy_LossScaler, - ORTTrainer as Legacy_ORTTrainer, -) -from onnxruntime.training import ( - _utils, - amp, - checkpoint, - optim, - orttrainer, - TrainStepInfo, - 
model_desc_validation as md_val, - orttrainer_options as orttrainer_options, -) - -import _test_commons, _test_helpers +from onnxruntime.capi.ort_trainer import IODescription as Legacy_IODescription +from onnxruntime.capi.ort_trainer import LossScaler as Legacy_LossScaler +from onnxruntime.capi.ort_trainer import ModelDescription as Legacy_ModelDescription +from onnxruntime.capi.ort_trainer import ORTTrainer as Legacy_ORTTrainer +from onnxruntime.training import amp, optim, orttrainer +from onnxruntime.training import orttrainer_options as orttrainer_options ############################################################################### # Helper functions ############################################################ @@ -145,7 +135,7 @@ def load_bert_onnx_model(): class CustomLossScaler(amp.LossScaler): - def __init__(self, loss_scale=float(1 << 16)): + def __init__(self, loss_scale=float(1 << 16)): # noqa: B008 super().__init__(loss_scale) self._initial_loss_scale = loss_scale self.loss_scale = loss_scale @@ -162,7 +152,7 @@ def update(self, train_step_info): class LegacyCustomLossScaler: - def __init__(self, loss_scale=float(1 << 16)): + def __init__(self, loss_scale=float(1 << 16)): # noqa: B008 self._initial_loss_scale = loss_scale self.loss_scale_ = loss_scale @@ -173,7 +163,7 @@ def update_loss_scale(self, is_all_finite): self.loss_scale_ *= 0.9 -def legacy_model_params(lr, device=torch.device("cuda", 0)): +def legacy_model_params(lr, device=torch.device("cuda", 0)): # noqa: B008 legacy_model_desc = legacy_bert_model_description() learning_rate_description = legacy_ort_trainer_learning_rate_description() learning_rate = torch.tensor([lr]).to(device) @@ -191,7 +181,6 @@ def legacy_ort_trainer_learning_rate_description(): def legacy_bert_model_description(): - vocab_size = 30528 input_ids_desc = Legacy_IODescription("input_ids", ["batch", "max_seq_len_in_batch"]) segment_ids_desc = Legacy_IODescription("segment_ids", ["batch", "max_seq_len_in_batch"]) input_mask_desc = Legacy_IODescription("input_mask", ["batch", "max_seq_len_in_batch"]) @@ -242,7 +231,7 @@ def testToyBERTModelBasicTraining(dynamic_shape): opts = orttrainer.ORTTrainerOptions({}) trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts) - for i in range(10): + for _i in range(10): sample_input = generate_random_input_from_model_desc(model_desc) output = trainer.train_step(*sample_input) assert output.shape == torch.Size([]) @@ -264,7 +253,7 @@ def testToyBERTDeterministicCheck(expected_losses): # Modeling model_desc = bert_model_description() model = load_bert_onnx_model() - params = optimizer_parameters(model) + optimizer_parameters(model) optim_config = optim.LambConfig() opts = orttrainer.ORTTrainerOptions( { @@ -692,7 +681,7 @@ def testToyBertCheckpointFrozenWeights(): trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts) # Train for a few steps - for i in range(total_steps): + for _i in range(total_steps): sample_input = generate_random_input_from_model_desc(model_desc, seed) _ = trainer.train_step(*sample_input) sample_input = generate_random_input_from_model_desc(model_desc, seed + total_steps + 1) @@ -727,7 +716,6 @@ def testToyBertCheckpointFrozenWeights(): ) def testToyBertLoadOptimState(optimizer, mixedprecision_enabled): # Common setup - rtol = 1e-03 device = "cuda" seed = 1 torch.manual_seed(seed) diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py 
b/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py index 99606d923e1d2..d366f2cb26557 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_checkpoint_functions.py @@ -1,10 +1,12 @@ -import pytest -from unittest.mock import patch, Mock -from _test_commons import _load_pytorch_transformer_model -from onnxruntime.training import amp, checkpoint, optim, orttrainer, _checkpoint_storage +from unittest.mock import Mock, patch + import numpy as np import onnx +import pytest import torch +from _test_commons import _load_pytorch_transformer_model + +from onnxruntime.training import _checkpoint_storage, amp, checkpoint, optim, orttrainer # noqa: F401 # Helper functions @@ -32,7 +34,7 @@ def _create_trainer(zero_enabled=False): return trainer -class _training_session_mock(object): +class _training_session_mock: # noqa: N801 """Mock object for the ORTTrainer _training_session member""" def __init__(self, model_states, optimizer_states, partition_info): @@ -319,7 +321,7 @@ def test_load_state_dict_warns_when_model_optimizer_key_missing(state_dict, inpu with pytest.warns(UserWarning) as user_warning: trainer.load_state_dict(input_state_dict) - assert user_warning[0].message.args[0] == "Missing key: {} in state_dict".format(error_key) + assert user_warning[0].message.args[0] == f"Missing key: {error_key} in state_dict" @pytest.mark.parametrize("state_dict, input_state_dict, error_keys", _get_load_state_dict_strict_error_arguments()) @@ -625,7 +627,7 @@ def test_checkpoint_aggregation(load_mock): assert (state_dict["optimizer"]["non_sharded"]["Moment_2"] == np.array([6666, 5555, 4444])).all() assert (state_dict["optimizer"]["non_sharded"]["Step"] == np.array([55])).all() - assert state_dict["trainer_options"]["mixed_precision"] == False + assert state_dict["trainer_options"]["mixed_precision"] is False assert state_dict["trainer_options"]["world_rank"] == 0 assert state_dict["trainer_options"]["world_size"] == 1 assert state_dict["trainer_options"]["horizontal_parallel_size"] == 1 @@ -711,7 +713,7 @@ def test_checkpoint_aggregation_mixed_precision(load_mock): assert (state_dict["optimizer"]["non_sharded"]["Moment_2"] == np.array([6666, 5555, 4444])).all() assert (state_dict["optimizer"]["non_sharded"]["Step"] == np.array([55])).all() - assert state_dict["trainer_options"]["mixed_precision"] == True + assert state_dict["trainer_options"]["mixed_precision"] is True assert state_dict["trainer_options"]["world_rank"] == 0 assert state_dict["trainer_options"]["world_size"] == 1 assert state_dict["trainer_options"]["horizontal_parallel_size"] == 1 diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py index 57b5af656eb66..47e648ecf5d12 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py @@ -2,7 +2,7 @@ import os import tempfile from functools import partial -from packaging.version import Version as StrictVersion + import _test_commons import _test_helpers import onnx @@ -10,13 +10,12 @@ import torch import torch.nn.functional as F from numpy.testing import assert_allclose +from packaging.version import Version as StrictVersion from onnxruntime import SessionOptions, set_seed -from onnxruntime.capi.ort_trainer import IODescription 
as Legacy_IODescription from onnxruntime.capi.ort_trainer import LossScaler as Legacy_LossScaler -from onnxruntime.capi.ort_trainer import ModelDescription as Legacy_ModelDescription from onnxruntime.capi.ort_trainer import ORTTrainer as Legacy_ORTTrainer -from onnxruntime.training import PropagateCastOpsStrategy, TrainStepInfo, _utils, amp, checkpoint +from onnxruntime.training import PropagateCastOpsStrategy, TrainStepInfo, _utils, amp from onnxruntime.training import model_desc_validation as md_val from onnxruntime.training import optim, orttrainer from onnxruntime.training import orttrainer_options as orttrainer_options @@ -60,8 +59,6 @@ def testORTTrainerOptionsDefaultValues(test_input): "sliced_tensor_names": [], }, "allreduce_post_accumulation": False, - "data_parallel_size": 1, - "horizontal_parallel_size": 1, "deepspeed_zero_optimization": { "stage": 0, }, @@ -194,9 +191,9 @@ def testORTTrainerModelDescValidSchemas(input_dict, input_dtype, output_dtype): assert model_description.loss_scale_input.dtype == torch.float32 # Append type to inputs/outputs tuples - for idx, i_desc in enumerate(model_description.inputs): + for idx, i_desc in enumerate(model_description.inputs): # noqa: B007 model_description.add_type_to_input_description(idx, input_dtype[idx]) - for idx, o_desc in enumerate(model_description.outputs): + for idx, o_desc in enumerate(model_description.outputs): # noqa: B007 model_description.add_type_to_output_description(idx, output_dtype[idx]) # Verify inputs/outputs tuples are replaced by the typed counterparts @@ -265,7 +262,6 @@ def testDynamicLossScaler(): # Performing 9*2000 updates to cover all branches of LossScaler.update(train_step_info.all_finite=True) loss_scale = float(1 << 16) for cycles in range(1, 10): - # 1999 updates without overflow produces 1999 stable steps for i in range(1, 2000): new_loss_scale = default_scaler.update(train_step_info) @@ -294,7 +290,7 @@ def testDynamicLossScaler(): # Performing 24 updates to half the loss scale each time loss_scale = float(1 << 16) * (2**8) - for count in range(1, 25): + for count in range(1, 25): # noqa: B007 new_loss_scale = default_scaler.update(train_step_info) loss_scale /= 2 assert default_scaler._stable_steps_count == 0 @@ -304,7 +300,7 @@ def testDynamicLossScaler(): assert_allclose(new_loss_scale, 1.0, rtol=rtol, err_msg="loss scale mismatch") # After 25 updates, min_loss_scale is reached and loss scale is not halfed from that point on - for count in range(1, 5): + for count in range(1, 5): # noqa: B007 new_loss_scale = default_scaler.update(train_step_info) assert default_scaler._stable_steps_count == 0 assert_allclose(new_loss_scale, loss_scale, rtol=rtol, err_msg="loss scale mismatch") @@ -315,7 +311,7 @@ def testDynamicLossScalerCustomValues(): scaler = amp.loss_scaler.DynamicLossScaler( automatic_update=False, loss_scale=3, up_scale_window=7, min_loss_scale=5, max_loss_scale=10 ) - assert scaler.automatic_update == False + assert scaler.automatic_update is False assert_allclose(scaler.loss_scale, 3, rtol=rtol, err_msg="loss scale mismatch") assert_allclose(scaler.min_loss_scale, 5, rtol=rtol, err_msg="min loss scale mismatch") assert_allclose(scaler.max_loss_scale, 10, rtol=rtol, err_msg="max loss scale mismatch") @@ -331,14 +327,14 @@ def testTrainStepInfo(): optimizer_config=optimizer_config, all_finite=False, fetches=fetches, optimization_step=123, step=456 ) assert step_info.optimizer_config == optimizer_config - assert step_info.all_finite == False + assert step_info.all_finite is False assert 
step_info.fetches == fetches assert step_info.optimization_step == 123 assert step_info.step == 456 step_info = orttrainer.TrainStepInfo(optimizer_config) assert step_info.optimizer_config == optimizer_config - assert step_info.all_finite == True + assert step_info.all_finite is True assert step_info.fetches == [] assert step_info.optimization_step == 0 assert step_info.step == 0 @@ -458,7 +454,7 @@ def testOptimizerConfigAdam(): assert_allclose(0.0, cfg.lambda_coef, rtol=rtol, err_msg="lambda_coef mismatch") assert_allclose(1e-8, cfg.epsilon, rtol=rtol, err_msg="epsilon mismatch") assert_allclose(1.0, cfg.max_norm_clip, rtol=rtol, err_msg="max_norm_clip mismatch") - assert cfg.do_bias_correction == True, "lambda_coef mismatch" + assert cfg.do_bias_correction is True, "lambda_coef mismatch" assert cfg.weight_decay_mode == optim.AdamConfig.DecayMode.BEFORE_WEIGHT_UPDATE, "weight_decay_mode mismatch" @@ -475,7 +471,7 @@ def testOptimizerConfigLamb(): assert cfg.ratio_max == float("inf"), "ratio_max mismatch" assert_allclose(1e-6, cfg.epsilon, rtol=rtol, err_msg="epsilon mismatch") assert_allclose(1.0, cfg.max_norm_clip, rtol=rtol, err_msg="max_norm_clip mismatch") - assert cfg.do_bias_correction == False, "do_bias_correction mismatch" + assert cfg.do_bias_correction is False, "do_bias_correction mismatch" @pytest.mark.parametrize("optim_name", [("Adam"), ("Lamb")]) @@ -1044,7 +1040,7 @@ def testORTTrainerInternalUseContribOps(enable_onnx_contrib_ops): # Training loop data, targets = batcher_fn(train_data, 0) if not enable_onnx_contrib_ops and not pytorch_110: - with pytest.raises(Exception) as e_info: + with pytest.raises(Exception): _, _ = trainer.train_step(data, targets) else: _, _ = trainer.train_step(data, targets) @@ -1168,7 +1164,7 @@ def loss_fn(x, label): # Train once and check initial state trainer.train_step(x=data1, label=label1) state_dict = trainer.state_dict() - assert all([weight in state_dict["model"]["full_precision"].keys() for weight in ["linear.bias", "linear.weight"]]) + assert all([weight in state_dict["model"]["full_precision"] for weight in ["linear.bias", "linear.weight"]]) # Initialize training session 2 from state of Training 1 torch.manual_seed(seed) @@ -1591,7 +1587,7 @@ def testORTTrainerLegacyAndExperimentalLRScheduler(seed, device, optimizer_confi def testLossScalerLegacyAndExperimentalFullCycle(): - info = orttrainer.TrainStepInfo( + orttrainer.TrainStepInfo( optimizer_config=optim.LambConfig(lr=0.001), all_finite=True, fetches=[], optimization_step=0, step=0 ) new_ls = amp.DynamicLossScaler() @@ -1605,10 +1601,9 @@ def testLossScalerLegacyAndExperimentalFullCycle(): assert_allclose(new_ls.max_loss_scale, old_ls.max_loss_scale_) # Performing 9*2000 updates to cover all branches of LossScaler.update(train_step_info.all_finite=True) - for cycles in range(1, 10): - + for _cycles in range(1, 10): # 1999 updates without overflow produces 1999 stable steps - for i in range(1, 2000): + for _i in range(1, 2000): new_loss_scale = new_ls.update(train_step_info) old_ls.update_loss_scale(train_step_info.all_finite) old_loss_scale = old_ls.loss_scale_ @@ -1626,7 +1621,7 @@ def testLossScalerLegacyAndExperimentalFullCycle(): assert_allclose(new_loss_scale, old_loss_scale) # After 9 cycles, loss scale reaches max_loss_scale and it is not doubled from that point on - for count in range(1, 2050): + for _count in range(1, 2050): new_loss_scale = new_ls.update(train_step_info) old_ls.update_loss_scale(train_step_info.all_finite) old_loss_scale = old_ls.loss_scale_ @@ -1637,7 
+1632,7 @@ def testLossScalerLegacyAndExperimentalFullCycle(): train_step_info.all_finite = False # Performing 24 updates to half the loss scale each time - for count in range(1, 25): + for _count in range(1, 25): new_loss_scale = new_ls.update(train_step_info) old_ls.update_loss_scale(train_step_info.all_finite) old_loss_scale = old_ls.loss_scale_ @@ -1648,7 +1643,7 @@ def testLossScalerLegacyAndExperimentalFullCycle(): assert_allclose(new_loss_scale, old_loss_scale) # After 25 updates, min_loss_scale is reached and loss scale is not halfed from that point on - for count in range(1, 5): + for _count in range(1, 5): new_loss_scale = new_ls.update(train_step_info) old_ls.update_loss_scale(train_step_info.all_finite) old_loss_scale = old_ls.loss_scale_ @@ -1757,7 +1752,7 @@ def testORTTrainerOptionsEnabledAdasumFlag(test_input): """Test the enabled_adasum flag values when set enabled""" actual_values = orttrainer_options.ORTTrainerOptions(test_input) - assert actual_values.distributed.enable_adasum == True + assert actual_values.distributed.enable_adasum is True @pytest.mark.parametrize( @@ -1774,13 +1769,13 @@ def testORTTrainerOptionsDisabledAdasumFlag(test_input): """Test the enabled_adasum flag values when set disabled""" actual_values = orttrainer_options.ORTTrainerOptions(test_input) - assert actual_values.distributed.enable_adasum == False + assert actual_values.distributed.enable_adasum is False def testORTTrainerUnusedInput(): class UnusedInputModel(torch.nn.Module): def __init__(self): - super(UnusedInputModel, self).__init__() + super().__init__() def forward(self, x, y): return torch.mean(x) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortvalue.py b/orttraining/orttraining/test/python/orttraining_test_ortvalue.py index cfdc52a9f0848..7119fd405d3c4 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortvalue.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortvalue.py @@ -3,20 +3,22 @@ # Licensed under the MIT License. 
# pylint: disable=W0212,C0114,C0116 -import unittest import copy import sys +import unittest + +import _test_helpers import numpy as np -from numpy.testing import assert_almost_equal -import onnxruntime as onnxrt -from onnxruntime.capi.onnxruntime_pybind11_state import OrtValue as C_OrtValue, OrtValueVector -from onnxruntime.training.ortmodule import ORTModule, _utils -from onnxruntime.capi import _pybind_state as C import torch +from numpy.testing import assert_almost_equal from torch._C import _from_dlpack from torch.utils.dlpack import from_dlpack -import _test_helpers +import onnxruntime as onnxrt +from onnxruntime.capi import _pybind_state as C +from onnxruntime.capi.onnxruntime_pybind11_state import OrtValue as C_OrtValue +from onnxruntime.capi.onnxruntime_pybind11_state import OrtValueVector +from onnxruntime.training.ortmodule import ORTModule, _utils has_cuda = torch.cuda.is_available() @@ -208,7 +210,7 @@ def testOrtValueVectorDlPackNone_cuda(self): def test_ortmodule_dlpack(self): class NeuralNetTanh(torch.nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNetTanh, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.tanh = torch.nn.Tanh() @@ -224,11 +226,11 @@ def run_step(model, x): loss.backward() return prediction, loss - N, D_in, H, D_out = 120, 1536, 500, 1536 + N, D_in, H, D_out = 120, 1536, 500, 1536 # noqa: N806 pt_model = NeuralNetTanh(D_in, H, D_out) ort_model = ORTModule(copy.deepcopy(pt_model)) - for step in range(10): + for _step in range(10): pt_x = torch.randn(N, D_in, device="cpu", requires_grad=True) ort_x = copy.deepcopy(pt_x) ort_prediction, ort_loss = run_step(ort_model, ort_x) @@ -240,7 +242,7 @@ def run_step(model, x): def test_bool_input_and_output(self): class NeuralNetBoolInputOutput(torch.nn.Module): def __init__(self, input_size, num_classes): - super(NeuralNetBoolInputOutput, self).__init__() + super().__init__() self.fc = torch.nn.Linear(input_size, num_classes) self.relu = torch.nn.ReLU() @@ -250,7 +252,7 @@ def forward(self, condition, x1, x2): return out1, out2 device = "cpu" - N, D_in, D_out = 8, 16, 2 + N, D_in, D_out = 8, 16, 2 # noqa: N806 model = NeuralNetBoolInputOutput(D_in, D_out).to(device) model = ORTModule(model) condition = torch.randint(2, (N, D_in), dtype=torch.bool, device=device) diff --git a/orttraining/orttraining/test/python/orttraining_test_python_bindings.py b/orttraining/orttraining/test/python/orttraining_test_python_bindings.py index c9039a9f910d7..14d7b54ff339e 100644 --- a/orttraining/orttraining/test/python/orttraining_test_python_bindings.py +++ b/orttraining/orttraining/test/python/orttraining_test_python_bindings.py @@ -12,7 +12,7 @@ class SimpleModelWithCrossEntropyLoss(onnxblock.TrainingModel): def __init__(self): - super(SimpleModelWithCrossEntropyLoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.CrossEntropyLoss() def build(self, output_name): diff --git a/orttraining/orttraining/test/python/orttraining_test_sampler.py b/orttraining/orttraining/test/python/orttraining_test_sampler.py index c47b721b7d100..68f9ac5052134 100644 --- a/orttraining/orttraining/test/python/orttraining_test_sampler.py +++ b/orttraining/orttraining/test/python/orttraining_test_sampler.py @@ -2,9 +2,11 @@ # Licensed under the MIT License. 
# orttraining_test_sampler.py +import random + import torch + from onnxruntime.training.utils.data import sampler -import random class MyDataset(torch.utils.data.Dataset): @@ -45,7 +47,7 @@ def complexity_fn(sample): def test_load_balancing_data_sampler_shuffles_and_balances_load(): complexities = [] - for i in range(50): + for _i in range(50): c = torch.randint(0, 100, (1,)).item() complexities.append(c) complexities.append(c) diff --git a/orttraining/orttraining/test/python/orttraining_test_transformers.py b/orttraining/orttraining/test/python/orttraining_test_transformers.py index 1e73da0f65b3f..dbaf4a293c466 100644 --- a/orttraining/orttraining/test/python/orttraining_test_transformers.py +++ b/orttraining/orttraining/test/python/orttraining_test_transformers.py @@ -1,27 +1,19 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import unittest -import shutil -import pytest -import os import random +import unittest + import numpy as np +import torch from numpy.testing import assert_allclose -from transformers import BertConfig, BertForPreTraining, BertModel - -from orttraining_test_data_loader import ids_tensor, BatchArgsOption -from orttraining_test_utils import run_test, get_lr +from orttraining_test_data_loader import BatchArgsOption, ids_tensor +from orttraining_test_utils import get_lr, run_test +from transformers import BertConfig, BertForPreTraining import onnxruntime -from onnxruntime.capi.ort_trainer import ORTTrainer, IODescription, ModelDescription, LossScaler - -import torch +from onnxruntime.capi.ort_trainer import IODescription, LossScaler, ModelDescription, ORTTrainer # noqa: F401 class BertModelTest(unittest.TestCase): - class BertModelTester(object): + class BertModelTester: def __init__( self, parent, @@ -218,8 +210,8 @@ def create_and_check_bert_for_pretraining( option_allreduce_post_accumulation, option_gradient_accumulation_steps, option_split_batch, - option_use_internal_get_lr_this_step=[True], - option_use_internal_loss_scaler=[True], + option_use_internal_get_lr_this_step=[True], # noqa: B006 + option_use_internal_loss_scaler=[True], # noqa: B006 ): seed = 42 random.seed(seed) diff --git a/orttraining/orttraining/test/python/orttraining_test_utils.py b/orttraining/orttraining/test/python/orttraining_test_utils.py index a3ee6d9e6791a..a57ca9e1fd58c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_utils.py +++ b/orttraining/orttraining/test/python/orttraining_test_utils.py @@ -1,11 +1,10 @@ +import math + import torch -from orttraining_test_bert_postprocess import postprocess_model from orttraining_test_data_loader import BatchArgsOption, create_ort_test_dataloader, split_batch from onnxruntime.capi.ort_trainer import IODescription, ORTTrainer -from onnxruntime.training import TrainStepInfo, _utils, amp -from onnxruntime.training import model_desc_validation as md_val -from onnxruntime.training import optim, orttrainer +from onnxruntime.training import amp, optim, orttrainer from onnxruntime.training import orttrainer_options as orttrainer_options from onnxruntime.training.optim import _LRScheduler @@ -200,7 +199,7 @@ def run_test( eval_batch = None if not use_new_api: model.train() - for epoch in range(epochs): + for _epoch in range(epochs): for step, batch in enumerate(dataloader): if eval_batch is None: eval_batch = batch @@ -214,13 +213,9 @@ def run_test( if batch_args_option == BatchArgsOption.List: if not use_internal_get_lr_this_step: - batch = batch + [ - learning_rate, - ] 
+ batch = [*batch, learning_rate] if not use_internal_loss_scaler and fp16: - batch = batch + [ - loss_scale, - ] + batch = [*batch, loss_scale] outputs = model.train_step(*batch) elif batch_args_option == BatchArgsOption.Dict: args, kwargs = split_batch(batch, model_desc.inputs_, 0) diff --git a/orttraining/orttraining/test/python/orttraining_transformer_trainer.py b/orttraining/orttraining/test/python/orttraining_transformer_trainer.py index 0185670dac79f..b71751d55c9fc 100644 --- a/orttraining/orttraining/test/python/orttraining_transformer_trainer.py +++ b/orttraining/orttraining/test/python/orttraining_transformer_trainer.py @@ -4,36 +4,22 @@ import logging import os import random - -from typing import Callable, Dict, List, NamedTuple, Optional, Tuple +from typing import Callable, Dict, List, NamedTuple, Optional import numpy as np import torch -from torch import nn from torch.utils.data.dataloader import DataLoader from torch.utils.data.dataset import Dataset from torch.utils.data.distributed import DistributedSampler -from torch.utils.data.sampler import RandomSampler, SequentialSampler +from torch.utils.data.sampler import SequentialSampler from tqdm import tqdm, trange - -from transformers.data.data_collator import DataCollator, DefaultDataCollator +from transformers.data.data_collator import DefaultDataCollator from transformers.modeling_utils import PreTrainedModel from transformers.training_args import TrainingArguments import onnxruntime -from orttraining_test_bert_postprocess import postprocess_model -from onnxruntime.capi.ort_trainer import ORTTrainer, LossScaler, ModelDescription, IODescription - -from onnxruntime.training import ( - _utils, - amp, - optim, - orttrainer, - TrainStepInfo, - model_desc_validation as md_val, - orttrainer_options as orttrainer_options, -) -from onnxruntime.training.optim import LinearWarmupLRScheduler, _LRScheduler +from onnxruntime.training import amp, optim, orttrainer +from onnxruntime.training import orttrainer_options as orttrainer_options try: from torch.utils.tensorboard import SummaryWriter @@ -41,7 +27,7 @@ _has_tensorboard = True except ImportError: try: - from tensorboardX import SummaryWriter + from tensorboardX import SummaryWriter # noqa: F401 _has_tensorboard = True except ImportError: @@ -242,10 +228,9 @@ def train(self): disable=self.args.local_rank not in [-1, 0], ) - for epoch in train_iterator: + for _epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0]) for step, inputs in enumerate(epoch_iterator): - # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -266,7 +251,7 @@ def train(self): if self.args.evaluate_during_training: results = self.evaluate() for key, value in results.items(): - eval_key = "eval_{}".format(key) + eval_key = f"eval_{key}" logs[eval_key] = value loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps diff --git a/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py b/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py index b7b619a92e53b..03b66e4367f23 100644 --- a/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py +++ b/orttraining/orttraining/test/python/perf_log/ort_module_perf_test_tools.py @@ -1,13 +1,13 @@ # https://docs.microsoft.com/en-us/azure/mysql/connect-python -import mysql.connector -from mysql.connector import errorcode -import git -import os - import 
argparse +import os from datetime import datetime +import git +import mysql.connector +from mysql.connector import errorcode + def get_repo_commit(repo_path): repo = git.Repo(repo_path, search_parent_directories=True) @@ -102,6 +102,7 @@ def get_repo_commit(repo_path): %(RunConfig)s,\ %(Time)s)" + # Obtain connection string information from the portal def connect_to_perf_dashboard_db(mysql_server_name, power_bi_user_name, password, database): config = { diff --git a/orttraining/orttraining/test/python/qat_poc_example/model.py b/orttraining/orttraining/test/python/qat_poc_example/model.py index bb3e95808e2a7..91d7ccd7294f5 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/model.py +++ b/orttraining/orttraining/test/python/qat_poc_example/model.py @@ -3,15 +3,16 @@ import os import onnx -import onnxruntime.training.onnxblock as onnxblock import torch +import onnxruntime.training.onnxblock as onnxblock + class MNIST(torch.nn.Module): """MNIST PyTorch model""" def __init__(self, input_size, hidden_size, num_classes): - super(MNIST, self).__init__() + super().__init__() self.fc1 = torch.nn.Linear(input_size, hidden_size) self.relu = torch.nn.ReLU() @@ -97,7 +98,7 @@ def create_training_artifacts(model_path, artifacts_dir, model_prefix): class MNISTWithLoss(onnxblock.TrainingModel): def __init__(self): - super(MNISTWithLoss, self).__init__() + super().__init__() self.loss = onnxblock.loss.CrossEntropyLoss() def build(self, output_name): diff --git a/orttraining/orttraining/test/python/qat_poc_example/qat.py b/orttraining/orttraining/test/python/qat_poc_example/qat.py index 11fde06f663a8..51a15475ee911 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/qat.py +++ b/orttraining/orttraining/test/python/qat_poc_example/qat.py @@ -2,7 +2,6 @@ import os import onnx - import quantize import utils from model import create_training_artifacts, get_models diff --git a/orttraining/orttraining/test/python/qat_poc_example/quantize.py b/orttraining/orttraining/test/python/qat_poc_example/quantize.py index 5e6127782b019..077738daff982 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/quantize.py +++ b/orttraining/orttraining/test/python/qat_poc_example/quantize.py @@ -1,10 +1,11 @@ import logging import numpy as np -import onnxruntime.quantization as quantization import torch from torchvision import datasets, transforms +import onnxruntime.quantization as quantization + def preprocess(input_model_dir, output_model_dir): """Preprocesses the given onnx model for quantization. 
Unused for QAT process.""" diff --git a/orttraining/orttraining/test/python/qat_poc_example/train.py b/orttraining/orttraining/test/python/qat_poc_example/train.py index 2d38842b7bc22..9a429d2adc6f1 100644 --- a/orttraining/orttraining/test/python/qat_poc_example/train.py +++ b/orttraining/orttraining/test/python/qat_poc_example/train.py @@ -1,10 +1,11 @@ import logging import numpy as np -import onnxruntime.training.api as orttraining import torch from torchvision import datasets, transforms +import onnxruntime.training.api as orttraining + def _get_dataloaders(data_dir, batch_size): """Preprocesses the data and returns dataloaders.""" diff --git a/orttraining/orttraining/test/python/utils_multiple_choice.py b/orttraining/orttraining/test/python/utils_multiple_choice.py index 562ecbf8c496d..f425cf3d61545 100644 --- a/orttraining/orttraining/test/python/utils_multiple_choice.py +++ b/orttraining/orttraining/test/python/utils_multiple_choice.py @@ -2,21 +2,19 @@ # https://github.com/huggingface/transformers/blob/master/examples/multiple-choice/utils_multiple_choice.py import csv -import glob -import json +import glob # noqa: F401 +import json # noqa: F401 import logging import os from dataclasses import dataclass from enum import Enum from typing import List, Optional +import torch import tqdm from filelock import FileLock - -from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available - -import torch from torch.utils.data.dataset import Dataset +from transformers import PreTrainedTokenizer, is_tf_available, is_torch_available # noqa: F401 logger = logging.getLogger(__name__) @@ -116,7 +114,6 @@ def __init__( # and the others will use the cache. lock_path = cached_features_file + ".lock" with FileLock(lock_path): - if os.path.exists(cached_features_file) and not overwrite_cache: logger.info(f"Loading features from cached file {cached_features_file}") self.features = torch.load(cached_features_file) @@ -155,17 +152,17 @@ class SwagProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" - logger.info("LOOKING AT {} train".format(data_dir)) + logger.info(f"LOOKING AT {data_dir} train") return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train") def get_dev_examples(self, data_dir): """See base class.""" - logger.info("LOOKING AT {} dev".format(data_dir)) + logger.info(f"LOOKING AT {data_dir} dev") return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev") def get_test_examples(self, data_dir): """See base class.""" - logger.info("LOOKING AT {} dev".format(data_dir)) + logger.info(f"LOOKING AT {data_dir} dev") raise ValueError( "For swag testing, the input file does not contain a label column. It can not be tested in current code" "setting!" 
@@ -177,7 +174,7 @@ def get_labels(self): return ["0", "1", "2", "3"] def _read_csv(self, input_file): - with open(input_file, "r", encoding="utf-8") as f: + with open(input_file, encoding="utf-8") as f: return list(csv.reader(f)) def _create_examples(self, lines: List[List[str]], type: str): @@ -218,11 +215,11 @@ def convert_examples_to_features( label_map = {label: i for i, label in enumerate(label_list)} features = [] - for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): + for ex_index, example in tqdm.tqdm(enumerate(examples), desc="convert examples to features"): if ex_index % 10000 == 0: logger.info("Writing example %d of %d" % (ex_index, len(examples))) choices_inputs = [] - for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)): + for _ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)): text_a = context if example.question.find("_") != -1: # this is for cloze question diff --git a/orttraining/pytorch_frontend_examples/mnist_training.py b/orttraining/pytorch_frontend_examples/mnist_training.py index afab8a3bf7ec2..007eee6955c60 100644 --- a/orttraining/pytorch_frontend_examples/mnist_training.py +++ b/orttraining/pytorch_frontend_examples/mnist_training.py @@ -4,20 +4,20 @@ ## Model testing is not complete. -from __future__ import print_function import argparse +import os + +import numpy as np # noqa: F401 import torch import torch.nn as nn import torch.nn.functional as F -import torch.optim as optim +import torch.optim as optim # noqa: F401 +from mpi4py import MPI from torchvision import datasets, transforms -import numpy as np -import os from onnxruntime.capi.ort_trainer import IODescription, ModelDescription, ORTTrainer -from mpi4py import MPI -try: +try: # noqa: SIM105 from onnxruntime.capi._pybind_state import set_cuda_device_id except ImportError: pass @@ -25,7 +25,7 @@ class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) @@ -193,7 +193,7 @@ def main(): for epoch in range(1, args.epochs + 1): train_with_trainer(args, trainer, device, train_loader, epoch) - import pdb + import pdb # noqa: F401 test_with_trainer(args, trainer, device, test_loader) diff --git a/orttraining/tools/amdgpu/script/rocprof.py b/orttraining/tools/amdgpu/script/rocprof.py index dc91d13606fb0..e5b107ba285bf 100644 --- a/orttraining/tools/amdgpu/script/rocprof.py +++ b/orttraining/tools/amdgpu/script/rocprof.py @@ -1,7 +1,8 @@ import argparse -import numpy as np -import os import csv +import os # noqa: F401 + +import numpy as np # noqa: F401 parser = argparse.ArgumentParser() parser.add_argument("--input", type=str) diff --git a/orttraining/tools/ci_test/compare_huggingface.py b/orttraining/tools/ci_test/compare_huggingface.py index c484cfb56adcb..fd7244a0cf0b7 100755 --- a/orttraining/tools/ci_test/compare_huggingface.py +++ b/orttraining/tools/ci_test/compare_huggingface.py @@ -1,6 +1,6 @@ -import sys -import json import collections +import json +import sys actual = sys.argv[1] expect = sys.argv[2] diff --git a/orttraining/tools/ci_test/compare_results.py b/orttraining/tools/ci_test/compare_results.py index ba76b9eaf414c..24854d6cf9c82 100644 --- a/orttraining/tools/ci_test/compare_results.py +++ b/orttraining/tools/ci_test/compare_results.py @@ -1,10 +1,10 @@ # Copyright (c) Microsoft 
Corporation. All rights reserved. # Licensed under the MIT License. -import argparse +import argparse # noqa: F401 import collections import csv -import re +import re # noqa: F401 import sys Comparison = collections.namedtuple("Comparison", ["name", "fn"]) @@ -19,8 +19,7 @@ def eq(): def float_le(tolerance=None): actual_tolerance = 0.0 if tolerance is None else tolerance return Comparison( - name="less than or equal to" - + (" (tolerance: {})".format(str(actual_tolerance)) if tolerance is not None else ""), + name="less than or equal to" + (f" (tolerance: {str(actual_tolerance)})" if tolerance is not None else ""), fn=(lambda actual, expected: float(actual) <= float(expected) + actual_tolerance), ) diff --git a/orttraining/tools/ci_test/download_azure_blob_archive.py b/orttraining/tools/ci_test/download_azure_blob_archive.py index 6fa875a1d2373..6910e45bd378e 100755 --- a/orttraining/tools/ci_test/download_azure_blob_archive.py +++ b/orttraining/tools/ci_test/download_azure_blob_archive.py @@ -9,8 +9,8 @@ import subprocess import sys import tempfile -import urllib.request -import zipfile +import urllib.request # noqa: F401 +import zipfile # noqa: F401 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..", "..")) @@ -43,7 +43,7 @@ def _check_file_sha256_digest(path, expected_digest): match = actual_digest.lower() == expected_digest.lower() if not match: raise RuntimeError( - "SHA256 digest mismatch, expected: {}, actual: {}".format(expected_digest.lower(), actual_digest.lower()) + f"SHA256 digest mismatch, expected: {expected_digest.lower()}, actual: {actual_digest.lower()}" ) @@ -56,7 +56,7 @@ def main(): with tempfile.TemporaryDirectory() as temp_dir, get_azcopy() as azcopy_path: archive_path = os.path.join(temp_dir, "archive.zip") - print("Downloading archive from '{}'...".format(args.azure_blob_url)) + print(f"Downloading archive from '{args.azure_blob_url}'...") azure_blob_url = args.azure_blob_url azure_blob_sas_token = os.getenv("AZURE_BLOB_SAS_TOKEN", None) @@ -66,7 +66,7 @@ def main(): _download(azcopy_path, azure_blob_url, archive_path) if args.archive_sha256_digest: _check_file_sha256_digest(archive_path, args.archive_sha256_digest) - print("Extracting to '{}'...".format(args.target_dir)) + print(f"Extracting to '{args.target_dir}'...") shutil.unpack_archive(archive_path, args.target_dir) print("Done.") diff --git a/orttraining/tools/ci_test/run_batch_size_test.py b/orttraining/tools/ci_test/run_batch_size_test.py index 4a7ec51062914..cd93c44cf73b6 100755 --- a/orttraining/tools/ci_test/run_batch_size_test.py +++ b/orttraining/tools/ci_test/run_batch_size_test.py @@ -4,9 +4,9 @@ import argparse import collections +import os import subprocess import sys -import os def parse_args(): diff --git a/orttraining/tools/ci_test/run_bert_perf_test.py b/orttraining/tools/ci_test/run_bert_perf_test.py index 8f6a59c1fd883..fbc1403583ba0 100644 --- a/orttraining/tools/ci_test/run_bert_perf_test.py +++ b/orttraining/tools/ci_test/run_bert_perf_test.py @@ -3,10 +3,10 @@ # Licensed under the MIT License. 
import argparse +import json +import os import subprocess import sys -import os -import json from collections import namedtuple SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) @@ -59,7 +59,7 @@ def main(): cmds = [ os.path.join(args.binary_dir, "onnxruntime_training_bert"), "--model_name", - os.path.join(args.model_root, "nv/bert-large/{}".format(model)), + os.path.join(args.model_root, f"nv/bert-large/{model}"), "--train_data_dir", os.path.join(args.training_data_root, str(c.max_seq_length), "books_wiki_en_corpus/train"), "--test_data_dir", diff --git a/orttraining/tools/ci_test/run_convergence_test.py b/orttraining/tools/ci_test/run_convergence_test.py index 568e3c4cd9c4c..58250e7f8ae8c 100755 --- a/orttraining/tools/ci_test/run_convergence_test.py +++ b/orttraining/tools/ci_test/run_convergence_test.py @@ -3,12 +3,12 @@ # Licensed under the MIT License. import argparse +import os import subprocess import sys import tempfile -import os -from compare_results import compare_results_files, Comparisons +from compare_results import Comparisons, compare_results_files SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) @@ -86,7 +86,7 @@ def main(): elif args.gpu_sku == "V100_16G": reference_csv = "bert_base.convergence.baseline.csv" else: - raise ValueError("Unrecognized gpu_sku {}".format(args.gpu_sku)) + raise ValueError(f"Unrecognized gpu_sku {args.gpu_sku}") # verify output comparison_result = compare_results_files( diff --git a/orttraining/tools/ci_test/run_gpt2_perf_test.py b/orttraining/tools/ci_test/run_gpt2_perf_test.py index 8c0ac1953feed..e64fc3c7812e3 100644 --- a/orttraining/tools/ci_test/run_gpt2_perf_test.py +++ b/orttraining/tools/ci_test/run_gpt2_perf_test.py @@ -3,9 +3,9 @@ # Licensed under the MIT License. import argparse +import os import subprocess import sys -import os from collections import namedtuple SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) diff --git a/orttraining/tools/scripts/experiment.py b/orttraining/tools/scripts/experiment.py index 0e3e2ceead465..069c08ff97132 100644 --- a/orttraining/tools/scripts/experiment.py +++ b/orttraining/tools/scripts/experiment.py @@ -1,20 +1,17 @@ import argparse -import os +import os # noqa: F401 import re -import sys +import sys # noqa: F401 from azure.common.client_factory import get_client_from_cli_profile from azure.mgmt.containerregistry import ContainerRegistryManagementClient - -from azureml.core import Workspace, Experiment, Run, Datastore -from azureml.core.compute import ComputeTarget, AmlCompute - +from azureml.core import Datastore, Experiment, Run, Workspace # noqa: F401 +from azureml.core.compute import AmlCompute, ComputeTarget # noqa: F401 from azureml.core.container_registry import ContainerRegistry +from azureml.core.runconfig import MpiConfiguration, RunConfiguration # noqa: F401 +from azureml.data.azure_storage_datastore import AzureBlobDatastore, AzureFileDatastore # noqa: F401 from azureml.train.estimator import Estimator -from azureml.data.azure_storage_datastore import AzureFileDatastore, AzureBlobDatastore -from azureml.core.runconfig import MpiConfiguration, RunConfiguration - parser = argparse.ArgumentParser() parser.add_argument( "--subscription", type=str, default="ea482afa-3a32-437c-aa10-7de928a9e793" @@ -130,7 +127,7 @@ container_image = args.container registry_details = None -acr = re.match("^((\w+).azurecr.io)/(.*)", args.container) +acr = re.match("^((\\w+).azurecr.io)/(.*)", args.container) if acr: # Extract the relevant parts from the container image # e.g. 
onnxtraining.azurecr.io/azureml/bert:latest @@ -169,4 +166,4 @@ # Start the AzureML Experiment experiment = Experiment(workspace=ws, name=args.experiment) run = experiment.submit(estimator, tags) -print("Experiment running at: {}".format(run.get_portal_url())) +print(f"Experiment running at: {run.get_portal_url()}") diff --git a/orttraining/tools/scripts/gpt2_model_transform.py b/orttraining/tools/scripts/gpt2_model_transform.py index 9e018a34069e5..06f03e06632b4 100644 --- a/orttraining/tools/scripts/gpt2_model_transform.py +++ b/orttraining/tools/scripts/gpt2_model_transform.py @@ -1,10 +1,9 @@ ### Be noted: this script is developed against the model exported from Megatron GPT2 Pretraining script. import sys -import onnx -from onnx import helper, shape_inference -from onnx import TensorProto + import numpy as np +import onnx from onnx import numpy_helper if len(sys.argv) < 2: @@ -112,7 +111,7 @@ def process_concat(model): skip = True input_nodes.append(concat_input_node) - if skip == True: + if skip is True: continue # figure out target shape @@ -129,7 +128,7 @@ def process_concat(model): data = numpy_helper.to_array(attr[0].t) shape.append(np.asscalar(data)) - print("concat node: %s, new_shape is: %s" % (node.name, shape)) + print(f"concat node: {node.name}, new_shape is: {shape}") # find out the nodes need to be deleted. fuse_nodes = find_all_fused_nodes(model, node) @@ -346,6 +345,6 @@ def align_attention_mask_dim(model): # set opset version to 10 model.opset_import[0].version = 10 -f = open(output_model_name, "wb") +f = open(output_model_name, "wb") # noqa: SIM115 f.write(model.SerializeToString()) f.close() diff --git a/orttraining/tools/scripts/layer_norm_transform.py b/orttraining/tools/scripts/layer_norm_transform.py index 0ad4ea2559207..2ccc947a58832 100644 --- a/orttraining/tools/scripts/layer_norm_transform.py +++ b/orttraining/tools/scripts/layer_norm_transform.py @@ -1,8 +1,9 @@ -import sys import os.path -from onnx import * -import onnx +import sys + import numpy as np +import onnx +from onnx import * # noqa: F403 def find_node(graph_proto, op_type): @@ -17,10 +18,10 @@ def find_node(graph_proto, op_type): def gen_attribute(key, value): - attr = AttributeProto() + attr = AttributeProto() # noqa: F405 attr.name = key attr.ints.extend(int(v) for v in value) - attr.type = AttributeProto.INTS + attr.type = AttributeProto.INTS # noqa: F405 return attr @@ -45,21 +46,21 @@ def main(): # print(graph_proto) # print(graph_proto.input) - nodes_Div, map_input_Div = find_node(graph_proto, "Div") + nodes_Div, map_input_Div = find_node(graph_proto, "Div") # noqa: N806 # print(map_input_Div) - nodes_Sqrt, map_input_Sqrt = find_node(graph_proto, "Sqrt") + nodes_Sqrt, map_input_Sqrt = find_node(graph_proto, "Sqrt") # noqa: N806 # print(map_input_Sqrt) - nodes_Add, map_input_Add = find_node(graph_proto, "Add") + nodes_Add, map_input_Add = find_node(graph_proto, "Add") # noqa: N806 # print(map_input_Add) - nodes_ReduceMean, map_input_ReduceMean = find_node(graph_proto, "ReduceMean") + nodes_ReduceMean, map_input_ReduceMean = find_node(graph_proto, "ReduceMean") # noqa: N806 # print(map_input_ReduceMean) - nodes_Pow, map_input_Pow = find_node(graph_proto, "Pow") + nodes_Pow, map_input_Pow = find_node(graph_proto, "Pow") # noqa: N806 # print(map_input_Pow) - nodes_Mul, map_input_Mul = find_node(graph_proto, "Mul") + nodes_Mul, map_input_Mul = find_node(graph_proto, "Mul") # noqa: N806 # find right side Sub - nodes_Sub = [] - map_input_Sub = {} + nodes_Sub = [] # noqa: N806 + map_input_Sub = {} # 
noqa: N806 for node in graph_proto.node: if node.op_type == "Sub": if node.output[0] in map_input_Pow: @@ -68,8 +69,8 @@ def main(): # print(map_input_Sub) # find first ReduceMean - first_ReduceMean = [] - first_ReduceMean_outputs = [] + first_ReduceMean = [] # noqa: N806 + first_ReduceMean_outputs = [] # noqa: N806 for node in nodes_ReduceMean: if node.output[0] in map_input_Sub: first_ReduceMean.append(node) @@ -77,8 +78,8 @@ def main(): # print(first_ReduceMean) # find constant node - nodes_Constant = [] - map_output_Constant = {} + nodes_Constant = [] # noqa: N806 + map_output_Constant = {} # noqa: N806 for node in graph_proto.node: if node.op_type == "Constant": nodes_Constant.append(node) @@ -96,12 +97,12 @@ def main(): node_sub = map_input_Sub[node.output[0]] node_pow = map_input_Pow[node_sub.output[0]] node_reduce = map_input_ReduceMean[node_pow.output[0]] - node_Add = map_input_Add[node_reduce.output[0]] - node_Sqrt = map_input_Sqrt[node_Add.output[0]] - node_Div = map_input_Div[node_Sqrt.output[0]] - node_Mul = map_input_Mul[node_Div.output[0]] + node_Add = map_input_Add[node_reduce.output[0]] # noqa: N806 + node_Sqrt = map_input_Sqrt[node_Add.output[0]] # noqa: N806 + node_Div = map_input_Div[node_Sqrt.output[0]] # noqa: N806 + node_Mul = map_input_Mul[node_Div.output[0]] # noqa: N806 layer_norm_input.append(node_Mul.input[0]) - node_Add1 = map_input_Add[node_Mul.output[0]] + node_Add1 = map_input_Add[node_Mul.output[0]] # noqa: N806 layer_norm_input.append(node_Add1.input[1]) removed_nodes.append(node) removed_nodes.append(node_sub) @@ -120,7 +121,7 @@ def main(): layer_norm_output.append("saved_mean_" + str(id)) id = id + 1 layer_norm_output.append("saved_inv_std_var_" + str(id)) - layer_norm = helper.make_node( + layer_norm = helper.make_node( # noqa: F405 "LayerNormalization", layer_norm_input, layer_norm_output, diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index 8c0be5b08c04a..81e9f7b16be14 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -1,9 +1,8 @@ import sys -import onnx -from onnx import helper, shape_inference -from onnx import TensorProto + import numpy as np -from onnx import numpy_helper +import onnx +from onnx import TensorProto, helper, numpy_helper, shape_inference # noqa: F401 if len(sys.argv) < 2: print("Please give model path...") @@ -110,7 +109,7 @@ def process_concat(model): assert attr[0].type == 4 data = numpy_helper.to_array(attr[0].t) shape.append(np.asscalar(data)) - print("concat node: %s, new_shape is: %s" % (node.name, shape)) + print(f"concat node: {node.name}, new_shape is: {shape}") # find out the nodes need to be deleted. fuse_nodes = find_all_fused_nodes(model, node) reshape_node = find_output_node(model, node.output[0]) @@ -299,13 +298,13 @@ def add_expand_shape(model): # set opset version to 10 model.opset_import[0].version = 10 -f = open(output_model_name, "wb") +f = open(output_model_name, "wb") # noqa: SIM115 f.write(model.SerializeToString()) f.close() # Use ORT to verify the converted model. Notice that you must use python package from the # training branch because training requires some extra ops. -import onnxruntime as ort +import onnxruntime as ort # noqa: E402 # We convert model to accept variable-length batch size, so it can be any positive integer. 
batch = 3 diff --git a/orttraining/tools/scripts/nv_run_pretraining.py b/orttraining/tools/scripts/nv_run_pretraining.py index 3e51a8886ecb6..ed1333d17496f 100644 --- a/orttraining/tools/scripts/nv_run_pretraining.py +++ b/orttraining/tools/scripts/nv_run_pretraining.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. @@ -15,42 +14,34 @@ # limitations under the License. """BERT finetuning runner.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function + +import argparse # ================== -import csv -import os -import time import logging -import argparse +import os import random +import time +from concurrent.futures import ProcessPoolExecutor + +import amp_C +import apex_C import h5py -from tqdm import tqdm, trange -import os import numpy as np import torch -from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset -from torch.utils.data.distributed import DistributedSampler -import math from apex import amp -import multiprocessing - -from tokenization import BertTokenizer -from modeling import BertForPreTraining, BertConfig +from apex.amp import _amp_state +from apex.parallel import DistributedDataParallel as DDP # noqa: N817 +from apex.parallel.distributed import flat_dist_call +from file_utils import PYTORCH_PRETRAINED_BERT_CACHE # noqa: F401 +from modeling import BertConfig, BertForPreTraining from optimization import BertLAMB - -from file_utils import PYTORCH_PRETRAINED_BERT_CACHE +from schedulers import LinearWarmUpScheduler # noqa: F401 +from tokenization import BertTokenizer # noqa: F401 +from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler # noqa: F401 +from torch.utils.data.distributed import DistributedSampler # noqa: F401 +from tqdm import tqdm, trange # noqa: F401 from utils import is_main_process -from apex.parallel import DistributedDataParallel as DDP -from schedulers import LinearWarmUpScheduler -from apex.parallel.distributed import flat_dist_call -import amp_C -import apex_C -from apex.amp import _amp_state - -from concurrent.futures import ProcessPoolExecutor logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO @@ -59,7 +50,6 @@ def create_pretraining_dataset(input_file, max_pred_length, shared_list, args): - train_data = pretraining_dataset(input_file=input_file, max_pred_length=max_pred_length) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( @@ -69,7 +59,7 @@ def create_pretraining_dataset(input_file, max_pred_length, shared_list, args): return train_dataloader, input_file -class pretraining_dataset(Dataset): +class pretraining_dataset(Dataset): # noqa: N801 def __init__(self, input_file, max_pred_length): self.input_file = input_file self.max_pred_length = max_pred_length @@ -90,7 +80,6 @@ def __len__(self): return len(self.inputs[0]) def __getitem__(self, index): - [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [ torch.from_numpy(input[index].astype(np.int64)) if indice < 5 @@ -110,7 +99,6 @@ def __getitem__(self, index): def parse_arguments(): - parser = argparse.ArgumentParser() ## Required parameters @@ -223,7 +211,6 @@ def parse_arguments(): def setup_training(args): - assert torch.cuda.is_available() if args.local_rank == -1: @@ -240,7 +227,7 @@ def 
setup_training(args): if args.gradient_accumulation_steps < 1: raise ValueError( - "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps) + f"Invalid gradient_accumulation_steps parameter: {args.gradient_accumulation_steps}, should be >= 1" ) if args.train_batch_size % args.gradient_accumulation_steps != 0: raise ValueError( @@ -259,7 +246,7 @@ def setup_training(args): and os.path.exists(args.output_dir) and (os.listdir(args.output_dir) and os.listdir(args.output_dir) != ["logfile.txt"]) ): - raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) + raise ValueError(f"Output directory ({args.output_dir}) already exists and is not empty.") if not args.resume_from_checkpoint: os.makedirs(args.output_dir, exist_ok=True) @@ -268,7 +255,6 @@ def setup_training(args): def prepare_model_and_optimizer(args, device): - # Prepare model config = BertConfig.from_json_file(args.config_file) @@ -286,7 +272,7 @@ def prepare_model_and_optimizer(args, device): args.resume_step = max([int(x.split(".pt")[0].split("_")[1].strip()) for x in model_names]) global_step = args.resume_step - checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu") + checkpoint = torch.load(os.path.join(args.output_dir, f"ckpt_{global_step}.pt"), map_location="cpu") model.load_state_dict(checkpoint["model"], strict=False) if args.phase2: global_step -= args.phase1_end_step @@ -314,7 +300,6 @@ def prepare_model_and_optimizer(args, device): optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=args.max_steps ) if args.fp16: - if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) model, optimizer = amp.initialize( @@ -322,7 +307,7 @@ def prepare_model_and_optimizer(args, device): optimizer, opt_level="O2", loss_scale="dynamic", - master_weights=False if args.accumulate_into_fp16 else True, + master_weights=not args.accumulate_into_fp16, ) else: # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) @@ -331,7 +316,7 @@ def prepare_model_and_optimizer(args, device): optimizer, opt_level="O2", loss_scale=args.loss_scale, - master_weights=False if args.accumulate_into_fp16 else True, + master_weights=not args.accumulate_into_fp16, ) amp._amp_state.loss_scalers[0]._loss_scale = 2**20 @@ -341,7 +326,7 @@ def prepare_model_and_optimizer(args, device): # Override hyperparameters from Phase 1 for key in keys: checkpoint["optimizer"]["state"][key]["step"] = global_step - for iter, item in enumerate(checkpoint["optimizer"]["param_groups"]): + for iter, _item in enumerate(checkpoint["optimizer"]["param_groups"]): checkpoint["optimizer"]["param_groups"][iter]["t_total"] = args.max_steps checkpoint["optimizer"]["param_groups"][iter]["warmup"] = args.warmup_proportion checkpoint["optimizer"]["param_groups"][iter]["lr"] = args.learning_rate @@ -367,7 +352,6 @@ def prepare_model_and_optimizer(args, device): def take_optimizer_step(args, optimizer, model, overflow_buf, global_step): - if args.allreduce_post_accumulation: # manually allreduce gradients after all accumulation steps # check for Inf/NaN @@ -425,7 +409,6 @@ def take_optimizer_step(args, optimizer, model, overflow_buf, global_step): def main(): - args = parse_arguments() random.seed(args.seed) np.random.seed(args.seed) @@ -439,7 +422,7 @@ def main(): is_model_exported = False if is_main_process(): - print("SEED {}".format(args.seed)) + print(f"SEED 
{args.seed}") if args.do_train: if is_main_process(): @@ -459,7 +442,6 @@ def main(): # Note: We loop infinitely over epochs, termination is handled via iteration count while True: - thread = None if not args.resume_from_checkpoint or epoch > 0 or args.phase2: files = [ os.path.join(args.input_dir, f) @@ -494,7 +476,7 @@ def main(): previous_file = data_file - print("Create pretraining_dataset with file {}...".format(data_file)) + print(f"Create pretraining_dataset with file {data_file}...") train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( @@ -511,7 +493,6 @@ def main(): overflow_buf = torch.cuda.IntTensor([0]) for f_id in range(f_start_id + 1, len(files)): - # torch.cuda.synchronize() # f_start = time.time() if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files: @@ -522,7 +503,7 @@ def main(): else: data_file = files[f_id % num_files] - logger.info("file no %s file %s" % (f_id, previous_file)) + logger.info(f"file no {f_id} file {previous_file}") previous_file = data_file @@ -534,7 +515,7 @@ def main(): # args=(data_file, args.max_predictions_per_seq, shared_file_list, args, n_gpu) # ) # thread.start() - print("Submit new data file {0} for the next iteration...".format(data_file)) + print(f"Submit new data file {data_file} for the next iteration...") dataset_future = pool.submit( create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args ) @@ -543,7 +524,7 @@ def main(): # print('[{}] : shard overhead {}'.format(torch.distributed.get_rank(), f_end - f_start)) train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader - for step, batch in enumerate(train_iter): + for _step, batch in enumerate(train_iter): # torch.cuda.synchronize() # iter_start = time.time() @@ -639,7 +620,7 @@ def main(): average_loss /= torch.distributed.get_world_size() torch.distributed.all_reduce(average_loss) if is_main_process(): - logger.info("Total Steps:{} Final Loss = {}".format(training_steps, average_loss.item())) + logger.info(f"Total Steps:{training_steps} Final Loss = {average_loss.item()}") elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0: if is_main_process(): print( @@ -663,10 +644,10 @@ def main(): model.module if hasattr(model, "module") else model ) # Only save the model it-self if args.resume_step < 0 or not args.phase2: - output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)) + output_save_file = os.path.join(args.output_dir, f"ckpt_{global_step}.pt") else: output_save_file = os.path.join( - args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step) + args.output_dir, f"ckpt_{global_step + args.phase1_end_step}.pt" ) if args.do_train: torch.save( @@ -674,7 +655,7 @@ def main(): "model": model_to_save.state_dict(), "optimizer": optimizer.state_dict(), "master params": list(amp.master_params(optimizer)), - "files": [f_id] + files, + "files": [f_id, *files], }, output_save_file, ) @@ -708,4 +689,4 @@ def main(): now = time.time() args = main() if is_main_process(): - print("Total time taken {}".format(time.time() - now)) + print(f"Total time taken {time.time() - now}") diff --git a/orttraining/tools/scripts/opset12_model_transform.py b/orttraining/tools/scripts/opset12_model_transform.py index c19aceb6216d8..e8c2263a39c32 100644 --- a/orttraining/tools/scripts/opset12_model_transform.py +++ 
b/orttraining/tools/scripts/opset12_model_transform.py @@ -13,11 +13,10 @@ # bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12.onnx import sys -import onnx -from onnx import helper, shape_inference -from onnx import TensorProto + import numpy as np -from onnx import numpy_helper +import onnx +from onnx import TensorProto, helper, numpy_helper, shape_inference # noqa: F401 if len(sys.argv) < 2: print("Please give model path...") @@ -27,6 +26,7 @@ output_model_name = input_model_name[:-5] + "_opset12.onnx" model = onnx.load(input_model_name) + # for a given node input, look thru the graph nodes and find the node # whose output is matching the input def find_input_node(model, arg): diff --git a/orttraining/tools/scripts/performance_investigation.py b/orttraining/tools/scripts/performance_investigation.py index b064b13fa6d34..c8550a4d73c49 100644 --- a/orttraining/tools/scripts/performance_investigation.py +++ b/orttraining/tools/scripts/performance_investigation.py @@ -1,4 +1,5 @@ import argparse + import onnx parser = argparse.ArgumentParser(description="ONNX file analyzer for performance investigation.") diff --git a/orttraining/tools/scripts/pipeline_model_split.py b/orttraining/tools/scripts/pipeline_model_split.py index b95bbe49003ec..d1f98c76ea646 100644 --- a/orttraining/tools/scripts/pipeline_model_split.py +++ b/orttraining/tools/scripts/pipeline_model_split.py @@ -1,9 +1,8 @@ -import sys import os +import sys # noqa: F401 + import onnx -from onnx import helper -from onnx import TensorProto -from onnx import OperatorSetIdProto +from onnx import OperatorSetIdProto, TensorProto, helper # noqa: F401 # Edge that needs to be cut for the split. # If the edge is feeding into more than one nodes, and not all the nodes belong to the same cut, @@ -32,7 +31,7 @@ def split_graph(model, split_edge_groups): new_recv_nodes = [] for cut_index in range(len(split_edge_groups)): - edgeIds = split_edge_groups[cut_index] + edgeIds = split_edge_groups[cut_index] # noqa: N806 # split the graph based on edgeIds upstream_nodes = [] @@ -136,7 +135,7 @@ def split_graph(model, split_edge_groups): for output_node in output_nodes: for i in range(len(output_node.input)): - for edgeId in edgeIds: + for edgeId in edgeIds: # noqa: N806 if output_node.input[i] == edgeId: output_node.input[i] = new_receive_output_name @@ -153,7 +152,7 @@ def find_all_input_nodes(model, node): inputs = [] if node: - for inputId in node.input: + for inputId in node.input: # noqa: N806 nodes.extend([n for n in model.graph.node if inputId in n.output]) inputs.extend([n for n in model.graph.input if inputId in n.name]) return nodes, inputs @@ -163,7 +162,7 @@ def find_all_output_nodes(model, node): nodes = [] outputs = [] if node: - for outputId in node.output: + for outputId in node.output: # noqa: N806 nodes.extend([n for n in model.graph.node if outputId in n.input]) outputs.extend([n for n in model.graph.output if outputId in n.name]) return nodes, outputs @@ -179,7 +178,7 @@ def find_all_output_nodes_by_edge(model, arg): def add_identity(model, cuttingEdge, newEdgeIdName): output_nodes = None - edgeId = cuttingEdge.edgeId + edgeId = cuttingEdge.edgeId # noqa: N806 for node in model.graph.node: if len(node.output) >= 1: for output in node.output: @@ -287,7 +286,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): try: if i in identity_node_index: del main_graph.graph.node[i] - except: + except Exception: print("error deleting identity node", i) all_visited_nodes = [] @@ -301,7 +300,7 @@ 
def generate_subgraph(model, start_nodes, identity_node_list): outputs0 = [] while stack0: node = stack0.pop() - if not node in visited0: + if node not in visited0: tranversed_node += 1 visited0.append(node) all_visited_nodes.append(node) @@ -338,7 +337,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.node[i] else: del main_graph.graph.node[i] - except: + except Exception: print("error deleting node", i) for i in reversed(range(len(main_graph.graph.input))): @@ -347,7 +346,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.input[i] else: del main_graph.graph.input[i] - except: + except Exception: print("error deleting inputs", i) for i in reversed(range(len(main_graph.graph.output))): @@ -356,7 +355,7 @@ def generate_subgraph(model, start_nodes, identity_node_list): del subgraph.graph.output[i] else: del main_graph.graph.output[i] - except: + except Exception: print("error deleting outputs ", i) print("model", str(model_count), " length ", len(subgraph.graph.node)) diff --git a/orttraining/tools/scripts/sqldb_to_tensors.py b/orttraining/tools/scripts/sqldb_to_tensors.py index cf24e0c294450..da2959e506fc0 100644 --- a/orttraining/tools/scripts/sqldb_to_tensors.py +++ b/orttraining/tools/scripts/sqldb_to_tensors.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import sqlite3 + import onnx from onnx import numpy_helper @@ -16,7 +17,7 @@ def convert_tensor_proto_to_numpy_array(blob): sqlite3.register_converter("TensorProto", convert_tensor_proto_to_numpy_array) -for step, name, value, device, producer, consumers in connection.execute( +for step, name, value, _device, _producer, consumers in connection.execute( "Select Step, Name, Value, DeviceType, TracedProducer, TracedConsumers from Tensors" ): print(step, name, value.shape, consumers) diff --git a/orttraining/tools/scripts/watch_experiment.py b/orttraining/tools/scripts/watch_experiment.py index 33bb73f8dc9b9..aefa1f57cfc16 100644 --- a/orttraining/tools/scripts/watch_experiment.py +++ b/orttraining/tools/scripts/watch_experiment.py @@ -1,13 +1,12 @@ import argparse -import sys import os - +import sys from concurrent.futures import ThreadPoolExecutor -from requests import Session -from threading import Event, Thread +from threading import Event, Thread # noqa: F401 -from azureml.core import Workspace, Experiment, Run from azureml._run_impl.run_watcher import RunWatcher +from azureml.core import Experiment, Run, Workspace # noqa: F401 +from requests import Session parser = argparse.ArgumentParser() parser.add_argument( @@ -41,7 +40,7 @@ runs = [r for r in experiment.get_runs()] if len(runs) == 0: - print("No runs found in Experiment '{}'".format(args.experiment)) + print(f"No runs found in Experiment '{args.experiment}'") sys.exit() run = runs[0] @@ -49,7 +48,7 @@ try: run = next(r for r in runs if r.id == args.run) except StopIteration: - print("Run id '{}' not found in Experiment '{}'".format(args.run, args.experiment)) + print(f"Run id '{args.run}' not found in Experiment '{args.experiment}'") sys.exit() # Optionally start synchronizing files from Run diff --git a/pyproject.toml b/pyproject.toml index 6bec8bf94350a..8844bd4a516b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,11 @@ [tool.black] line-length = 120 -# extend-exclude needs to be a regular expression +# NOTE: Do not extend the exclude list. 
Edit .lintrunner.toml instead extend-exclude = "cmake|onnxruntime/core/flatbuffers/" -target-version = ["py37", "py38", "py39", "py310"] +target-version = ["py37", "py38", "py39", "py310", "py311"] [tool.isort] +# NOTE: Do not extend the exclude list. Edit .lintrunner.toml instead profile = "black" line_length = 120 extend_skip_glob = [ @@ -39,3 +40,40 @@ disable = [ [tool.pyright] exclude = ["onnxruntime/core/flatbuffers/*"] reportMissingImports = false + +[tool.ruff] +# NOTE: Do not create an exclude list. Edit .lintrunner.toml instead +target-version = "py37" +select = [ + "E", # pycodestyle + "F", # Pyflakes + "W", # pycodestyle + "B", # flake8-bugbear + "N", # pep8-naming + "YTT", # flake8-2020 + "RUF", # Ruff-specific rules + "SIM", # flake8-simplify + "UP", # pyupgrade +] +# NOTE: Refrain from growing the ignore list unless for exceptional cases. +# Always include a comment to explain why. +ignore = [ + "E501", # Line length controlled by black + "N803", # Argument casing + "N812", # Allow import torch.nn.functional as F + "N999", # Module names + "SIM102", # We don't perfer always combining if branches + "SIM108", # We don't encourage ternary operators + "SIM114", # Don't combine if branches for debugability + "SIM116", # Don't use dict lookup to replace if-else +] +ignore-init-module-imports = true +unfixable = [ + "F401", # Unused imports +] + +[tool.ruff.per-file-ignores] +# NOTE: Refrain from growing the ignore list unless for exceptional cases. +# Prefer inline ignores with `noqa: xxx`. +# Eventually this list should become empty. +"orttraining/orttraining/test/**" = ["N802"] # Function casing diff --git a/samples/python/training/orttrainer/mnist/ort_mnist.py b/samples/python/training/orttrainer/mnist/ort_mnist.py index 5b28f90a025ff..ef18daca2ac03 100644 --- a/samples/python/training/orttrainer/mnist/ort_mnist.py +++ b/samples/python/training/orttrainer/mnist/ort_mnist.py @@ -16,7 +16,7 @@ # Pytorch model class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) diff --git a/samples/python/training/orttrainer/mnist/pytorch_mnist.py b/samples/python/training/orttrainer/mnist/pytorch_mnist.py index 0f62b56d35221..2ce3cfd971679 100644 --- a/samples/python/training/orttrainer/mnist/pytorch_mnist.py +++ b/samples/python/training/orttrainer/mnist/pytorch_mnist.py @@ -11,7 +11,7 @@ # Pytorch model class NeuralNet(nn.Module): def __init__(self, input_size, hidden_size, num_classes): - super(NeuralNet, self).__init__() + super().__init__() self.fc1 = nn.Linear(input_size, hidden_size) self.relu = nn.ReLU() self.fc2 = nn.Linear(hidden_size, num_classes) diff --git a/samples/python/training/orttrainer/pytorch_transformer/ort_train.py b/samples/python/training/orttrainer/pytorch_transformer/ort_train.py index fdca0f20a9385..551e878cc9035 100644 --- a/samples/python/training/orttrainer/pytorch_transformer/ort_train.py +++ b/samples/python/training/orttrainer/pytorch_transformer/ort_train.py @@ -79,11 +79,11 @@ def evaluate(trainer, data_source, bptt=35): train(trainer, train_data, device, epoch, args) val_loss = evaluate(trainer, val_data) print("-" * 89) - print("| end of epoch {:3d} | valid loss {:5.2f} | ".format(epoch, val_loss)) + print(f"| end of epoch {epoch:3d} | valid loss {val_loss:5.2f} | ") print("-" * 89) # Evaluate test_loss = evaluate(trainer, test_data) print("=" * 89) - 
print("| End of training | test loss {:5.2f}".format(test_loss)) + print(f"| End of training | test loss {test_loss:5.2f}") print("=" * 89) diff --git a/samples/python/training/orttrainer/pytorch_transformer/pt_model.py b/samples/python/training/orttrainer/pytorch_transformer/pt_model.py index e125124d14718..07752f52d7a84 100644 --- a/samples/python/training/orttrainer/pytorch_transformer/pt_model.py +++ b/samples/python/training/orttrainer/pytorch_transformer/pt_model.py @@ -6,7 +6,7 @@ class TransformerModel(nn.Module): def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): - super(TransformerModel, self).__init__() + super().__init__() from torch.nn import TransformerEncoder, TransformerEncoderLayer self.model_type = "Transformer" @@ -46,7 +46,7 @@ def forward(self, input1): class PositionalEncoding(nn.Module): def __init__(self, d_model, dropout=0.1, max_len=5000): - super(PositionalEncoding, self).__init__() + super().__init__() self.dropout = nn.Dropout(p=dropout) pe = torch.zeros(max_len, d_model) diff --git a/samples/python/training/orttrainer/pytorch_transformer/pt_train.py b/samples/python/training/orttrainer/pytorch_transformer/pt_train.py index 6dc6032a4acef..a197fb50357e9 100644 --- a/samples/python/training/orttrainer/pytorch_transformer/pt_train.py +++ b/samples/python/training/orttrainer/pytorch_transformer/pt_train.py @@ -84,11 +84,11 @@ def evaluate(model, data_source, criterion, bptt=35): train(model, train_data, device, epoch, args) val_loss = evaluate(model, val_data, criterion) print("-" * 89) - print("| end of epoch {:3d} | valid loss {:5.2f} | ".format(epoch, val_loss)) + print(f"| end of epoch {epoch:3d} | valid loss {val_loss:5.2f} | ") print("-" * 89) # Evaluate test_loss = evaluate(model, test_data, criterion) print("=" * 89) - print("| End of training | test loss {:5.2f}".format(test_loss)) + print(f"| End of training | test loss {test_loss:5.2f}") print("=" * 89) diff --git a/samples/python/training/orttrainer/pytorch_transformer/utils.py b/samples/python/training/orttrainer/pytorch_transformer/utils.py index 489c0b5acf350..3be8b6cf3f420 100644 --- a/samples/python/training/orttrainer/pytorch_transformer/utils.py +++ b/samples/python/training/orttrainer/pytorch_transformer/utils.py @@ -1,4 +1,3 @@ -import io import os import torch @@ -41,15 +40,15 @@ def prepare_data(device="cpu", train_batch_size=20, eval_batch_size=20, data_dir download_from_url(url, root=download_path), to_path=extract_path ) tokenizer = get_tokenizer("basic_english") - vocab = build_vocab_from_iterator(map(tokenizer, iter(io.open(train_filepath, encoding="utf8")))) + vocab = build_vocab_from_iterator(map(tokenizer, iter(open(train_filepath, encoding="utf8")))) # noqa: SIM115 def data_process(raw_text_iter): data = [torch.tensor([vocab[token] for token in tokenizer(item)], dtype=torch.long) for item in raw_text_iter] return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) - train_data = data_process(iter(io.open(train_filepath, encoding="utf8"))) - val_data = data_process(iter(io.open(valid_filepath, encoding="utf8"))) - test_data = data_process(iter(io.open(test_filepath, encoding="utf8"))) + train_data = data_process(iter(open(train_filepath, encoding="utf8"))) # noqa: SIM115 + val_data = data_process(iter(open(valid_filepath, encoding="utf8"))) # noqa: SIM115 + test_data = data_process(iter(open(test_filepath, encoding="utf8"))) # noqa: SIM115 device = torch.device(device) diff --git a/setup.py b/setup.py index 7e384b22d618e..9bbc759402669 100644 --- a/setup.py +++ 
b/setup.py @@ -113,7 +113,7 @@ def parse_arg_remove_string(argv, arg_name_equal): is_manylinux = environ.get("AUDITWHEEL_PLAT", None) in manylinux_tags -class build_ext(_build_ext): +class build_ext(_build_ext): # noqa: N801 def build_extension(self, ext): dest_file = self.get_ext_fullpath(ext.name) logger.info("copying %s -> %s", ext.sources[0], dest_file) @@ -123,7 +123,7 @@ def build_extension(self, ext): try: from wheel.bdist_wheel import bdist_wheel as _bdist_wheel - class bdist_wheel(_bdist_wheel): + class bdist_wheel(_bdist_wheel): # noqa: N801 """Helper functions to create wheel package""" if is_openvino and is_manylinux: @@ -138,7 +138,7 @@ def get_tag(self): if glibc_major == "2" and glibc_minor == "17": plat = "manylinux_2_17_x86_64.manylinux2014_x86_64" else: # For manylinux2014 and above, no alias is required - plat = "manylinux_%s_%s_x86_64" % (glibc_major, glibc_minor) + plat = f"manylinux_{glibc_major}_{glibc_minor}_x86_64" tags = next(sys_tags()) return (tags.interpreter, tags.abi, plat) @@ -198,7 +198,7 @@ def run(self): logger.info("copying %s -> %s", source, dest) copyfile(source, dest) result = subprocess.run( - ["patchelf", "--print-needed", dest], check=True, stdout=subprocess.PIPE, universal_newlines=True + ["patchelf", "--print-needed", dest], check=True, stdout=subprocess.PIPE, text=True ) dependencies = [ "librccl.so", @@ -229,7 +229,7 @@ def run(self): ["patchelf", "--print-needed", dest], check=True, stdout=subprocess.PIPE, - universal_newlines=True, + text=True, ) cuda_dependencies = [ "libcublas.so", @@ -266,7 +266,7 @@ def run(self): ["patchelf", "--print-needed", dest], check=True, stdout=subprocess.PIPE, - universal_newlines=True, + text=True, ) tensorrt_dependencies = ["libnvinfer.so", "libnvinfer_plugin.so", "libnvonnxparser.so"] args = ["patchelf", "--debug"] @@ -286,7 +286,7 @@ def run(self): ["patchelf", "--print-needed", dest], check=True, stdout=subprocess.PIPE, - universal_newlines=True, + text=True, ) cann_dependencies = ["libascendcl.so", "libacl_op_compiler.so", "libfmk_onnx_parser.so"] args = ["patchelf", "--debug"] @@ -306,7 +306,7 @@ def run(self): ["patchelf", "--set-rpath", "$ORIGIN", dest, "--force-rpath"], check=True, stdout=subprocess.PIPE, - universal_newlines=True, + text=True, ) self._rewrite_ld_preload(to_preload) @@ -315,7 +315,7 @@ def run(self): self._rewrite_ld_preload(to_preload_cann) else: - if "onnxruntime-azure" == package_name: + if package_name == "onnxruntime-azure": self._rewrite_ld_preload_azure() _bdist_wheel.run(self) @@ -417,7 +417,7 @@ def finalize_options(self): ["patchelf", "--set-rpath", "$ORIGIN", y, "--force-rpath"], check=True, stdout=subprocess.PIPE, - universal_newlines=True, + text=True, ) dl_libs.append(x) dl_libs.append(providers_openvino) @@ -451,7 +451,7 @@ def finalize_options(self): if not path.exists(README): raise FileNotFoundError("Unable to find 'README.rst'") -with open(README, "r", encoding="utf-8") as fdesc: +with open(README, encoding="utf-8") as fdesc: long_description = fdesc.read() # Include files in onnxruntime/external if --enable_external_custom_op_schemas build.sh command @@ -634,18 +634,18 @@ def check_date_format(date_str): try: datetime.datetime.strptime(date_str, "%Y%m%d") return True - except: # noqa + except Exception: return False def reformat_run_count(count_str): try: count = int(count_str) if count >= 0 and count < 1000: - return "{:03}".format(count) + return f"{count:03}" elif count >= 1000: raise RuntimeError(f"Too many builds for the same day: {count}") return "" - except: 
# noqa + except Exception: return "" build_suffix_is_date_format = check_date_format(build_suffix[:8]) @@ -682,7 +682,7 @@ def reformat_run_count(count_str): if wheel_name_suffix: if not (enable_training and wheel_name_suffix == "gpu"): # for training packages, local version is used to indicate device types - package_name = "{}-{}".format(package_name, wheel_name_suffix) + package_name = f"{package_name}-{wheel_name_suffix}" cmd_classes = {} if bdist_wheel is not None: @@ -708,16 +708,16 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm version_path = path.join("onnxruntime", "capi", "build_and_package_info.py") with open(version_path, "w") as f: - f.write("package_name = '{}'\n".format(package_name)) - f.write("__version__ = '{}'\n".format(version_number)) + f.write(f"package_name = '{package_name}'\n") + f.write(f"__version__ = '{version_number}'\n") if cuda_version: - f.write("cuda_version = '{}'\n".format(cuda_version)) + f.write(f"cuda_version = '{cuda_version}'\n") # cudart_versions are integers cudart_versions = find_cudart_versions(build_env=True) if cudart_versions and len(cudart_versions) == 1: - f.write("cudart_version = {}\n".format(cudart_versions[0])) + f.write(f"cudart_version = {cudart_versions[0]}\n") else: print( "Error getting cudart version. ", @@ -726,7 +726,7 @@ def save_build_and_package_info(package_name, version_number, cuda_version, rocm else "found multiple cudart libraries", ) elif rocm_version: - f.write("rocm_version = '{}'\n".format(rocm_version)) + f.write(f"rocm_version = '{rocm_version}'\n") save_build_and_package_info(package_name, version_number, cuda_version, rocm_version) diff --git a/tools/android_custom_build/build_custom_android_package.py b/tools/android_custom_build/build_custom_android_package.py index 20ce4d0d40f32..475f6ecc5d2d0 100755 --- a/tools/android_custom_build/build_custom_android_package.py +++ b/tools/android_custom_build/build_custom_android_package.py @@ -105,18 +105,15 @@ def main(): if args.onnxruntime_repo_url: docker_build_image_args += ["--build-arg", f"ONNXRUNTIME_REPO={args.onnxruntime_repo_url}"] - docker_build_image_cmd = ( - [ - args.docker_path, - "build", - "--tag", - args.docker_image_tag, - "--file", - str(SCRIPT_DIR / "Dockerfile"), - ] - + docker_build_image_args - + [str(SCRIPT_DIR)] - ) + docker_build_image_cmd = [ + args.docker_path, + "build", + "--tag", + args.docker_image_tag, + "--file", + str(SCRIPT_DIR / "Dockerfile"), + *docker_build_image_args, + ] + [str(SCRIPT_DIR)] run(docker_build_image_cmd) @@ -150,24 +147,17 @@ def main(): # enable use of Ctrl-C to stop when running interactively docker_run_interactive_args = ["-it"] if sys.stdin.isatty() else [] - docker_container_build_cmd = ( - [ - args.docker_path, - "run", - ] - + docker_run_interactive_args - + [ - f"--name={args.docker_container_name}" if args.docker_container_name is not None else "--rm", - f"--volume={working_dir}:/workspace/shared", - args.docker_image_tag, - "/bin/bash", - "/workspace/scripts/build.sh", - args.config, - container_ops_config_file, - container_build_settings_file, - "/workspace/shared/output", - ] - ) + docker_container_build_cmd = [args.docker_path, "run", *docker_run_interactive_args] + [ + f"--name={args.docker_container_name}" if args.docker_container_name is not None else "--rm", + f"--volume={working_dir}:/workspace/shared", + args.docker_image_tag, + "/bin/bash", + "/workspace/scripts/build.sh", + args.config, + container_ops_config_file, + container_build_settings_file, + 
"/workspace/shared/output", + ] run(docker_container_build_cmd) diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py index 08b258406ffd3..e0293128045a5 100644 --- a/tools/ci_build/amd_hipify.py +++ b/tools/ci_build/amd_hipify.py @@ -11,9 +11,7 @@ def hipify(hipify_perl_path, src_file_path, dst_file_path): if not os.path.exists(dir_name): os.makedirs(dir_name, exist_ok=True) # Run hipify-perl first, capture output - s = subprocess.run( - [hipify_perl_path, "-roc", src_file_path], stdout=subprocess.PIPE, universal_newlines=True, check=False - ).stdout + s = subprocess.run([hipify_perl_path, "-roc", src_file_path], stdout=subprocess.PIPE, text=True, check=False).stdout # Additional exact-match replacements. # Order matters for all of the following replacements, reglardless of appearing in logical sections. diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index e9109523950f3..ba5fb6b3ae125 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -56,9 +56,9 @@ def _check_python_version(): # According to the BUILD.md, python 3.5+ is required: # Python 2 is definitely not supported and it should be safer to consider # it won't run with python 4: - if sys.version_info[0] != 3: + if sys.version_info[0] != 3: # noqa: YTT201 raise BuildError("Bad python major version: expecting python 3, found version " "'{}'".format(sys.version)) - if sys.version_info[1] < 6: + if sys.version_info[1] < 6: # noqa: YTT203 raise BuildError("Bad python minor version: expecting python 3.6+, found version " "'{}'".format(sys.version)) @@ -719,10 +719,10 @@ def resolve_executable_path(command_or_path): def get_linux_distro(): try: - with open("/etc/os-release", "r") as f: + with open("/etc/os-release") as f: dist_info = dict(line.strip().split("=", 1) for line in f.readlines()) return dist_info.get("NAME", "").strip('"'), dist_info.get("VERSION", "").strip('"') - except (IOError, ValueError): + except (OSError, ValueError): return "", "" @@ -737,8 +737,17 @@ def get_config_build_dir(build_dir, config): def run_subprocess( - args, cwd=None, capture_stdout=False, dll_path=None, shell=False, env={}, python_path=None, cuda_home=None + args, + cwd=None, + capture_stdout=False, + dll_path=None, + shell=False, + env=None, + python_path=None, + cuda_home=None, ): + if env is None: + env = {} if isinstance(args, str): raise ValueError("args should be a sequence of strings, not a string") @@ -776,16 +785,20 @@ def update_submodules(source_dir): def is_docker(): path = "/proc/self/cgroup" - return os.path.exists("/.dockerenv") or os.path.isfile(path) and any("docker" in line for line in open(path)) + return ( + os.path.exists("/.dockerenv") + or os.path.isfile(path) + and any("docker" in line for line in open(path)) # noqa: SIM115 + ) def install_python_deps(numpy_version=""): dep_packages = ["setuptools", "wheel", "pytest"] - dep_packages.append("numpy=={}".format(numpy_version) if numpy_version else "numpy>=1.16.6") + dep_packages.append(f"numpy=={numpy_version}" if numpy_version else "numpy>=1.16.6") dep_packages.append("sympy>=1.10") dep_packages.append("packaging") dep_packages.append("cerberus") - run_subprocess([sys.executable, "-m", "pip", "install"] + dep_packages) + run_subprocess([sys.executable, "-m", "pip", "install", *dep_packages]) def setup_test_data(source_onnx_model_dir, dest_model_dir_name, build_dir, configs): @@ -795,17 +808,17 @@ def setup_test_data(source_onnx_model_dir, dest_model_dir_name, build_dir, confi if is_windows(): src_model_dir = os.path.join(build_dir, 
dest_model_dir_name) if os.path.exists(source_onnx_model_dir) and not os.path.exists(src_model_dir): - log.debug("creating shortcut %s -> %s" % (source_onnx_model_dir, src_model_dir)) + log.debug(f"creating shortcut {source_onnx_model_dir} -> {src_model_dir}") run_subprocess(["mklink", "/D", "/J", src_model_dir, source_onnx_model_dir], shell=True) for config in configs: config_build_dir = get_config_build_dir(build_dir, config) os.makedirs(config_build_dir, exist_ok=True) dest_model_dir = os.path.join(config_build_dir, dest_model_dir_name) if os.path.exists(source_onnx_model_dir) and not os.path.exists(dest_model_dir): - log.debug("creating shortcut %s -> %s" % (source_onnx_model_dir, dest_model_dir)) + log.debug(f"creating shortcut {source_onnx_model_dir} -> {dest_model_dir}") run_subprocess(["mklink", "/D", "/J", dest_model_dir, source_onnx_model_dir], shell=True) elif os.path.exists(src_model_dir) and not os.path.exists(dest_model_dir): - log.debug("creating shortcut %s -> %s" % (src_model_dir, dest_model_dir)) + log.debug(f"creating shortcut {src_model_dir} -> {dest_model_dir}") run_subprocess(["mklink", "/D", "/J", dest_model_dir, src_model_dir], shell=True) else: src_model_dir = os.path.join(build_dir, dest_model_dir_name) @@ -823,8 +836,8 @@ def use_dev_mode(args): return False if args.ios and is_macOS(): return False - SYSTEM_COLLECTIONURI = os.getenv("SYSTEM_COLLECTIONURI") - if SYSTEM_COLLECTIONURI and not SYSTEM_COLLECTIONURI == "https://dev.azure.com/onnxruntime/": + SYSTEM_COLLECTIONURI = os.getenv("SYSTEM_COLLECTIONURI") # noqa: N806 + if SYSTEM_COLLECTIONURI and SYSTEM_COLLECTIONURI != "https://dev.azure.com/onnxruntime/": return False return True @@ -1298,7 +1311,7 @@ def generate_build_tree( if args.use_azure: add_default_definition(cmake_extra_defines, "onnxruntime_USE_AZURE", "ON") - cmake_args += ["-D{}".format(define) for define in cmake_extra_defines] + cmake_args += [f"-D{define}" for define in cmake_extra_defines] cmake_args += cmake_extra_args @@ -1311,9 +1324,9 @@ def generate_build_tree( if build_number and source_version: build_matches = re.fullmatch(r"(\d\d)(\d\d)(\d\d)(\d\d)\.(\d+)", build_number) if build_matches: - YY = build_matches.group(2) - MM = build_matches.group(3) - DD = build_matches.group(4) + YY = build_matches.group(2) # noqa: N806 + MM = build_matches.group(3) # noqa: N806 + DD = build_matches.group(4) # noqa: N806 # Get ORT major and minor number with open(os.path.join(source_dir, "VERSION_NUMBER")) as f: @@ -1331,11 +1344,11 @@ def generate_build_tree( # PrivatePart = 123 # String = 191101-2300.1.master.0bce7ae cmake_args += [ - "-DVERSION_MAJOR_PART={}".format(ort_major), - "-DVERSION_MINOR_PART={}".format(ort_minor), - "-DVERSION_BUILD_PART={}".format(YY), - "-DVERSION_PRIVATE_PART={}{}".format(MM, DD), - "-DVERSION_STRING={}.{}.{}.{}".format(ort_major, ort_minor, build_number, source_version[0:7]), + f"-DVERSION_MAJOR_PART={ort_major}", + f"-DVERSION_MINOR_PART={ort_minor}", + f"-DVERSION_BUILD_PART={YY}", + f"-DVERSION_PRIVATE_PART={MM}{DD}", + f"-DVERSION_STRING={ort_major}.{ort_minor}.{build_number}.{source_version[0:7]}", ] for config in configs: @@ -1353,8 +1366,8 @@ def generate_build_tree( ) preinstalled_dir = Path(build_dir) / config run_subprocess( - cmake_args - + [ + [ + *cmake_args, "-Donnxruntime_ENABLE_MEMLEAK_CHECKER=" + ( "ON" @@ -1366,8 +1379,8 @@ def generate_build_tree( and not args.disable_memleak_checker else "OFF" ), - "-DCMAKE_BUILD_TYPE={}".format(config), - "-DCMAKE_PREFIX_PATH={}/{}/installed".format(build_dir, 
config) + f"-DCMAKE_BUILD_TYPE={config}", + f"-DCMAKE_PREFIX_PATH={build_dir}/{config}/installed" if preinstalled_dir.exists() and not (args.arm64 or args.arm64ec or args.arm) else "", ], @@ -1406,7 +1419,7 @@ def build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, targe # CMake will generate correct build tool args for Xcode cmd_args += ["--parallel", str(num_parallel_jobs)] else: - build_tool_args += ["-j{}".format(num_parallel_jobs)] + build_tool_args += [f"-j{num_parallel_jobs}"] if build_tool_args: cmd_args += ["--"] @@ -1458,7 +1471,7 @@ def setup_cann_vars(args): if not cann_home_valid: raise BuildError( "cann_home paths must be specified and valid.", - "cann_home='{}' valid={}.".format(cann_home, cann_home_valid), + f"cann_home='{cann_home}' valid={cann_home_valid}.", ) return cann_home @@ -1472,7 +1485,7 @@ def setup_tensorrt_vars(args): if not tensorrt_home_valid: raise BuildError( "tensorrt_home paths must be specified and valid.", - "tensorrt_home='{}' valid={}.".format(tensorrt_home, tensorrt_home_valid), + f"tensorrt_home='{tensorrt_home}' valid={tensorrt_home_valid}.", ) # Set maximum workspace size in byte for @@ -1494,11 +1507,10 @@ def setup_tensorrt_vars(args): def setup_migraphx_vars(args): - migraphx_home = None if args.use_migraphx: - print("migraphx_home = {}".format(args.migraphx_home)) + print(f"migraphx_home = {args.migraphx_home}") migraphx_home = args.migraphx_home or os.getenv("MIGRAPHX_HOME") or None migraphx_home_not_valid = migraphx_home and not os.path.exists(migraphx_home) @@ -1506,7 +1518,7 @@ def setup_migraphx_vars(args): if migraphx_home_not_valid: raise BuildError( "migraphx_home paths must be specified and valid.", - "migraphx_home='{}' valid={}.".format(migraphx_home, migraphx_home_not_valid), + f"migraphx_home='{migraphx_home}' valid={migraphx_home_not_valid}.", ) return migraphx_home or "" @@ -1519,9 +1531,7 @@ def setup_dml_build(args, cmake_path, build_dir, configs): for expected_file in ["bin/DirectML.dll", "lib/DirectML.lib", "include/DirectML.h"]: file_path = os.path.join(args.dml_path, expected_file) if not os.path.exists(file_path): - raise BuildError( - "dml_path is invalid.", "dml_path='{}' expected_file='{}'.".format(args.dml_path, file_path) - ) + raise BuildError("dml_path is invalid.", f"dml_path='{args.dml_path}' expected_file='{file_path}'.") elif not args.dml_external_project: for config in configs: # Run the RESTORE_PACKAGES target to perform the initial @@ -1541,13 +1551,13 @@ def setup_dml_build(args, cmake_path, build_dir, configs): def setup_rocm_build(args): rocm_home = None if args.use_rocm: - print("rocm_home = {}".format(args.rocm_home)) + print(f"rocm_home = {args.rocm_home}") rocm_home = args.rocm_home or None rocm_home_not_valid = rocm_home and not os.path.exists(rocm_home) if rocm_home_not_valid: raise BuildError( "rocm_home paths must be specified and valid.", - "rocm_home='{}' valid={}.".format(rocm_home, rocm_home_not_valid), + f"rocm_home='{rocm_home}' valid={rocm_home_not_valid}.", ) return rocm_home or "" @@ -1574,13 +1584,13 @@ def run_adb_shell(cmd): "cd {0} && GCOV_PREFIX={0} GCOV_PREFIX_STRIP={1} {2}".format(device_dir, cwd.count(os.sep) + 1, cmd) ) else: - adb_shell("cd {} && {}".format(device_dir, cmd)) + adb_shell(f"cd {device_dir} && {cmd}") if args.android_abi == "x86_64": with contextlib.ExitStack() as context_stack: if args.android_run_emulator: avd_name = "ort_android" - system_image = "system-images;android-{};default;{}".format(args.android_api, args.android_abi) + system_image = 
f"system-images;android-{args.android_api};default;{args.android_abi}" android.create_virtual_device(sdk_tool_paths, system_image, avd_name) emulator_proc = context_stack.enter_context( @@ -1597,10 +1607,10 @@ def run_adb_shell(cmd): os.path.join(source_dir, "cmake", "external", "onnx", "onnx", "backend", "test"), device_dir, cwd=cwd ) adb_push("onnxruntime_test_all", device_dir, cwd=cwd) - adb_shell("chmod +x {}/onnxruntime_test_all".format(device_dir)) + adb_shell(f"chmod +x {device_dir}/onnxruntime_test_all") adb_push("onnx_test_runner", device_dir, cwd=cwd) - adb_shell("chmod +x {}/onnx_test_runner".format(device_dir)) - run_adb_shell("{0}/onnxruntime_test_all".format(device_dir)) + adb_shell(f"chmod +x {device_dir}/onnx_test_runner") + run_adb_shell(f"{device_dir}/onnxruntime_test_all") if args.build_java: # use the gradle wrapper under /java @@ -1610,7 +1620,7 @@ def run_adb_shell(cmd): [ gradle_executable, "--no-daemon", - "-DminSdkVer={}".format(args.android_api), + f"-DminSdkVer={args.android_api}", "clean", "connectedDebugAndroidTest", ], @@ -1629,8 +1639,8 @@ def run_adb_shell(cmd): adb_push("libcustom_op_library.so", device_dir, cwd=cwd) adb_push("libcustom_op_get_const_input_test_library.so", device_dir, cwd=cwd) adb_push("onnxruntime_customopregistration_test", device_dir, cwd=cwd) - adb_shell("chmod +x {}/onnxruntime_shared_lib_test".format(device_dir)) - adb_shell("chmod +x {}/onnxruntime_customopregistration_test".format(device_dir)) + adb_shell(f"chmod +x {device_dir}/onnxruntime_shared_lib_test") + adb_shell(f"chmod +x {device_dir}/onnxruntime_customopregistration_test") run_adb_shell("LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{0} {0}/onnxruntime_shared_lib_test".format(device_dir)) run_adb_shell( "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:{0} {0}/onnxruntime_customopregistration_test".format(device_dir) @@ -1750,13 +1760,13 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): [ "vstest.console.exe", "--parallel", - "--TestAdapterPath:..\\googletestadapter.0.17.1\\build\\_common", # noqa + "--TestAdapterPath:..\\googletestadapter.0.17.1\\build\\_common", "/Logger:trx", "/Enablecodecoverage", "/Platform:x64", "/Settings:%s" % os.path.join(source_dir, "cmake\\codeconv.runsettings"), - ] - + executables, + *executables, + ], cwd=cwd2, dll_path=dll_path, ) @@ -1841,7 +1851,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): ) try: - import onnx # noqa + import onnx # noqa: F401 onnx_test = True except ImportError as error: @@ -1901,8 +1911,8 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): if not args.skip_keras_test: try: - import keras # noqa - import onnxmltools # noqa + import keras # noqa: F401 + import onnxmltools # noqa: F401 onnxml_test = True except ImportError: @@ -1926,7 +1936,7 @@ def tvm_run_python_tests(build_dir, configs): def run_nodejs_tests(nodejs_binding_dir): args = ["npm", "test", "--", "--timeout=90000"] if is_windows(): - args = ["cmd", "/c"] + args + args = ["cmd", "/c", *args] run_subprocess(args, cwd=nodejs_binding_dir) @@ -1971,7 +1981,7 @@ def build_python_wheel( if default_training_package_device: args.append("--default_training_package_device") if wheel_name_suffix: - args.append("--wheel_name_suffix={}".format(wheel_name_suffix)) + args.append(f"--wheel_name_suffix={wheel_name_suffix}") if enable_training: args.append("--enable_training") if enable_training_apis: @@ -1986,11 +1996,11 @@ def build_python_wheel( # The following line assumes no other EP is enabled 
args.append("--wheel_name_suffix=gpu") if cuda_version: - args.append("--cuda_version={}".format(cuda_version)) + args.append(f"--cuda_version={cuda_version}") elif use_rocm: args.append("--use_rocm") if rocm_version: - args.append("--rocm_version={}".format(rocm_version)) + args.append(f"--rocm_version={rocm_version}") elif use_openvino: args.append("--use_openvino") elif use_dnnl: @@ -2246,7 +2256,7 @@ def build_protoc_for_host(cmake_path, source_dir, build_dir, args): expected_protoc_path = os.path.join(protoc_build_dir, config_dir, "protoc" + suffix) if not os.path.exists(expected_protoc_path): - raise BuildError("Couldn't find {}. Host build of protoc failed.".format(expected_protoc_path)) + raise BuildError(f"Couldn't find {expected_protoc_path}. Host build of protoc failed.") return expected_protoc_path @@ -2308,7 +2318,7 @@ def diff_file(path, regenerate_qualifiers=""): raise BuildError("Generated documents have diffs. Check build output for details.") except subprocess.CalledProcessError: - raise BuildError("git diff returned non-zero error code") + raise BuildError("git diff returned non-zero error code") # noqa: B904 def main(): @@ -2658,7 +2668,7 @@ def gen_ort_ops(): if args.build: if args.parallel < 0: - raise BuildError("Invalid parallel job count: {}".format(args.parallel)) + raise BuildError(f"Invalid parallel job count: {args.parallel}") num_parallel_jobs = os.cpu_count() if args.parallel == 0 else args.parallel build_targets(args, cmake_path, build_dir, configs, num_parallel_jobs, args.target) diff --git a/tools/ci_build/clean_docker_image_cache.py b/tools/ci_build/clean_docker_image_cache.py index 8ad3bb11c8c46..f9b41ce31f92a 100755 --- a/tools/ci_build/clean_docker_image_cache.py +++ b/tools/ci_build/clean_docker_image_cache.py @@ -87,7 +87,7 @@ def download_logs(storage_account, container, log_path_pattern, target_dir, az_p def get_image_name(image_info): - return "{}@{}".format(image_info.repository, image_info.digest) + return f"{image_info.repository}@{image_info.digest}" timestamp_pattern = re.compile( @@ -146,8 +146,8 @@ def get_valid_images_from_logs(log_paths, min_datetime, min_access_count): image_counts = dict() # dict of {ImageInfo -> count} for log_path in log_paths: - log.debug("Processing log file: {}".format(log_path)) - with open(log_path, mode="r") as log_file: + log.debug(f"Processing log file: {log_path}") + with open(log_path) as log_file: for line in log_file: image_info = parse_log_line(line, min_datetime) if image_info is not None: diff --git a/tools/ci_build/gen_def.py b/tools/ci_build/gen_def.py index ce970dbbe1d1d..c18a59ae2ed2e 100755 --- a/tools/ci_build/gen_def.py +++ b/tools/ci_build/gen_def.py @@ -16,7 +16,7 @@ def parse_arguments(): args = parse_arguments() print("Generating symbol file for %s" % str(args.config)) -with open(args.version_file, "r") as f: +with open(args.version_file) as f: VERSION_STRING = f.read().strip() print("VERSION:%s" % VERSION_STRING) @@ -24,7 +24,7 @@ def parse_arguments(): symbols = set() for c in args.config: file_name = os.path.join(args.src_root, "core", "providers", c, "symbols.txt") - with open(file_name, "r") as file: + with open(file_name) as file: for line in file: line = line.strip() if line in symbols: @@ -68,10 +68,10 @@ def parse_arguments(): # external symbols are removed, xnnpack ep will be created via the standard ORT API. 
# https://github.com/microsoft/onnxruntime/pull/11798 if c not in ("winml", "cuda", "migraphx", "qnn", "snpe", "xnnpack", "cann", "dnnl"): - file.write("#include \n" % (c, c)) + file.write(f"#include \n") file.write("void* GetFunctionEntryByName(const char* name){\n") for symbol in symbols: if symbol != "OrtGetWinMLAdapter": - file.write('if(strcmp(name,"%s") ==0) return (void*)&%s;\n' % (symbol, symbol)) + file.write(f'if(strcmp(name,"{symbol}") ==0) return (void*)&{symbol};\n') file.write("return NULL;\n") file.write("}\n") diff --git a/tools/ci_build/get_docker_image.py b/tools/ci_build/get_docker_image.py index 3639e58a1dccd..087aa09483fe4 100755 --- a/tools/ci_build/get_docker_image.py +++ b/tools/ci_build/get_docker_image.py @@ -68,12 +68,12 @@ def main(): log.info("No container registry will be used") full_image_name = ( - "{}.azurecr.io/{}:latest".format(args.container_registry, args.repository) + f"{args.container_registry}.azurecr.io/{args.repository}:latest" if use_container_registry - else "{}:latest".format(args.repository) + else f"{args.repository}:latest" ) - log.info("Image: {}".format(full_image_name)) + log.info(f"Image: {full_image_name}") dst_deps_file = Path(args.context) / "scripts" / "deps.txt" # The docker file may provide a special deps.txt in its docker context dir and uses that one. @@ -86,7 +86,7 @@ def main(): manylinux_build_scripts_folder = Path(args.manylinux_src) / "docker" / "build_scripts" dest = Path(args.context) / "build_scripts" if dest.exists(): - log.info("Deleting: {}".format(str(dest))) + log.info(f"Deleting: {str(dest)}") shutil.rmtree(str(dest)) shutil.copytree(str(manylinux_build_scripts_folder), str(dest)) src_entrypoint_file = str(Path(args.manylinux_src) / "docker" / "manylinux-entrypoint") diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index c36d74402d406..2a93b3e706971 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -35,7 +35,7 @@ def _parse_build_settings(args): setting_file = args.build_settings_file.resolve() if not setting_file.is_file(): - raise FileNotFoundError("Build config file {} is not a file.".format(setting_file)) + raise FileNotFoundError(f"Build config file {setting_file} is not a file.") with open(setting_file) as f: build_settings_data = json.load(f) @@ -99,7 +99,7 @@ def _build_aar(args): # Build binary for each ABI, one by one for abi in build_settings["build_abis"]: abi_build_dir = os.path.join(intermediates_dir, abi) - abi_build_command = base_build_command + ["--android_abi=" + abi, "--build_dir=" + abi_build_dir] + abi_build_command = [*base_build_command, "--android_abi=" + abi, "--build_dir=" + abi_build_dir] if ops_config_path is not None: abi_build_command += ["--include_ops_by_config=" + ops_config_path] @@ -152,9 +152,9 @@ def _build_aar(args): ] # clean, build, and publish to a local directory - subprocess.run(gradle_command + ["clean"], env=temp_env, shell=False, check=True, cwd=JAVA_ROOT) - subprocess.run(gradle_command + ["build"], env=temp_env, shell=False, check=True, cwd=JAVA_ROOT) - subprocess.run(gradle_command + ["publish"], env=temp_env, shell=False, check=True, cwd=JAVA_ROOT) + subprocess.run([*gradle_command, "clean"], env=temp_env, shell=False, check=True, cwd=JAVA_ROOT) + subprocess.run([*gradle_command, "build"], env=temp_env, shell=False, check=True, cwd=JAVA_ROOT) + subprocess.run([*gradle_command, "publish"], env=temp_env, shell=False, 
check=True, cwd=JAVA_ROOT) def parse_args(): diff --git a/tools/ci_build/github/apple/build_and_assemble_ios_pods.py b/tools/ci_build/github/apple/build_and_assemble_ios_pods.py index e316aa5d2c324..b4704fe1dd910 100755 --- a/tools/ci_build/github/apple/build_and_assemble_ios_pods.py +++ b/tools/ci_build/github/apple/build_and_assemble_ios_pods.py @@ -87,9 +87,7 @@ def run(arg_list, cwd=None): import subprocess log.info( - "Running subprocess in '{0}'\n {1}".format( - cwd or os.getcwd(), " ".join([shlex.quote(arg) for arg in arg_list]) - ) + "Running subprocess in '{}'\n {}".format(cwd or os.getcwd(), " ".join([shlex.quote(arg) for arg in arg_list])) ) return subprocess.run(arg_list, check=True, cwd=cwd) @@ -110,7 +108,8 @@ def main(): build_ios_framework_args = [ sys.executable, str(SCRIPT_DIR / "build_ios_framework.py"), - ] + args.build_ios_framework_extra_args + *args.build_ios_framework_extra_args, + ] if args.include_ops_by_config is not None: build_ios_framework_args += ["--include_ops_by_config", args.include_ops_by_config] diff --git a/tools/ci_build/github/apple/build_ios_framework.py b/tools/ci_build/github/apple/build_ios_framework.py index b5d84ff1c8a28..7983581f07fd6 100644 --- a/tools/ci_build/github/apple/build_ios_framework.py +++ b/tools/ci_build/github/apple/build_ios_framework.py @@ -52,7 +52,8 @@ def _build_for_ios_sysroot( # Build binary for each arch, one by one for current_arch in archs: build_dir_current_arch = os.path.join(intermediates_dir, sysroot + "_" + current_arch) - build_command = base_build_command + [ + build_command = [ + *base_build_command, "--ios_sysroot=" + sysroot, "--osx_arch=" + current_arch, "--build_dir=" + build_dir_current_arch, @@ -203,12 +204,12 @@ def parse_args(): args = parser.parse_args() if not args.build_settings_file.resolve().is_file(): - raise FileNotFoundError("Build config file {} is not a file.".format(args.build_settings_file.resolve())) + raise FileNotFoundError(f"Build config file {args.build_settings_file.resolve()} is not a file.") if args.include_ops_by_config is not None: include_ops_by_config_file = args.include_ops_by_config.resolve() if not include_ops_by_config_file.is_file(): - raise FileNotFoundError("Include ops config file {} is not a file.".format(include_ops_by_config_file)) + raise FileNotFoundError(f"Include ops config file {include_ops_by_config_file} is not a file.") return args diff --git a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py index 4401c865ed00f..f83ea12950687 100755 --- a/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py +++ b/tools/ci_build/github/apple/objectivec/assemble_objc_pod_package.py @@ -94,7 +94,7 @@ def assemble_objc_pod_package( print("Warning: staging directory already exists", file=sys.stderr) # copy the necessary files to the staging directory - copy_repo_relative_to_dir([license_file] + source_files + test_source_files + test_resource_files, staging_dir) + copy_repo_relative_to_dir([license_file, *source_files, *test_source_files, *test_resource_files], staging_dir) # generate the podspec file from the template diff --git a/tools/ci_build/github/apple/package_assembly_utils.py b/tools/ci_build/github/apple/package_assembly_utils.py index 18a5b39ae2333..cb603fadc7371 100644 --- a/tools/ci_build/github/apple/package_assembly_utils.py +++ b/tools/ci_build/github/apple/package_assembly_utils.py @@ -46,7 +46,7 @@ def gen_file_from_template( :param strict Whether to require the 
set of template variable names in the file and the keys of `variable_substitutions` to be equal. """ - with open(template_file, mode="r") as template: + with open(template_file) as template: content = template.read() variables_in_file = set() @@ -94,7 +94,7 @@ def load_json_config(json_config_file: pathlib.Path): :param json_config_file The JSON configuration file path. :return The configuration info values. """ - with open(json_config_file, mode="r") as config: + with open(json_config_file) as config: return json.load(config) @@ -104,5 +104,5 @@ def get_ort_version(): :return The ONNX Runtime version string. """ - with open(repo_root / "VERSION_NUMBER", mode="r") as version_file: + with open(repo_root / "VERSION_NUMBER") as version_file: return version_file.read().strip() diff --git a/tools/ci_build/github/apple/test_ios_packages.py b/tools/ci_build/github/apple/test_ios_packages.py index 5bfeb8b42bd29..0f464608518b9 100644 --- a/tools/ci_build/github/apple/test_ios_packages.py +++ b/tools/ci_build/github/apple/test_ios_packages.py @@ -30,13 +30,13 @@ def _test_ios_packages(args): # should be under the c_framework_dir c_framework_dir = args.c_framework_dir.resolve() if not c_framework_dir.is_dir(): - raise FileNotFoundError("c_framework_dir {} is not a folder.".format(c_framework_dir)) + raise FileNotFoundError(f"c_framework_dir {c_framework_dir} is not a folder.") has_framework = (c_framework_dir / "onnxruntime.framework").exists() has_xcframework = (c_framework_dir / "onnxruntime.xcframework").exists() if not has_framework and not has_xcframework: - raise FileNotFoundError("{} does not have onnxruntime.framework/xcframework".format(c_framework_dir)) + raise FileNotFoundError(f"{c_framework_dir} does not have onnxruntime.framework/xcframework") if has_framework and has_xcframework: raise ValueError("Cannot proceed when both onnxruntime.framework " "and onnxruntime.xcframework exist") @@ -91,7 +91,7 @@ def _test_ios_packages(args): shutil.make_archive(zip_file_path.with_suffix(""), "zip", root_dir=local_pods_dir) # update the podspec to point to the local framework zip file - with open(podspec, "r") as file: + with open(podspec) as file: file_data = file.read() file_data = file_data.replace("file:///http_source_placeholder", f"file:///{zip_file_path}") diff --git a/tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml deleted file mode 100644 index 17c2b8766d891..0000000000000 --- a/tools/ci_build/github/azure-pipelines/python-checks-ci-pipeline.yml +++ /dev/null @@ -1,19 +0,0 @@ -jobs: -- job: 'PythonCodeChecks' - pool: - vmImage: 'ubuntu-20.04' - - timeoutInMinutes: 10 - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '3.8' - addToPath: true - architecture: 'x64' - - - script: python -m pip install -r tools/ci_build/github/python_checks/requirements.txt - displayName: "Install requirements" - - - script: python -m flake8 --config .flake8 - displayName: "Run Flake8" diff --git a/tools/ci_build/github/linux/ort_minimal/build_ort_and_check_binary_size.py b/tools/ci_build/github/linux/ort_minimal/build_ort_and_check_binary_size.py index 2a461108f3a52..dd5af86e3ad52 100644 --- a/tools/ci_build/github/linux/ort_minimal/build_ort_and_check_binary_size.py +++ b/tools/ci_build/github/linux/ort_minimal/build_ort_and_check_binary_size.py @@ -27,7 +27,7 @@ def parse_args(): def main(): args = parse_args() - with open(args.build_check_binsize_config, mode="r") as config_file: + with 
open(args.build_check_binsize_config) as config_file: config = json.load(config_file) config_type = config["type"] @@ -38,8 +38,7 @@ def main(): # build ORT build_command = ( - [sys.executable, str(REPO_ROOT / "tools/ci_build/build.py")] - + build_params + [sys.executable, str(REPO_ROOT / "tools/ci_build/build.py"), *build_params] + (["--cmake_extra_defines", "ADD_DEBUG_INFO_TO_MINIMAL_BUILD=ON"] if args.with_debug_info else []) # put the following options last so they don't get overridden by build_params + [ diff --git a/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py b/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py index 6d7c70ae953fb..ea4a3fd32b18b 100644 --- a/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py +++ b/tools/ci_build/github/linux/ort_minimal/check_build_binary_size.py @@ -11,8 +11,7 @@ def _check_binary_size(path, readelf, threshold, os_str, arch, build_config): - - print("Checking binary size of {} using {}".format(path, readelf)) + print(f"Checking binary size of {path} using {readelf}") ondisk_size = os.path.getsize(path) print("Section:size in bytes") @@ -20,17 +19,15 @@ def _check_binary_size(path, readelf, threshold, os_str, arch, build_config): sections = readelf_utils.get_section_sizes(path, readelf, sys.stdout) sections_total = sum(sections.values()) - print("Sections total={} bytes".format(sections_total)) - print("File size={} bytes".format(ondisk_size)) + print(f"Sections total={sections_total} bytes") + print(f"File size={ondisk_size} bytes") # Write the binary size to a file for uploading later # On-disk binary size jumps in 4KB increments so we use the total of the sections as it has finer granularity. # Note that the sum of the section is slightly larger than the on-disk size # due to packing and/or alignment adjustments. 
with open(os.path.join(os.path.dirname(path), "binary_size_data.txt"), "w") as file: - file.writelines( - ["os,arch,build_config,size\n", "{},{},{},{}\n".format(os_str, arch, build_config, sections_total)] - ) + file.writelines(["os,arch,build_config,size\n", f"{os_str},{arch},{build_config},{sections_total}\n"]) if threshold is not None and sections_total > threshold: raise RuntimeError( diff --git a/tools/ci_build/github/linux/ort_minimal/readelf_utils.py b/tools/ci_build/github/linux/ort_minimal/readelf_utils.py index 4c0d77805afe7..43bc107df401b 100644 --- a/tools/ci_build/github/linux/ort_minimal/readelf_utils.py +++ b/tools/ci_build/github/linux/ort_minimal/readelf_utils.py @@ -39,7 +39,7 @@ def get_section_sizes(binary_path, readelf_path, dump_to_file=None): section_sizes[name] = size if dump_to_file: - print("{}:{}".format(name, size), file=dump_to_file) + print(f"{name}:{size}", file=dump_to_file) return section_sizes @@ -104,17 +104,17 @@ def main(): out_file = sys.stdout if args.write_to: - out_file = open(args.write_to, "w") + out_file = open(args.write_to, "w") # noqa: SIM115 if args.base_binary_path: diffs = diff_sections_total_size(args.base_binary_path, args.binary_path, args.readelf_path) for key, value in diffs.items(): - print("{}:{}".format(key, value), file=out_file) + print(f"{key}:{value}", file=out_file) else: section_sizes = get_section_sizes(args.binary_path, args.readelf_path, out_file) filesize = os.path.getsize(args.binary_path) - print("Sections total:{}".format(sum(section_sizes.values())), file=out_file) - print("File size:{}".format(filesize), file=out_file) + print(f"Sections total:{sum(section_sizes.values())}", file=out_file) + print(f"File size:{filesize}", file=out_file) if args.write_to: out_file.close() diff --git a/tools/ci_build/github/python_checks/readme.md b/tools/ci_build/github/python_checks/readme.md deleted file mode 100644 index b31300d6cf07b..0000000000000 --- a/tools/ci_build/github/python_checks/readme.md +++ /dev/null @@ -1,18 +0,0 @@ -# Python Code Checks - -Python code checks are run by this [CI build](../azure-pipelines/python-checks-ci-pipeline.yml). -Here are instructions on how to run them manually. - -## Prerequisites - -Install requirements. 
- -From the repo root, run: - -`$ python -m pip install -r tools/ci_build/github/python_checks/requirements.txt` - -## Flake8 - -From the repo root, run: - -`$ python -m flake8 --config .flake8` diff --git a/tools/ci_build/github/python_checks/requirements.txt b/tools/ci_build/github/python_checks/requirements.txt deleted file mode 100644 index b5446261e8e51..0000000000000 --- a/tools/ci_build/github/python_checks/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -flake8==3.9 diff --git a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py index aadfaf6ff8864..acca4fb13c45a 100644 --- a/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py +++ b/tools/ci_build/github/windows/post_binary_sizes_to_dashboard.py @@ -40,7 +40,7 @@ def parse_arguments(): def get_binary_sizes(size_data_file): binary_size = [] - with open(size_data_file, "r") as f: + with open(size_data_file) as f: line = f.readline() headers = line.strip().split(",") while line: diff --git a/tools/ci_build/github/windows/post_code_coverage_to_dashboard.py b/tools/ci_build/github/windows/post_code_coverage_to_dashboard.py index 424910e18dc73..cedd4ea18471c 100755 --- a/tools/ci_build/github/windows/post_code_coverage_to_dashboard.py +++ b/tools/ci_build/github/windows/post_code_coverage_to_dashboard.py @@ -33,7 +33,7 @@ def parse_arguments(): def parse_txt_report(report_file): data = {} - with open(report_file, "r") as report: + with open(report_file) as report: for line in reversed(report.readlines()): if "TOTAL" in line: fields = line.strip().split() diff --git a/tools/ci_build/op_registration_utils.py b/tools/ci_build/op_registration_utils.py index 5120552e81330..178bb0deb13f2 100644 --- a/tools/ci_build/op_registration_utils.py +++ b/tools/ci_build/op_registration_utils.py @@ -39,7 +39,7 @@ def map_ort_constant_to_domain(ort_constant_name: str, allow_unknown_constant: b if ort_constant_name in constant_to_domain_map: return constant_to_domain_map[ort_constant_name] - unknown_constant_message = "Unknown domain for ONNX Runtime constant of {}.".format(ort_constant_name) + unknown_constant_message = f"Unknown domain for ONNX Runtime constant of {ort_constant_name}." if not allow_unknown_constant: raise ValueError(unknown_constant_message) @@ -167,7 +167,7 @@ def _process_lines(lines: typing.List[str], offset: int, registration_processor: # e.g. BuildKernelCreateInfo, trim_at = code_line.index(onnx_op) + onnx_op_len + 1 - *_, domain, start_version, op_type = [arg.strip() for arg in code_line[trim_at : -len(end_mark)].split(",")] + *_, domain, start_version, op_type = (arg.strip() for arg in code_line[trim_at : -len(end_mark)].split(",")) registration_processor.process_registration(lines_to_process, domain, op_type, int(start_version), None, None) @@ -175,18 +175,18 @@ def _process_lines(lines: typing.List[str], offset: int, registration_processor: # e.g. BuildKernelCreateInfo, trim_at = code_line.index(onnx_typed_op) + onnx_typed_op_len + 1 - *_, domain, start_version, type, op_type = [ + *_, domain, start_version, type, op_type = ( arg.strip() for arg in code_line[trim_at : -len(end_mark)].split(",") - ] + ) registration_processor.process_registration(lines_to_process, domain, op_type, int(start_version), None, type) elif onnx_versioned_op in code_line: # e.g. 
BuildKernelCreateInfo, trim_at = code_line.index(onnx_versioned_op) + onnx_versioned_op_len + 1 - *_, domain, start_version, end_version, op_type = [ + *_, domain, start_version, end_version, op_type = ( arg.strip() for arg in code_line[trim_at : -len(end_mark)].split(",") - ] + ) registration_processor.process_registration( lines_to_process, domain, op_type, int(start_version), int(end_version), None ) @@ -195,15 +195,15 @@ def _process_lines(lines: typing.List[str], offset: int, registration_processor: # e.g. BuildKernelCreateInfo, trim_at = code_line.index(onnx_versioned_typed_op) + onnx_versioned_typed_op_len + 1 - *_, domain, start_version, end_version, type, op_type = [ + *_, domain, start_version, end_version, type, op_type = ( arg.strip() for arg in code_line[trim_at : -len(end_mark)].split(",") - ] + ) registration_processor.process_registration( lines_to_process, domain, op_type, int(start_version), int(end_version), type ) else: - log.warning("Ignoring unhandled kernel registration variant: {}".format(code_line)) + log.warning(f"Ignoring unhandled kernel registration variant: {code_line}") for line in lines_to_process: registration_processor.process_other_line(line) @@ -221,16 +221,15 @@ def process_kernel_registration_file( """ if not os.path.isfile(filename): - log.error("File not found: {}".format(filename)) + log.error(f"File not found: {filename}") return False lines = [] - with open(filename, "r") as file_to_read: + with open(filename) as file_to_read: lines = file_to_read.readlines() offset = 0 while offset < len(lines): - line = lines[offset] stripped = line.strip() diff --git a/tools/ci_build/op_registration_validator.py b/tools/ci_build/op_registration_validator.py index 975c06ba1fb25..34734cbe732d9 100644 --- a/tools/ci_build/op_registration_validator.py +++ b/tools/ci_build/op_registration_validator.py @@ -88,16 +88,15 @@ def validate_last_registrations(self): # TODO remove once CUDA EP supports ArgMin/ArgMax for opset 12+ ops_with_incomplete_support = ["kOnnxDomain:ArgMin", "kOnnxDomain:ArgMax"] if key in ops_with_incomplete_support: - log.warn("Allowing missing unversioned registration for op with incomplete support: {}".format(key)) + log.warn(f"Allowing missing unversioned registration for op with incomplete support: {key}") allow_missing_unversioned_registration = True if opset_to and not allow_missing_unversioned_registration: - log.error("Missing unversioned registration for {}".format(key)) + log.error(f"Missing unversioned registration for {key}") self.failed = True if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Script to validate operator kernel registrations.") parser.add_argument( @@ -114,7 +113,7 @@ def validate_last_registrations(self): registration_files = op_registration_utils.get_kernel_registration_files(ort_root, include_cuda) for file in registration_files: - log.info("Processing {}".format(file)) + log.info(f"Processing {file}") processor = RegistrationValidator() op_registration_utils.process_kernel_registration_file(file, processor) diff --git a/tools/ci_build/reduce_op_kernels.py b/tools/ci_build/reduce_op_kernels.py index 47e7d5b15408b..0c1be721527f3 100755 --- a/tools/ci_build/reduce_op_kernels.py +++ b/tools/ci_build/reduce_op_kernels.py @@ -21,8 +21,8 @@ ORT_ROOT = SCRIPT_DIR.parents[1] sys.path.append(str(ORT_ROOT / "tools" / "python")) -from util import parse_config # noqa -from util.ort_format_model.operator_type_usage_processors import OpTypeImplFilterInterface # noqa +from util import parse_config # noqa: 
E402 +from util.ort_format_model.operator_type_usage_processors import OpTypeImplFilterInterface # noqa: E402 log = get_logger("reduce_op_kernels") @@ -40,7 +40,6 @@ def _adapt_filters_for_extended_minimal_build( extended_minimal_build_required_op_ids = set() # set of (domain, optype, opset) with open( ORT_ROOT / "onnxruntime/core/optimizer/transpose_optimizer/layout_transformation_potentially_added_ops.h", - mode="r", ) as f: region_boundary_pattern = re.compile(r"@@region_(begin|end)\(extended_minimal_build_required_kernels\)@@") op_id_pattern = re.compile( @@ -97,7 +96,7 @@ def get_cpp_entries(self): adapted_op_type_impl_filter = _AdaptedFilter( base_op_type_impl_filter, - set([(domain, optype) for (domain, optype, opset) in extended_minimal_build_required_op_ids]), + {(domain, optype) for (domain, optype, opset) in extended_minimal_build_required_op_ids}, ) return (adapted_required_ops, adapted_op_type_impl_filter) @@ -143,7 +142,7 @@ def process_registration( type: typing.Optional[str] = None, ): registration_identifier = "{}:{}({}){}".format( - constant_for_domain, operator, start_version, "<{}>".format(type) if type else "" + constant_for_domain, operator, start_version, f"<{type}>" if type else "" ) # convert from the ORT constant name to the domain string used in the config @@ -162,12 +161,10 @@ def process_registration( exclude = True reason = "Specific typed registration is not required." else: - log.warning( - "Keeping {} registration from unknown domain: {}".format(registration_identifier, constant_for_domain) - ) + log.warning(f"Keeping {registration_identifier} registration from unknown domain: {constant_for_domain}") if exclude: - log.info("Disabling {} registration: {}".format(registration_identifier, reason)) + log.info(f"Disabling {registration_identifier} registration: {reason}") for line in lines: self._output_file.write("// " + line) @@ -217,7 +214,7 @@ def _generate_provider_registrations( if not kernel_registration_file.is_file(): raise ValueError(f"Kernel registration file does not exist: {kernel_registration_file}") - log.info("Processing {}".format(kernel_registration_file)) + log.info(f"Processing {kernel_registration_file}") reduced_path = _get_op_reduction_file_path(ort_root, build_dir, kernel_registration_file) @@ -257,13 +254,13 @@ def _generate_type_control_overrides(ort_root: Path, build_dir: Path, cpp_lines: if cpp_lines: # find the insertion block and replace any existing content in it inserted = False - with open(src, "r") as input, open(target, "w") as output: + with open(src) as input, open(target, "w") as output: inside_insertion_block = False for line in input.readlines(): if "@@insertion_point_begin(allowed_types)@@" in line: inside_insertion_block = True output.write(line) - [output.write("{}\n".format(code_line)) for code_line in cpp_lines] + [output.write(f"{code_line}\n") for code_line in cpp_lines] inserted = True continue elif inside_insertion_block: @@ -276,7 +273,7 @@ def _generate_type_control_overrides(ort_root: Path, build_dir: Path, cpp_lines: output.write(line) if not inserted: - raise RuntimeError("Insertion point was not found in {}".format(target)) + raise RuntimeError(f"Insertion point was not found in {target}") def reduce_ops( diff --git a/tools/ci_build/replace_urls_in_deps.py b/tools/ci_build/replace_urls_in_deps.py index 839e12907c0b1..28e3c91107c6c 100644 --- a/tools/ci_build/replace_urls_in_deps.py +++ b/tools/ci_build/replace_urls_in_deps.py @@ -28,15 +28,15 @@ def parse_arguments(): def main(): - SCRIPT_DIR = 
os.path.dirname(os.path.realpath(__file__)) - REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..")) + SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) # noqa: N806 + REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..")) # noqa: N806 args = parse_arguments() new_dir = None if args.new_dir: new_dir = Path(args.new_dir) else: - BUILD_BINARIESDIRECTORY = os.environ.get("BUILD_BINARIESDIRECTORY") + BUILD_BINARIESDIRECTORY = os.environ.get("BUILD_BINARIESDIRECTORY") # noqa: N806 if BUILD_BINARIESDIRECTORY is None: raise NameError("Please specify --new_dir or set the env var BUILD_BINARIESDIRECTORY") new_dir = Path(BUILD_BINARIESDIRECTORY) / "deps" diff --git a/tools/ci_build/update_tsaoptions.py b/tools/ci_build/update_tsaoptions.py index 460874da32f3d..07be746aa1981 100644 --- a/tools/ci_build/update_tsaoptions.py +++ b/tools/ci_build/update_tsaoptions.py @@ -11,7 +11,7 @@ with (REPO_DIR / ".config" / "tsaoptions.json").open() as f: data = json.load(f) -buildNumber = os.getenv("BUILD_BUILDNUMBER") +buildNumber = os.getenv("BUILD_BUILDNUMBER") # noqa: N816 if buildNumber is not None: data["buildNumber"] = buildNumber diff --git a/tools/ci_build/upload_python_package_to_azure_storage.py b/tools/ci_build/upload_python_package_to_azure_storage.py index 05ec9df8dfa9b..365cb67381ce7 100755 --- a/tools/ci_build/upload_python_package_to_azure_storage.py +++ b/tools/ci_build/upload_python_package_to_azure_storage.py @@ -25,7 +25,7 @@ def parse_nightly_and_local_version_from_whl_name(blob_name): def run_subprocess(args, cwd=None): - log.warning("Running subprocess in '{0}'\n{1}".format(cwd or os.getcwd(), args)) + log.warning(f"Running subprocess in '{cwd or os.getcwd()}'\n{args}") return subprocess.run(args, cwd=cwd, check=True) @@ -36,11 +36,11 @@ def upload_whl(python_wheel_path, final_storage=False): nightly_build, local_version = parse_nightly_and_local_version_from_whl_name(blob_name) if local_version: - html_blob_name = "onnxruntime_{}_{}.html".format(nightly_build, local_version) + html_blob_name = f"onnxruntime_{nightly_build}_{local_version}.html" else: - html_blob_name = "onnxruntime_{}.html".format(nightly_build) + html_blob_name = f"onnxruntime_{nightly_build}.html" - download_path_to_html = "./onnxruntime_{}.html".format(nightly_build) + download_path_to_html = f"./onnxruntime_{nightly_build}.html" run_subprocess( [ @@ -64,7 +64,7 @@ def upload_whl(python_wheel_path, final_storage=False): for item in lines: f.write("%s\n" % item) else: - warnings.warn("'{}' exists in {}. The html file is not updated.".format(new_line, download_path_to_html)) + warnings.warn(f"'{new_line}' exists in {download_path_to_html}. The html file is not updated.") run_subprocess( [ "azcopy", diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py index d65b8a350eed1..6df5bffc8232b 100644 --- a/tools/doc/rename_folders.py +++ b/tools/doc/rename_folders.py @@ -13,7 +13,7 @@ def rename_folder(root): Returns the list of renamed folders. 
""" found = [] - for r, dirs, files in os.walk(root): + for r, dirs, _files in os.walk(root): for name in dirs: if name.startswith("_"): found.append((r, name)) @@ -35,12 +35,12 @@ def replace_files(root, renamed): subs = {r[1]: r[2] for r in renamed} reg = re.compile('(\\"[a-zA-Z0-9\\.\\/\\?\\:@\\-_=#]+\\.([a-zA-Z]){2,6}' '([a-zA-Z0-9\\.\\&\\/\\?\\:@\\-_=#])*\\")') - for r, dirs, files in os.walk(root): + for r, _dirs, files in os.walk(root): for name in files: if os.path.splitext(name)[-1] != ".html": continue full = os.path.join(r, name) - with open(full, "r", encoding="utf-8") as f: + with open(full, encoding="utf-8") as f: content = f.read() find = reg.findall(content) repl = [] @@ -49,7 +49,7 @@ def replace_files(root, renamed): continue for k, v in subs.items(): if k == v: - raise ValueError("%r == %r" % (k, v)) + raise ValueError(f"{k!r} == {v!r}") if ('"%s' % k) in f[0]: repl.append((f[0], f[0].replace('"%s' % k, '"%s' % v))) if ("/%s" % k) in f[0]: diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index db2e4d2e1d9f3..8500d125bff49 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -362,7 +362,7 @@ def generate_files(line_list, args): else: runtimes_native_folder = "native" - runtimes = '{}{}\\{}"'.format(runtimes_target, args.target_architecture, runtimes_native_folder) + runtimes = f'{runtimes_target}{args.target_architecture}\\{runtimes_native_folder}"' # Process headers files_list.append( diff --git a/tools/nuget/validate_package.py b/tools/nuget/validate_package.py index 5baa2f603c5d7..3b2d690d22479 100644 --- a/tools/nuget/validate_package.py +++ b/tools/nuget/validate_package.py @@ -89,7 +89,7 @@ def check_if_dlls_are_present( platforms = platforms_supported.strip().split(",") if package_type == "tarball": file_list_in_package = list() - for (dirpath, dirnames, filenames) in os.walk(package_path): + for dirpath, _dirnames, filenames in os.walk(package_path): file_list_in_package += [os.path.join(dirpath, file) for file in filenames] else: file_list_in_package = zip_file.namelist() @@ -335,7 +335,7 @@ def main(): elif args.package_type == "zip": validate_zip(args) else: - print("Package type {} is not supported".format(args.package_type)) + print(f"Package type {args.package_type} is not supported") if __name__ == "__main__": diff --git a/tools/python/PythonTools.md b/tools/python/PythonTools.md index 2dbf962db3e57..a9dfe6470b365 100644 --- a/tools/python/PythonTools.md +++ b/tools/python/PythonTools.md @@ -98,7 +98,7 @@ import ort_test_dir_utils try: ort_test_dir_utils.run_test_dir('temp/examples/test1') ort_test_dir_utils.run_test_dir('temp/examples/test2/expand_elimination.onnx') -except: +except Exception: print("Exception:", sys.exc_info()[1]) ``` diff --git a/tools/python/create_reduced_build_config.py b/tools/python/create_reduced_build_config.py index f7bbe5001c685..bda913924b5ae 100644 --- a/tools/python/create_reduced_build_config.py +++ b/tools/python/create_reduced_build_config.py @@ -74,7 +74,6 @@ def _extract_ops_from_onnx_model(model_files: typing.Iterable[pathlib.Path]): def create_config_from_onnx_models(model_files: typing.Iterable[pathlib.Path], output_file: pathlib.Path): - required_ops = _extract_ops_from_onnx_model(model_files) output_file.parent.mkdir(parents=True, exist_ok=True) diff --git a/tools/python/dump_ort_model.py b/tools/python/dump_ort_model.py index acb69a593eb27..2177c42f5bc35 100644 --- 
a/tools/python/dump_ort_model.py +++ b/tools/python/dump_ort_model.py @@ -20,10 +20,10 @@ def __init__(self, model_path: str): Initialize ORT format model dumper :param model_path: Path to model """ - self._file = open(model_path, "rb").read() + self._file = open(model_path, "rb").read() # noqa: SIM115 self._buffer = bytearray(self._file) if not fbs.InferenceSession.InferenceSession.InferenceSessionBufferHasIdentifier(self._buffer, 0): - raise RuntimeError("File does not appear to be a valid ORT format model: '{}'".format(model_path)) + raise RuntimeError(f"File does not appear to be a valid ORT format model: '{model_path}'") self._inference_session = fbs.InferenceSession.InferenceSession.GetRootAsInferenceSession(self._buffer, 0) self._model = self._inference_session.Model() diff --git a/tools/python/example_operator_perf_test.py b/tools/python/example_operator_perf_test.py index 50a3edd5c9b27..b40ec54e9f592 100644 --- a/tools/python/example_operator_perf_test.py +++ b/tools/python/example_operator_perf_test.py @@ -70,7 +70,6 @@ def create_test_input(n, num_items, k): # Example code that tests various combinations of input sizes. # def run_perf_tests(model_path, num_threads=1): - so = rt.SessionOptions() so.intra_op_num_threads = num_threads sess = rt.InferenceSession(model_path, sess_options=so) @@ -99,7 +98,7 @@ def run_test(): # run the model and measure time after 'iters' calls while total < num_seconds: start = time.time_ns() - for i in range(iters): + for _i in range(iters): # ignore the outputs as we're not validating them in a performance test sess.run(None, inputs) end = time.time_ns() @@ -108,7 +107,7 @@ def run_test(): total_iters += iters # Adjust the output you want as needed - print("n={},items={},k={},avg:{:.4f}".format(n, num_items, k, total / total_iters)) + print(f"n={n},items={num_items},k={k},avg:{total / total_iters:.4f}") # combine the various input parameters and create input for each test for n in batches: @@ -128,7 +127,6 @@ def run_test(): # so that the model can be easily run directly or from a debugger. # def create_example_test_directory(): - # fill in the inputs that we want to use specific values for input_data = {} input_data["K"] = np.asarray([64]).astype(np.int64) diff --git a/tools/python/find_optimizer_opset_version_updates_required.py b/tools/python/find_optimizer_opset_version_updates_required.py index bad3258e93887..0076d27fe950e 100644 --- a/tools/python/find_optimizer_opset_version_updates_required.py +++ b/tools/python/find_optimizer_opset_version_updates_required.py @@ -25,7 +25,7 @@ def parse_args(): args = parser.parse_args() if not os.path.isdir(args.ort_root): - raise argparse.ArgumentError(root_arg, "{} is not a valid directory".format(args.ort_root)) + raise argparse.ArgumentError(root_arg, f"{args.ort_root} is not a valid directory") return args @@ -164,7 +164,6 @@ def get_latest_onnx_op_versions(root_dir): def find_potential_issues(root_dir, op_to_opset): - optimizer_dir = os.path.join(root_dir, "onnxruntime/core/optimizer") files = glob.glob(optimizer_dir + "/**/*.cc", recursive=True) @@ -179,7 +178,7 @@ def find_potential_issues(root_dir, op_to_opset): args = call.split(",", 2) # first 2 args are simple, remainder need custom processing op = args[1].strip() if not op.startswith('"') or not op.endswith('"'): - log.error("Symbolic name of '{}' found for op. Please check manually. File:{}".format(op, file)) + log.error(f"Symbolic name of '{op}' found for op. Please check manually. 
File:{file}") continue versions_and_domain_arg = args[2] @@ -205,7 +204,7 @@ def find_potential_issues(root_dir, op_to_opset): ) ) else: - log.error("Failed to find version information for {}. File:{}".format(op, file)) + log.error(f"Failed to find version information for {op}. File:{file}") if __name__ == "__main__": diff --git a/tools/python/gen_contrib_doc.py b/tools/python/gen_contrib_doc.py index 15e7f65d093d9..9550cc66f65cd 100644 --- a/tools/python/gen_contrib_doc.py +++ b/tools/python/gen_contrib_doc.py @@ -2,22 +2,20 @@ # This file is copied and adapted from https://github.com/onnx/onnx repository. # There was no copyright statement on the file at the time of copying. -from __future__ import absolute_import, division, print_function, unicode_literals import argparse -import io import os import pathlib import sys from collections import defaultdict -from typing import Any, Dict, List, Sequence, Set, Text, Tuple +from typing import Any, Dict, List, Sequence, Set, Text, Tuple # noqa: F401 import numpy as np # type: ignore -from onnx import AttributeProto, FunctionProto +from onnx import AttributeProto, FunctionProto # noqa: F401 import onnxruntime.capi.onnxruntime_pybind11_state as rtpy from onnxruntime.capi.onnxruntime_pybind11_state import schemadef # noqa: F401 -from onnxruntime.capi.onnxruntime_pybind11_state.schemadef import OpSchema # noqa: F401 +from onnxruntime.capi.onnxruntime_pybind11_state.schemadef import OpSchema ONNX_ML = not bool(os.getenv("ONNX_ML") == "0") ONNX_DOMAIN = "onnx" @@ -32,11 +30,11 @@ def display_number(v): # type: (int) -> Text if OpSchema.is_infinite(v): return "∞" - return Text(v) + return str(v) def should_render_domain(domain, domain_filter): # type: (Text) -> bool - if domain == ONNX_DOMAIN or domain == "" or domain == ONNX_ML_DOMAIN or domain == "ai.onnx.ml": + if domain in (ONNX_DOMAIN, ONNX_ML_DOMAIN) or domain == "" or domain == "ai.onnx.ml": return False if domain_filter and domain not in domain_filter: @@ -47,18 +45,18 @@ def should_render_domain(domain, domain_filter): # type: (Text) -> bool def format_name_with_domain(domain, schema_name): # type: (Text, Text) -> Text if domain: - return "{}.{}".format(domain, schema_name) + return f"{domain}.{schema_name}" else: return schema_name def format_name_with_version(schema_name, version): # type: (Text, Text) -> Text - return "{}-{}".format(schema_name, version) + return f"{schema_name}-{version}" def display_attr_type(v): # type: (OpSchema.AttrType) -> Text assert isinstance(v, OpSchema.AttrType) - s = Text(v) + s = str(v) s = s[s.rfind(".") + 1 :].lower() if s[-1] == "s": s = "list of " + s @@ -67,7 +65,7 @@ def display_attr_type(v): # type: (OpSchema.AttrType) -> Text def display_domain(domain): # type: (Text) -> Text if domain: - return "the '{}' operator set".format(domain) + return f"the '{domain}' operator set" else: return "the default ONNX operator set" @@ -81,14 +79,14 @@ def display_domain_short(domain): # type: (Text) -> Text def display_version_link(name, version): # type: (Text, int) -> Text changelog_md = "Changelog" + ext - name_with_ver = "{}-{}".format(name, version) - return '{}'.format(changelog_md, name_with_ver, name_with_ver) + name_with_ver = f"{name}-{version}" + return f'{name_with_ver}' def display_function_version_link(name, version): # type: (Text, int) -> Text changelog_md = "FunctionsChangelog" + ext - name_with_ver = "{}-{}".format(name, version) - return '{}'.format(changelog_md, name_with_ver, name_with_ver) + name_with_ver = f"{name}-{version}" + return 
f'{name_with_ver}' def get_attribute_value(attr): # type: (AttributeProto) -> Any @@ -113,7 +111,7 @@ def get_attribute_value(attr): # type: (AttributeProto) -> Any elif len(attr.graphs): return list(attr.graphs) else: - raise ValueError("Unsupported ONNX attribute: {}".format(attr)) + raise ValueError(f"Unsupported ONNX attribute: {attr}") def display_schema(schema, versions): # type: (OpSchema, Sequence[OpSchema]) -> Text @@ -134,9 +132,9 @@ def display_schema(schema, versions): # type: (OpSchema, Sequence[OpSchema]) -> s += ( "\nThis version of the operator has been " + ("deprecated" if schema.deprecated else "available") - + " since version {}".format(schema.since_version) + + f" since version {schema.since_version}" ) - s += " of {}.\n".format(display_domain(schema.domain)) + s += f" of {display_domain(schema.domain)}.\n" if len(versions) > 1: # TODO: link to the Changelog.md s += "\nOther versions of this operator: {}\n".format( @@ -166,7 +164,7 @@ def display_schema(schema, versions): # type: (OpSchema, Sequence[OpSchema]) -> def format_value(value): # type: (Any) -> Text if isinstance(value, float): value = np.round(value, 5) - if isinstance(value, (bytes, bytearray)) and sys.version_info[0] == 3: + if isinstance(value, (bytes, bytearray)) and sys.version_info[0] == 3: # noqa: YTT201 value = value.decode("utf-8") return str(value) @@ -174,18 +172,18 @@ def format_value(value): # type: (Any) -> Text default_value = [format_value(val) for val in default_value] else: default_value = format_value(default_value) - opt = "default is {}".format(default_value) + opt = f"default is {default_value}" s += "
<dt><tt>{}</tt> : {}{}</dt>\n".format(
- attr.name, display_attr_type(attr.type), " ({})".format(opt) if opt else ""
+ attr.name, display_attr_type(attr.type), f" ({opt})" if opt else ""
 )
- s += "<dd>{}</dd>\n".format(attr.description)
+ s += f"<dd>{attr.description}</dd>
\n" s += "\n" # inputs s += "\n#### Inputs" if schema.min_input != schema.max_input: - s += " ({} - {})".format(display_number(schema.min_input), display_number(schema.max_input)) + s += f" ({display_number(schema.min_input)} - {display_number(schema.max_input)})" s += "\n\n" inputs = schema.inputs @@ -200,15 +198,15 @@ def format_value(value): # type: (Any) -> Text option_str = " (variadic)" else: option_str = " (variadic, heterogeneous)" - s += "
<dt><tt>{}</tt>{} : {}</dt>\n".format(inp.name, option_str, inp.typeStr)
- s += "<dd>{}</dd>\n".format(inp.description)
+ s += f"<dt><tt>{inp.name}</tt>{option_str} : {inp.typeStr}</dt>\n"
+ s += f"<dd>{inp.description}</dd>
\n" s += "\n" # outputs s += "\n#### Outputs" if schema.min_output != schema.max_output: - s += " ({} - {})".format(display_number(schema.min_output), display_number(schema.max_output)) + s += f" ({display_number(schema.min_output)} - {display_number(schema.max_output)})" s += "\n\n" outputs = schema.outputs if outputs: @@ -222,8 +220,8 @@ def format_value(value): # type: (Any) -> Text option_str = " (variadic)" else: option_str = " (variadic, heterogeneous)" - s += "
<dt><tt>{}</tt>{} : {}</dt>\n".format(output.name, option_str, output.typeStr)
- s += "<dd>{}</dd>\n".format(output.description)
+ s += f"<dt><tt>{output.name}</tt>{option_str} : {output.typeStr}</dt>\n"
+ s += f"<dd>{output.description}</dd>
\n" s += "\n" @@ -238,10 +236,10 @@ def format_value(value): # type: (Any) -> Text allowed_type_str = "" if len(allowed_types) > 0: allowed_type_str = allowed_types[0] - for allowedType in allowed_types[1:]: + for allowedType in allowed_types[1:]: # noqa: N806 allowed_type_str += ", " + allowedType - s += "
<dt><tt>{}</tt> : {}</dt>\n".format(type_constraint.type_param_str, allowed_type_str)
- s += "<dd>{}</dd>\n".format(type_constraint.description)
+ s += f"<dt><tt>{type_constraint.type_param_str}</tt> : {allowed_type_str}</dt>\n"
+ s += f"<dd>{type_constraint.description}</dd>
\n" s += "\n" return s @@ -251,7 +249,7 @@ def display_function(function, versions, domain=ONNX_DOMAIN): # type: (Function s = "" if domain: - domain_prefix = "{}.".format(ONNX_ML_DOMAIN) + domain_prefix = f"{ONNX_ML_DOMAIN}." else: domain_prefix = "" @@ -263,8 +261,8 @@ def display_function(function, versions, domain=ONNX_DOMAIN): # type: (Function # since version s += "\n#### Version\n" - s += "\nThis version of the function has been available since version {}".format(function.since_version) - s += " of {}.\n".format(display_domain(domain_prefix)) + s += f"\nThis version of the function has been available since version {function.since_version}" + s += f" of {display_domain(domain_prefix)}.\n" if len(versions) > 1: s += "\nOther versions of this function: {}\n".format( ", ".join( @@ -280,7 +278,7 @@ def display_function(function, versions, domain=ONNX_DOMAIN): # type: (Function if function.input: s += "
<dl>\n"
 for input in function.input:
- s += "<dd>{};</dd>\n".format(input)
+ s += f"<dd>{input};</dd>\n"
 s += "</dl>
\n" # outputs @@ -289,7 +287,7 @@ def display_function(function, versions, domain=ONNX_DOMAIN): # type: (Function if function.output: s += "
<dl>\n"
 for output in function.output:
- s += "<dd>{};</dd>\n".format(output)
+ s += f"<dd>{output};</dd>\n"
 s += "</dl>
\n" # attributes @@ -297,7 +295,7 @@ def display_function(function, versions, domain=ONNX_DOMAIN): # type: (Function s += "\n#### Attributes\n\n" s += "
<dl>\n"
 for attr in function.attribute:
- s += "<dd>{};</dd>\n".format(attr)
+ s += f"<dd>{attr};</dd>\n"
 s += "</dl>
\n" return s @@ -313,8 +311,7 @@ def support_level_str(level): # type: (OpSchema.SupportType) -> Text def main(output_path: str, domain_filter: [str]): - - with io.open(output_path, "w", newline="", encoding="utf-8") as fout: + with open(output_path, "w", newline="", encoding="utf-8") as fout: fout.write("## Contrib Operator Schemas\n") fout.write( "*This file is automatically generated from the registered contrib operator schemas by " @@ -325,7 +322,7 @@ def main(output_path: str, domain_filter: [str]): # domain -> support level -> name -> [schema] index = defaultdict( lambda: defaultdict(lambda: defaultdict(list)) - ) # type: Dict[Text, Dict[int, Dict[Text, List[OpSchema]]]] # noqa: E501 + ) # type: Dict[Text, Dict[int, Dict[Text, List[OpSchema]]]] for schema in rtpy.get_all_operator_schema(): index[schema.domain][int(schema.support_level)][schema.name].append(schema) @@ -336,7 +333,7 @@ def main(output_path: str, domain_filter: [str]): # [(domain, [(support_level, [(schema name, current schema, all versions schemas)])])] operator_schemas = ( list() - ) # type: List[Tuple[Text, List[Tuple[int, List[Tuple[Text, OpSchema, List[OpSchema]]]]]]] # noqa: E501 + ) # type: List[Tuple[Text, List[Tuple[int, List[Tuple[Text, OpSchema, List[OpSchema]]]]]]] exsting_ops = set() # type: Set[Text] for domain, _supportmap in sorted(index.items()): if not should_render_domain(domain, domain_filter): @@ -357,11 +354,11 @@ def main(output_path: str, domain_filter: [str]): # Table of contents for domain, supportmap in operator_schemas: - s = "* {}\n".format(display_domain_short(domain)) + s = f"* {display_domain_short(domain)}\n" fout.write(s) for _, namemap in supportmap: - for n, schema, versions in namemap: + for n, schema, versions in namemap: # noqa: B007 s = ' * {}{}\n'.format( support_level_str(schema.support_level), format_name_with_domain(domain, n), @@ -372,7 +369,7 @@ def main(output_path: str, domain_filter: [str]): fout.write("\n") for domain, supportmap in operator_schemas: - s = "## {}\n".format(display_domain_short(domain)) + s = f"## {display_domain_short(domain)}\n" fout.write(s) for _, namemap in supportmap: diff --git a/tools/python/gen_opkernel_doc.py b/tools/python/gen_opkernel_doc.py index e399b00c97fcd..6760490c2d7eb 100644 --- a/tools/python/gen_opkernel_doc.py +++ b/tools/python/gen_opkernel_doc.py @@ -3,7 +3,7 @@ # Licensed under the MIT License. import argparse -import io +import io # noqa: F401 import os import pathlib from collections import defaultdict @@ -61,10 +61,9 @@ def expand_providers(provider_filter: [str]): def main(output_path: pathlib.Path, provider_filter: [str]): - providers = expand_providers(provider_filter) - with io.open(output_path, "w", newline="", encoding="utf-8") as fout: + with open(output_path, "w", newline="", encoding="utf-8") as fout: fout.write("## Supported Operators and Data Types\n") fout.write( "*This file is automatically generated from the registered kernels by " @@ -87,7 +86,7 @@ def main(output_path: pathlib.Path, provider_filter: [str]): firstinput = False else: paramstr += "
" - paramstr += "*in* {}:**{}**".format(inp.name, inp.typeStr) + paramstr += f"*in* {inp.name}:**{inp.typeStr}**" outputs = schema.outputs if outputs: @@ -96,7 +95,7 @@ def main(output_path: pathlib.Path, provider_filter: [str]): firstinput = False else: paramstr += "
" - paramstr += "*out* {}:**{}**".format(outp.name, outp.typeStr) + paramstr += f"*out* {outp.name}:**{outp.typeStr}**" paramstr += "" paramset = paramdict.get(fullname, None) @@ -117,15 +116,15 @@ def main(output_path: pathlib.Path, provider_filter: [str]): for provider in sorted(index.keys()): if providers and provider.lower() not in providers: continue - fout.write("- [{}](#{})\n".format(provider, provider.lower())) + fout.write(f"- [{provider}](#{provider.lower()})\n") fout.write("\n---------------") for provider, domainmap in sorted(index.items()): if providers and provider.lower() not in providers: continue - fout.write('\n\n\n\n'.format(provider.lower())) - fout.write("## Operators implemented by {}\n\n".format(provider)) + fout.write(f'\n\n\n\n') + fout.write(f"## Operators implemented by {provider}\n\n") fout.write("| Op Name | Parameters | OpSet Version | Types Supported |\n") fout.write("|---------|------------|---------------|-----------------|\n") for domain, namemap in sorted(domainmap.items()): diff --git a/tools/python/gen_ort_mobile_pkg_doc.py b/tools/python/gen_ort_mobile_pkg_doc.py index db9ca027352f1..482cb05bb50b9 100644 --- a/tools/python/gen_ort_mobile_pkg_doc.py +++ b/tools/python/gen_ort_mobile_pkg_doc.py @@ -25,7 +25,7 @@ def generate_docs(output_file, required_ops, op_type_impl_filter): assert op_type_impl_filter.__class__ is GloballyAllowedTypesOpTypeImplFilter global_types = op_type_impl_filter.global_type_list() for type in sorted(global_types): - out.write(" - {}\n".format(type)) + out.write(f" - {type}\n") out.write("\n") out.write("NOTE: Operators used to manipulate dimensions and indices will support int32 and int64.\n\n") @@ -36,7 +36,7 @@ def generate_docs(output_file, required_ops, op_type_impl_filter): for opset in sorted(required_ops[domain].keys()): str_opset = str(opset) for op in required_ops[domain][opset]: - op_with_domain = "{}:{}".format(domain, op) + op_with_domain = f"{domain}:{op}" if op_with_domain not in op_opsets: op_opsets[op_with_domain] = [] @@ -46,7 +46,7 @@ def generate_docs(output_file, required_ops, op_type_impl_filter): out.write("|Operator|Opsets|\n") out.write("|--------|------|\n") for domain, op_opsets in domain_op_opsets: - out.write("|**{}**||\n".format(domain)) + out.write(f"|**{domain}**||\n") for op in sorted(op_opsets.keys()): out.write("|{}|{}|\n".format(op, ", ".join(op_opsets[op]))) out.write("|||\n") diff --git a/tools/python/onnx_test_data_utils.py b/tools/python/onnx_test_data_utils.py index c7670e4a84258..399fd36d4a7a0 100644 --- a/tools/python/onnx_test_data_utils.py +++ b/tools/python/onnx_test_data_utils.py @@ -20,8 +20,8 @@ def dump_tensorproto_pb_file(filename): """Dump the data from a pb file containing a TensorProto.""" name, data = read_tensorproto_pb_file(filename) - print("Name: {}".format(name)) - print("Shape: {}".format(data.shape)) + print(f"Name: {name}") + print(f"Shape: {data.shape}") print(data) diff --git a/tools/python/ort_test_dir_utils.py b/tools/python/ort_test_dir_utils.py index 1b5525b9169d2..710390d26fe8e 100644 --- a/tools/python/ort_test_dir_utils.py +++ b/tools/python/ort_test_dir_utils.py @@ -17,9 +17,9 @@ def _get_numpy_type(model_info, name): if type_name == "tensor_type": return onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[i.type.tensor_type.elem_type] else: - raise ValueError("Type is not handled: {}".format(type_name)) + raise ValueError(f"Type is not handled: {type_name}") - raise ValueError("{} was not found in the model info.".format(name)) + raise ValueError(f"{name} was not found in 
the model info.") def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values_map, initializer_set): @@ -39,7 +39,7 @@ def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values continue input_type = input.type.WhichOneof("value") if input_type != "tensor_type": - raise ValueError("Unsupported model. Need to handle input type of {}".format(input_type)) + raise ValueError(f"Unsupported model. Need to handle input type of {input_type}") shape = input.type.tensor_type.shape dims = [] @@ -49,7 +49,7 @@ def _create_missing_input_data(model_inputs, name_input_map, symbolic_dim_values dims.append(dim.dim_value) elif dim_type == "dim_param": if dim.dim_param not in symbolic_dim_values_map: - raise ValueError("Value for symbolic dim {} was not provided.".format(dim.dim_param)) + raise ValueError(f"Value for symbolic dim {dim.dim_param} was not provided.") dims.append(symbolic_dim_values_map[dim.dim_param]) else: @@ -119,7 +119,7 @@ def save_data(prefix, name_data_map, model_info): else: np_type = _get_numpy_type(model_info, name) tensor = numpy_helper.from_array(data.astype(np_type), name) - filename = os.path.join(test_data_dir, "{}_{}.pb".format(prefix, idx)) + filename = os.path.join(test_data_dir, f"{prefix}_{idx}.pb") with open(filename, "wb") as f: f.write(tensor.SerializeToString()) @@ -196,18 +196,18 @@ def run_test_dir(model_or_dir): "'Please provide specific .onnx or .ort file as input.".format(model_dir) ) elif len(models) == 0: - raise ValueError("'No .onnx or .ort files found in {}.".format(model_dir)) + raise ValueError(f"'No .onnx or .ort files found in {model_dir}.") model_path = models[0] else: model_path = os.path.abspath(model_or_dir) model_dir = os.path.dirname(model_path) - print("Running tests in {} for {}".format(model_dir, model_path)) + print(f"Running tests in {model_dir} for {model_path}") test_dirs = [d for d in glob.glob(os.path.join(model_dir, "test*")) if os.path.isdir(d)] if not test_dirs: - raise ValueError("No directories with name starting with 'test' were found in {}.".format(model_dir)) + raise ValueError(f"No directories with name starting with 'test' were found in {model_dir}.") sess = ort.InferenceSession(model_path) @@ -237,11 +237,11 @@ def run_test_dir(model_or_dir): if expected.dtype.char in np.typecodes["AllFloat"]: if not np.isclose(expected, actual, rtol=1.0e-3, atol=1.0e-3).all(): - print("Mismatch for {}:\nExpected:{}\nGot:{}".format(output_names[idx], expected, actual)) + print(f"Mismatch for {output_names[idx]}:\nExpected:{expected}\nGot:{actual}") failed = True else: if not np.equal(expected, actual).all(): - print("Mismatch for {}:\nExpected:{}\nGot:{}".format(output_names[idx], expected, actual)) + print(f"Mismatch for {output_names[idx]}:\nExpected:{expected}\nGot:{actual}") failed = True if failed: raise ValueError("FAILED due to output mismatch.") diff --git a/tools/python/run_CIs_for_external_pr.py b/tools/python/run_CIs_for_external_pr.py index fe45f72e0d6f5..275faadc89ce5 100644 --- a/tools/python/run_CIs_for_external_pr.py +++ b/tools/python/run_CIs_for_external_pr.py @@ -27,7 +27,7 @@ def parse_args(): def run_gh_pr_command(command: typing.List[str]): try: - return subprocess.run(["gh", "pr"] + command, capture_output=True, text=True, check=True) + return subprocess.run(["gh", "pr", *command], capture_output=True, text=True, check=True) except subprocess.CalledProcessError as cpe: print(cpe) print(cpe.stderr) diff --git a/tools/python/run_android_emulator.py 
b/tools/python/run_android_emulator.py index cecb83b68e060..69fa88bd082dc 100755 --- a/tools/python/run_android_emulator.py +++ b/tools/python/run_android_emulator.py @@ -80,10 +80,10 @@ def main(): emulator_proc = android.start_emulator(**start_emulator_args) with open(args.emulator_pid_file, mode="w") as emulator_pid_file: - print("{}".format(emulator_proc.pid), file=emulator_pid_file) + print(f"{emulator_proc.pid}", file=emulator_pid_file) elif args.stop: - with open(args.emulator_pid_file, mode="r") as emulator_pid_file: + with open(args.emulator_pid_file) as emulator_pid_file: emulator_pid = int(emulator_pid_file.readline().strip()) android.stop_emulator(emulator_pid) diff --git a/tools/python/sparsify_initializers.py b/tools/python/sparsify_initializers.py index 17bddae6bbe40..8f5034c4ef5cc 100644 --- a/tools/python/sparsify_initializers.py +++ b/tools/python/sparsify_initializers.py @@ -9,15 +9,15 @@ import argparse import logging import sys -from typing import List, Tuple +from typing import List, Tuple # noqa: F401 import numpy as np import onnx -from onnx import ModelProto, SparseTensorProto, TensorProto, numpy_helper +from onnx import ModelProto, SparseTensorProto, TensorProto, numpy_helper # noqa: F401 logger = logging.getLogger(__name__) -real_types = set((int(TensorProto.FLOAT), int(TensorProto.DOUBLE))) +real_types = {int(TensorProto.FLOAT), int(TensorProto.DOUBLE)} def parse_arguments(): diff --git a/tools/python/update_version.py b/tools/python/update_version.py index ca94bacd03b9a..415f83b043bc2 100755 --- a/tools/python/update_version.py +++ b/tools/python/update_version.py @@ -103,7 +103,7 @@ def run(args, cwd): from util import is_windows, run if is_windows(): - args = ["cmd", "/c"] + args + args = ["cmd", "/c", *args] run(*args, cwd=cwd) # check if node, npm and yarn are installed diff --git a/tools/python/util/__init__.py b/tools/python/util/__init__.py index 92209c863f147..0a791165dd151 100644 --- a/tools/python/util/__init__.py +++ b/tools/python/util/__init__.py @@ -1,21 +1,21 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -from .get_azcopy import get_azcopy +from .get_azcopy import get_azcopy # noqa: F401 from .logger import get_logger -from .platform_helpers import is_linux, is_macOS, is_windows -from .run import run +from .platform_helpers import is_linux, is_macOS, is_windows # noqa: F401 +from .run import run # noqa: F401 try: - import flatbuffers # noqa + import flatbuffers # noqa: F401 - from .reduced_build_config_parser import parse_config + from .reduced_build_config_parser import parse_config # noqa: F401 except ImportError: get_logger("tools_python_utils").info("flatbuffers module is not installed. parse_config will not be available") # see if we can make the pytorch helpers available. 
-import importlib.util # noqa +import importlib.util have_torch = importlib.util.find_spec("torch") if have_torch: - from .pytorch_export_helpers import infer_input_info + from .pytorch_export_helpers import infer_input_info # noqa: F401 diff --git a/tools/python/util/__init__append.py b/tools/python/util/__init__append.py index 266fde0a75ae9..8bae45bed8c6c 100644 --- a/tools/python/util/__init__append.py +++ b/tools/python/util/__init__append.py @@ -3,4 +3,4 @@ have_torch = importlib.util.find_spec("torch") if have_torch: - from .pytorch_export_helpers import infer_input_info # noqa + from .pytorch_export_helpers import infer_input_info # noqa: F401 diff --git a/tools/python/util/android/__init__.py b/tools/python/util/android/__init__.py index 915cda58f5321..104e3738f96dd 100644 --- a/tools/python/util/android/__init__.py +++ b/tools/python/util/android/__init__.py @@ -1,4 +1,10 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -from .android import SdkToolPaths, create_virtual_device, get_sdk_tool_paths, start_emulator, stop_emulator +from .android import ( # noqa: F401 + SdkToolPaths, + create_virtual_device, + get_sdk_tool_paths, + start_emulator, + stop_emulator, +) diff --git a/tools/python/util/android/android.py b/tools/python/util/android/android.py index 06922ed1dfa2a..0baa21179d32d 100644 --- a/tools/python/util/android/android.py +++ b/tools/python/util/android/android.py @@ -23,7 +23,7 @@ def get_sdk_tool_paths(sdk_root: str): def filename(name, windows_extension): if is_windows(): - return "{}.{}".format(name, windows_extension) + return f"{name}.{windows_extension}" else: return name @@ -33,9 +33,9 @@ def resolve_path(dirnames, basename): path = shutil.which(os.path.join(dirname, basename)) if path is not None: path = os.path.realpath(path) - _log.debug("Found {} at {}".format(basename, path)) + _log.debug(f"Found {basename} at {path}") return path - raise FileNotFoundError("Failed to resolve path for {}".format(basename)) + raise FileNotFoundError(f"Failed to resolve path for {basename}") return SdkToolPaths( emulator=resolve_path([os.path.join(sdk_root, "emulator")], filename("emulator", "exe")), @@ -71,7 +71,7 @@ def create_virtual_device(sdk_tool_paths: SdkToolPaths, system_image_package_nam def _start_process(*args) -> subprocess.Popen: - _log.debug("Starting process - args: {}".format([*args])) + _log.debug(f"Starting process - args: {[*args]}") return subprocess.Popen([*args], creationflags=_process_creationflags) @@ -79,7 +79,7 @@ def _start_process(*args) -> subprocess.Popen: def _stop_process(proc: subprocess.Popen): - _log.debug("Stopping process - args: {}".format(proc.args)) + _log.debug(f"Stopping process - args: {proc.args}") proc.send_signal(_stop_signal) try: @@ -91,7 +91,7 @@ def _stop_process(proc: subprocess.Popen): def _stop_process_with_pid(pid: int): # not attempting anything fancier than just sending _stop_signal for now - _log.debug("Stopping process - pid: {}".format(pid)) + _log.debug(f"Stopping process - pid: {pid}") os.kill(pid, _stop_signal) @@ -135,12 +135,12 @@ def start_emulator( if emulator_ret is not None: # emulator exited early - raise RuntimeError("Emulator exited early with return code: {}".format(emulator_ret)) + raise RuntimeError(f"Emulator exited early with return code: {emulator_ret}") if waiter_ret is not None: if waiter_ret == 0: break - raise RuntimeError("Waiter process exited with return code: {}".format(waiter_ret)) + raise RuntimeError(f"Waiter process exited with return code: 
{waiter_ret}") time.sleep(sleep_interval_seconds) diff --git a/tools/python/util/check_onnx_model_mobile_usability.py b/tools/python/util/check_onnx_model_mobile_usability.py index f8095dd4d1bd4..c295fe52f69f7 100644 --- a/tools/python/util/check_onnx_model_mobile_usability.py +++ b/tools/python/util/check_onnx_model_mobile_usability.py @@ -8,7 +8,7 @@ # need this before the mobile helper imports for some reason logging.basicConfig(format="%(levelname)s: %(message)s") -from .mobile_helpers import check_model_can_use_ort_mobile_pkg, usability_checker # noqa +from .mobile_helpers import check_model_can_use_ort_mobile_pkg, usability_checker # noqa: E402 def check_usability(): diff --git a/tools/python/util/convert_onnx_models_to_ort.py b/tools/python/util/convert_onnx_models_to_ort.py index 9331c48d0c5f5..2db974643c0f5 100644 --- a/tools/python/util/convert_onnx_models_to_ort.py +++ b/tools/python/util/convert_onnx_models_to_ort.py @@ -83,7 +83,6 @@ def _convert( target_platform: str, session_options_config_entries: typing.Dict[str, str], ) -> typing.List[pathlib.Path]: - model_dir = model_path_or_dir if model_path_or_dir.is_dir() else model_path_or_dir.parent output_dir = output_dir or model_dir @@ -102,7 +101,7 @@ def is_model_file_to_convert(file_path: pathlib.Path): models = files_from_file_or_dir(model_path_or_dir, is_model_file_to_convert) if len(models) == 0: - raise ValueError("No model files were found in '{}'".format(model_path_or_dir)) + raise ValueError(f"No model files were found in '{model_path_or_dir}'") providers = ["CPUExecutionProvider"] @@ -118,7 +117,6 @@ def is_model_file_to_convert(file_path: pathlib.Path): for model in models: try: - relative_model_path = model.relative_to(model_dir) (output_dir / relative_model_path).parent.mkdir(parents=True, exist_ok=True) @@ -142,7 +140,7 @@ def is_model_file_to_convert(file_path: pathlib.Path): # Limit the optimizations to those that can run in a model with runtime optimizations. so.add_session_config_entry("optimization.minimal_build_optimizations", "apply") - print("Saving optimized ONNX model {} to {}".format(model, optimized_target_path)) + print(f"Saving optimized ONNX model {model} to {optimized_target_path}") _ = ort.InferenceSession( str(model), sess_options=so, providers=providers, disabled_optimizers=optimizer_filter ) @@ -155,7 +153,7 @@ def is_model_file_to_convert(file_path: pathlib.Path): if optimization_style == OptimizationStyle.Runtime: so.add_session_config_entry("optimization.minimal_build_optimizations", "save") - print("Converting optimized ONNX model {} to ORT format model {}".format(model, ort_target_path)) + print(f"Converting optimized ONNX model {model} to ORT format model {ort_target_path}") _ = ort.InferenceSession( str(model), sess_options=so, providers=providers, disabled_optimizers=optimizer_filter ) @@ -167,11 +165,11 @@ def is_model_file_to_convert(file_path: pathlib.Path): # print("Serialized {} to {}. 
Sizes: orig={} new={} diff={} new:old={:.4f}:1.0".format( # onnx_target_path, ort_target_path, orig_size, new_size, new_size - orig_size, new_size / orig_size)) except Exception as e: - print("Error converting {}: {}".format(model, e)) + print(f"Error converting {model}: {e}") if not allow_conversion_failures: raise - print("Converted {}/{} models successfully.".format(len(converted_models), len(models))) + print(f"Converted {len(converted_models)}/{len(models)} models successfully.") return converted_models @@ -280,10 +278,10 @@ def convert_onnx_models_to_ort(): custom_op_library = args.custom_op_library.resolve() if args.custom_op_library else None if not model_path_or_dir.is_dir() and not model_path_or_dir.is_file(): - raise FileNotFoundError("Model path '{}' is not a file or directory.".format(model_path_or_dir)) + raise FileNotFoundError(f"Model path '{model_path_or_dir}' is not a file or directory.") if custom_op_library and not custom_op_library.is_file(): - raise FileNotFoundError("Unable to find custom operator library '{}'".format(custom_op_library)) + raise FileNotFoundError(f"Unable to find custom operator library '{custom_op_library}'") session_options_config_entries = {} diff --git a/tools/python/util/get_azcopy.py b/tools/python/util/get_azcopy.py index 15aa9ca67e2de..76c75ad8c60eb 100644 --- a/tools/python/util/get_azcopy.py +++ b/tools/python/util/get_azcopy.py @@ -27,7 +27,7 @@ def _check_version(azcopy_path): - proc = subprocess.run([azcopy_path, "--version"], stdout=subprocess.PIPE, universal_newlines=True) + proc = subprocess.run([azcopy_path, "--version"], stdout=subprocess.PIPE, text=True) match = re.search(r"\d+(?:\.\d+)+", proc.stdout) if not match: @@ -41,7 +41,7 @@ def _find_azcopy(start_dir): for file_name in file_names: if file_name == "azcopy" or file_name == "azcopy.exe": return os.path.join(root, file_name) - raise RuntimeError("Failed to azcopy in '{}'.".format(start_dir)) + raise RuntimeError(f"Failed to azcopy in '{start_dir}'.") @contextlib.contextmanager @@ -67,7 +67,7 @@ def get_azcopy(local_azcopy_path="azcopy"): assert len(download_basename) > 0 downloaded_path = os.path.join(temp_dir, download_basename) - _log.info("Downloading azcopy from '{}'...".format(download_url)) + _log.info(f"Downloading azcopy from '{download_url}'...") urllib.request.urlretrieve(download_url, downloaded_path) extracted_path = os.path.join(temp_dir, "azcopy") diff --git a/tools/python/util/mobile_helpers/usability_checker.py b/tools/python/util/mobile_helpers/usability_checker.py index f286544fa510e..9ec2bd3681792 100644 --- a/tools/python/util/mobile_helpers/usability_checker.py +++ b/tools/python/util/mobile_helpers/usability_checker.py @@ -32,7 +32,7 @@ def __init__(self, filename): self._ops = {} # op to caveats self._ops_seen = set() - with open(filename, "r") as f: + with open(filename) as f: for line in f.readlines(): # we're looking for a markdown table with 2 columns. first is op name. second is caveats # op name is domain:op @@ -85,7 +85,6 @@ def __init__(self): self.nodes_unsupported_due_to_dynamic_input = -1 def suitability(self): - # for now add up all the nodes. if there are subgraphs, the percentage of covered nodes will be reduced by all # nodes in the subgraphs. 
        num_nodes = self.num_nodes + self.num_nodes_in_subgraphs
@@ -465,7 +464,6 @@ def check_shapes(graph: onnx.GraphProto, logger: logging.Logger = None):


 def checker(model_path, logger: logging.Logger):
-
     model = onnx.load(model_path)
     model_with_shape_info = onnx.shape_inference.infer_shapes(model)
diff --git a/tools/python/util/onnx_model_utils.py b/tools/python/util/onnx_model_utils.py
index 2d022eaf0ec36..8d30f2fa89aad 100644
--- a/tools/python/util/onnx_model_utils.py
+++ b/tools/python/util/onnx_model_utils.py
@@ -254,8 +254,8 @@ def _create_producer_consumer_link(


 def _map_node_dependencies(graph: onnx.GraphProto, node_to_producers: dict, node_to_consumers: dict):
-    graph_inputs = set([i.name for i in graph.input])
-    initializers = set([i.name for i in graph.initializer])
+    graph_inputs = {i.name for i in graph.input}
+    initializers = {i.name for i in graph.initializer}

     # map of value name to node that creates it. copy parent values but override if values get shadowed
     producers = {}
diff --git a/tools/python/util/ort_format_model/__init__.py b/tools/python/util/ort_format_model/__init__.py
index dd1bcbb277226..318851642d6e5 100644
--- a/tools/python/util/ort_format_model/__init__.py
+++ b/tools/python/util/ort_format_model/__init__.py
@@ -18,10 +18,8 @@
 sys.path.append(ort_fbs_py_parent_dir)

-from .operator_type_usage_processors import (  # noqa
-    GloballyAllowedTypesOpTypeImplFilter,
-    OperatorTypeUsageManager,
-    OpTypeImplFilterInterface,
-)
-from .ort_model_processor import OrtFormatModelProcessor  # noqa
-from .utils import create_config_from_models  # noqa
+from .operator_type_usage_processors import GloballyAllowedTypesOpTypeImplFilter  # noqa: E402, F401
+from .operator_type_usage_processors import OperatorTypeUsageManager  # noqa: E402, F401
+from .operator_type_usage_processors import OpTypeImplFilterInterface  # noqa: E402, F401
+from .ort_model_processor import OrtFormatModelProcessor  # noqa: E402, F401
+from .utils import create_config_from_models  # noqa: E402, F401
diff --git a/tools/python/util/ort_format_model/operator_type_usage_processors.py b/tools/python/util/ort_format_model/operator_type_usage_processors.py
index 8f21298518f87..f38bdeae75974 100644
--- a/tools/python/util/ort_format_model/operator_type_usage_processors.py
+++ b/tools/python/util/ort_format_model/operator_type_usage_processors.py
@@ -11,7 +11,7 @@


 def _create_op_key(domain: str, optype: str):
-    return "{}:{}".format(domain, optype)
+    return f"{domain}:{optype}"


 def _ort_constant_for_domain(domain: str):
@@ -26,7 +26,7 @@ def _ort_constant_for_domain(domain: str):
     domain_to_constant_map = {"ai.onnx": "kOnnxDomain", "ai.onnx.ml": "kMLDomain", "com.microsoft": "kMSDomain"}

     if domain not in domain_to_constant_map:
-        raise ValueError("Domain {} not found in map to ONNX Runtime constant. Please update map.".format(domain))
+        raise ValueError(f"Domain {domain} not found in map to ONNX Runtime constant. Please update map.")

     return domain_to_constant_map[domain]
@@ -76,7 +76,7 @@ def is_typed_registration_needed(
         :return: True is required. False if not.
         """
         # Not all operators have typed registrations, so this is optionally implemented by derived classes
-        raise RuntimeError("Did not expect processor for {} to have typed registrations.".format(self.name))
+        raise RuntimeError(f"Did not expect processor for {self.name} to have typed registrations.")

     def get_cpp_entry(self):
         """
@@ -113,10 +113,10 @@ def __init__(
         self,
         domain: str,
         optype: str,
-        inputs: [int] = [0],
-        outputs: [int] = [],
-        required_input_types: typing.Dict[int, typing.Set[str]] = {},
-        required_output_types: typing.Dict[int, typing.Set[str]] = {},
+        inputs: [int] = [0],  # noqa: B006
+        outputs: [int] = [],  # noqa: B006
+        required_input_types: typing.Dict[int, typing.Set[str]] = {},  # noqa: B006
+        required_output_types: typing.Dict[int, typing.Set[str]] = {},  # noqa: B006
     ):
         """
         Create DefaultTypeUsageProcessor. Types for one or more inputs and/or outputs can be tracked by the processor.
@@ -166,7 +166,7 @@ def is_output_type_enabled(self, reg_type, index, allowed_type_set=None):
         return self._is_type_enabled(reg_type, index, self._required_output_types, allowed_type_set)

     def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
-        for i in self._input_types.keys():
+        for i in self._input_types:
             if i >= node.InputsLength():
                 # Some operators have fewer inputs in earlier versions where data that was as an attribute
                 # become an input in later versions to allow it to be dynamically provided. Allow for that.
@@ -178,7 +178,7 @@ def process_node(self, node: fbs.Node, value_name_to_typeinfo: dict):
             type_str = value_name_to_typestr(node.Inputs(i), value_name_to_typeinfo)
             self._input_types[i].add(type_str)

-        for o in self._output_types.keys():
+        for o in self._output_types:
             # Don't know of any ops where the number of outputs changed across versions, so require a valid length
             if o >= node.OutputsLength():
                 raise RuntimeError(
@@ -196,7 +196,7 @@ def is_typed_registration_needed(
         if 0 not in self._input_types.keys():
             # currently all standard typed registrations are for input 0.
             # custom registrations can be handled by operator specific processors (e.g. OneHotProcessor below).
-            raise RuntimeError("Expected typed registration to use type from input 0. Node:{}".format(self.name))
+            raise RuntimeError(f"Expected typed registration to use type from input 0. Node:{self.name}")

         return self.is_input_type_enabled(type_in_registration, 0, globally_allowed_types)
@@ -327,7 +327,7 @@ def from_config_entry(self, entry: str):
         self._triples.clear()
         aggregate_info = json.loads(entry)
         if "custom" in aggregate_info:
-            self._triples = set([tuple(triple) for triple in aggregate_info["custom"]])
+            self._triples = {tuple(triple) for triple in aggregate_info["custom"]}


 def _create_operator_type_usage_processors():
@@ -589,7 +589,6 @@ def restore_from_config_entry(self, domain: str, optype: str, config_entry: str)
         op_processor.from_config_entry(config_entry)

     def debug_dump(self):
-
         print("C++ code that will be emitted:")
         [print(cpp_line) for cpp_line in self.get_cpp_entries()]

@@ -597,7 +596,7 @@ def debug_dump(self):
         for key in sorted(self._operator_processors.keys()):
             entry = self._operator_processors[key].to_config_entry()
             if entry:
-                print("{} -> {}".format(key, entry))
+                print(f"{key} -> {entry}")

                 # roundtrip test to validate that we can initialize the processor from the entry and get the
                 # same values back
diff --git a/tools/python/util/ort_format_model/ort_model_processor.py b/tools/python/util/ort_format_model/ort_model_processor.py
index 7c65930e4cd0e..d3a07efe92aa5 100644
--- a/tools/python/util/ort_format_model/ort_model_processor.py
+++ b/tools/python/util/ort_format_model/ort_model_processor.py
@@ -17,15 +17,15 @@ def __init__(self, model_path: str, required_ops: dict, processors: OperatorType
         :param processors: Operator type usage processors which will be called for each matching Node.
         """
         self._required_ops = required_ops  # dictionary of {domain: {opset:[operators]}}
-        self._file = open(model_path, "rb").read()
+        self._file = open(model_path, "rb").read()  # noqa: SIM115
         self._buffer = bytearray(self._file)
         if not fbs.InferenceSession.InferenceSession.InferenceSessionBufferHasIdentifier(self._buffer, 0):
-            raise RuntimeError("File does not appear to be a valid ORT format model: '{}'".format(model_path))
+            raise RuntimeError(f"File does not appear to be a valid ORT format model: '{model_path}'")
         self._model = fbs.InferenceSession.InferenceSession.GetRootAsInferenceSession(self._buffer, 0).Model()
         self._op_type_processors = processors

     @staticmethod
-    def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo={}):
+    def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo={}):  # noqa: B006
         """
         Setup the node args for this level of Graph.
         We copy the current list which represents the outer scope values, and add the local node args to that
@@ -43,9 +43,9 @@ def _setup_type_info(graph: fbs.Graph, outer_scope_value_typeinfo={}):

     def _add_required_op(self, domain: str, opset: int, op_type: str):
         if domain not in self._required_ops:
-            self._required_ops[domain] = {opset: set([op_type])}
+            self._required_ops[domain] = {opset: {op_type}}
         elif opset not in self._required_ops[domain]:
-            self._required_ops[domain][opset] = set([op_type])
+            self._required_ops[domain][opset] = {op_type}
         else:
             self._required_ops[domain][opset].add(op_type)
diff --git a/tools/python/util/ort_format_model/types.py b/tools/python/util/ort_format_model/types.py
index 5cac69f4b3319..2b5407a503ccd 100644
--- a/tools/python/util/ort_format_model/types.py
+++ b/tools/python/util/ort_format_model/types.py
@@ -44,7 +44,7 @@ def typeinfo_to_str(type: fbs.TypeInfo):
             key_type_str = FbsTypeInfo.tensordatatype_to_string[key_type]
             value_type = map_type.ValueType()  # TypeInfo
             value_type_str = FbsTypeInfo.typeinfo_to_str(value_type)
-            type_str = "std::map<{},{}>".format(key_type_str, value_type_str)
+            type_str = f"std::map<{key_type_str},{value_type_str}>"

         elif value_type == fbs.TypeInfoValue.TypeInfoValue.sequence_type:
             sequence_type = fbs.SequenceType.SequenceType()
@@ -60,7 +60,7 @@ def typeinfo_to_str(type: fbs.TypeInfo):
             # due to this).
             type_str = elem_type_str
         else:
-            raise ValueError("Unknown or missing value type of {}".format(value_type))
+            raise ValueError(f"Unknown or missing value type of {value_type}")

         return type_str
diff --git a/tools/python/util/ort_format_model/utils.py b/tools/python/util/ort_format_model/utils.py
index 83ffaac5e2e80..5eb90bad68fa9 100644
--- a/tools/python/util/ort_format_model/utils.py
+++ b/tools/python/util/ort_format_model/utils.py
@@ -47,7 +47,7 @@ def create_config_from_models(
             for opset in sorted(required_ops[domain].keys()):
                 ops = required_ops[domain][opset]
                 if ops:
-                    out.write("{};{};".format(domain, opset))
+                    out.write(f"{domain};{opset};")

                     if enable_type_reduction:
                         # type string is empty if op hasn't been seen
                         entries = [
diff --git a/tools/python/util/platform_helpers.py b/tools/python/util/platform_helpers.py
index afb5b039d454e..bd5006cdf2373 100644
--- a/tools/python/util/platform_helpers.py
+++ b/tools/python/util/platform_helpers.py
@@ -8,7 +8,7 @@ def is_windows():
     return sys.platform.startswith("win")


-def is_macOS():
+def is_macOS():  # noqa: N802
     return sys.platform.startswith("darwin")
diff --git a/tools/python/util/pytorch_export_helpers.py b/tools/python/util/pytorch_export_helpers.py
index 0ab7689f378c3..293caad38b0d3 100644
--- a/tools/python/util/pytorch_export_helpers.py
+++ b/tools/python/util/pytorch_export_helpers.py
@@ -8,7 +8,7 @@


 def _parse_inputs_for_onnx_export(all_input_parameters, inputs, kwargs):
-    # extracted from https://github.com/microsoft/onnxruntime/blob/239c6ad3f021ff7cc2e6247eb074bd4208dc11e2/orttraining/orttraining/python/training/ortmodule/_io.py#L433 # noqa
+    # extracted from https://github.com/microsoft/onnxruntime/blob/239c6ad3f021ff7cc2e6247eb074bd4208dc11e2/orttraining/orttraining/python/training/ortmodule/_io.py#L433

     def _add_input(name, input):
         """Returns number of expanded inputs that _add_input processed"""
@@ -87,7 +87,7 @@ def _add_input(name, input):

 def _flatten_module_input(names, args, kwargs):
     """Flatten args and kwargs in a single tuple of tensors."""
-    # extracted from https://github.com/microsoft/onnxruntime/blob/239c6ad3f021ff7cc2e6247eb074bd4208dc11e2/orttraining/orttraining/python/training/ortmodule/_io.py#L110 # noqa
+    # extracted from https://github.com/microsoft/onnxruntime/blob/239c6ad3f021ff7cc2e6247eb074bd4208dc11e2/orttraining/orttraining/python/training/ortmodule/_io.py#L110

     def is_primitive_type(value):
         return type(value) in {int, bool, float}
diff --git a/tools/python/util/qdq_helpers/qdq_model_utils.py b/tools/python/util/qdq_helpers/qdq_model_utils.py
index 7aac6f892880b..d15c9d65d26a8 100644
--- a/tools/python/util/qdq_helpers/qdq_model_utils.py
+++ b/tools/python/util/qdq_helpers/qdq_model_utils.py
@@ -36,7 +36,7 @@ def _duplicate_dq_nodes_with_multiple_consumers(graph: onnx.GraphProto, **kwargs
     if nodes_to_update:
         dup_idx = 0
         new_graph = onnx.GraphProto()
-        graph_outputs = set([output.name for output in graph.output])
+        graph_outputs = {output.name for output in graph.output}
         for node in graph.node:
             new_graph.node.append(node)
             if node in nodes_to_update:
diff --git a/tools/python/util/qdq_helpers/test/test_qdq_model_utils.py b/tools/python/util/qdq_helpers/test/test_qdq_model_utils.py
index a360ffe91568a..d7cd7ac2005ba 100644
--- a/tools/python/util/qdq_helpers/test/test_qdq_model_utils.py
+++ b/tools/python/util/qdq_helpers/test/test_qdq_model_utils.py
@@ -17,7 +17,7 @@


 class TestQDQUtils(unittest.TestCase):
-    def test_fix_DQ_with_multiple_consumers(self):
+    def test_fix_DQ_with_multiple_consumers(self):  # noqa: N802
         """ """
         model_path = ort_root / "onnxruntime" / "test" / "testdata" / "qdq_with_multi_consumer_dq_nodes.onnx"
         model = onnx.load(str(model_path))
diff --git a/tools/python/util/reduced_build_config_parser.py b/tools/python/util/reduced_build_config_parser.py
index 3a2210ccda6e6..cb90026808fde 100644
--- a/tools/python/util/reduced_build_config_parser.py
+++ b/tools/python/util/reduced_build_config_parser.py
@@ -5,10 +5,10 @@

 # Check if the flatbuffers module is available. If not we cannot handle type reduction information in the config.
 try:
-    import flatbuffers  # noqa
+    import flatbuffers  # noqa: F401

     have_flatbuffers = True
-    from .ort_format_model import GloballyAllowedTypesOpTypeImplFilter, OperatorTypeUsageManager  # noqa
+    from .ort_format_model import GloballyAllowedTypesOpTypeImplFilter, OperatorTypeUsageManager
 except ImportError:
     have_flatbuffers = False
@@ -82,7 +82,7 @@ def parse_config(config_file: str, enable_type_reduction: bool = False):
     """

     if not os.path.isfile(config_file):
-        raise ValueError("Configuration file {} does not exist".format(config_file))
+        raise ValueError(f"Configuration file {config_file} does not exist")

     # only enable type reduction when flatbuffers is available
     enable_type_reduction = enable_type_reduction and have_flatbuffers
@@ -102,7 +102,7 @@ def process_non_op_line(line):
             nonlocal globally_allowed_types
             if globally_allowed_types is not None:
                 raise RuntimeError("Globally allowed types were already specified.")
-            globally_allowed_types = set(segment.strip() for segment in line.split(";")[1].split(","))
+            globally_allowed_types = {segment.strip() for segment in line.split(";")[1].split(",")}
             return True

         if line == "!no_ops_specified_means_all_ops_are_required":  # handle all ops required line
@@ -112,12 +112,12 @@ def process_non_op_line(line):
         return False

-    with open(config_file, "r") as config:
+    with open(config_file) as config:
         for line in [orig_line.strip() for orig_line in config.readlines()]:
             if process_non_op_line(line):
                 continue

-            domain, opset_str, operators_str = [segment.strip() for segment in line.split(";")]
+            domain, opset_str, operators_str = (segment.strip() for segment in line.split(";"))
             opsets = [int(s) for s in opset_str.split(",")]

             # any type reduction information is serialized json that starts/ends with { and }.
@@ -172,7 +172,7 @@ def process_non_op_line(line):
                     cur = end_str + 1

             else:
-                operators = set([op.strip() for op in operators_str.split(",")])
+                operators = {op.strip() for op in operators_str.split(",")}

             for opset in opsets:
                 if domain not in required_ops:
diff --git a/tools/python/util/run.py b/tools/python/util/run.py
index c3a389233ff72..838db8f789eac 100644
--- a/tools/python/util/run.py
+++ b/tools/python/util/run.py
@@ -40,7 +40,7 @@ def run(
     cmd = [*args]

     _log.info(
-        "Running subprocess in '{0}'\n {1}".format(cwd or os.getcwd(), " ".join([shlex.quote(arg) for arg in cmd]))
+        "Running subprocess in '{}'\n {}".format(cwd or os.getcwd(), " ".join([shlex.quote(arg) for arg in cmd]))
     )

     def output(is_stream_captured):
@@ -57,6 +57,6 @@ def output(is_stream_captured):
         shell=shell,
     )

-    _log.debug("Subprocess completed. Return code: {}".format(completed_process.returncode))
+    _log.debug(f"Subprocess completed. Return code: {completed_process.returncode}")

     return completed_process
diff --git a/tools/python/util/test/test_pytorch_export_helpers.py b/tools/python/util/test/test_pytorch_export_helpers.py
index e1201f9a63634..bf4315e27a03e 100644
--- a/tools/python/util/test/test_pytorch_export_helpers.py
+++ b/tools/python/util/test/test_pytorch_export_helpers.py
@@ -14,7 +14,7 @@

 class TestModel(torch.nn.Module):
     def __init__(self, D_in, H, D_out):
-        super(TestModel, self).__init__()
+        super().__init__()
         self.linear1 = torch.nn.Linear(D_in, H)
         self.linear2 = torch.nn.Linear(H, D_out)

From 5a2e43bdd5f03450015c7b3e33eb25264a8f73c0 Mon Sep 17 00:00:00 2001
From: Hector Li
Date: Fri, 24 Mar 2023 16:07:06 -0700
Subject: [PATCH 20/20] [QNN EP] Improve Slice to support opset 9 (#15186)

### Description
Improve Slice to support Onnx opset9 which has starts, ends & axes in
node attributes.

### Motivation and Context
To unblock some models.
---
 .../qnn/builder/opbuilder/slice_op_builder.cc | 47 ++++++++++++++-----
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index 71e3150ae74e2..ea8054a7a1f60 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -13,9 +13,6 @@
 namespace onnxruntime {
 namespace qnn {

-const int SLICE_MIN_INPUT = 3;
-const int SLICE_MAX_INPUT = 5;
-
 class SliceOpBuilder : public BaseOpBuilder {
  public:
   SliceOpBuilder() : BaseOpBuilder("SliceOpBuilder") {}
@@ -38,6 +35,10 @@ class SliceOpBuilder : public BaseOpBuilder {

  private:
  Status ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
+  void GetDataFromAttribute(const NodeUnit& node_unit,
+                            TensorShapeVector& raw_starts,
+                            TensorShapeVector& raw_ends,
+                            TensorShapeVector& raw_axes) const;
   typedef struct {
     int32_t begin, end, stride;
   } Range;
@@ -46,19 +47,36 @@ class SliceOpBuilder : public BaseOpBuilder {

 Status SliceOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const {
   size_t input_count = node_unit.Inputs().size();
-  ORT_RETURN_IF_NOT(input_count >= SLICE_MIN_INPUT && input_count <= SLICE_MAX_INPUT,
-                    "For ONNX Slice operation the expected number of inputs is between 3 and 5.");
-  // Skip the first input. All other input need to be initializer
-  for (size_t i = 1; i < node_unit.Inputs().size(); i++) {
-    const auto& next_input = node_unit.Inputs()[i].node_arg.Name();
-    if (!qnn_model_wrapper.IsInitializerInput(next_input)) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN desn't support dynamic slice.");
+  // Op set 9 only has 1 input with starts, ends, axes attribute
+  // Op set > 9, starts, ends, axes are from node input
+  if (input_count > 1) {
+    // Skip the first input. All other input need to be initializer
+    for (size_t i = 1; i < input_count; i++) {
+      const auto& next_input = node_unit.Inputs()[i].node_arg.Name();
+      if (!qnn_model_wrapper.IsInitializerInput(next_input)) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN desn't support dynamic slice.");
+      }
     }
   }

   return Status::OK();
 }

+void SliceOpBuilder::GetDataFromAttribute(const NodeUnit& node_unit,
+                                          TensorShapeVector& raw_starts,
+                                          TensorShapeVector& raw_ends,
+                                          TensorShapeVector& raw_axes) const {
+  NodeAttrHelper node_helper(node_unit);
+  auto starts = node_helper.Get("starts", std::vector{0});
+  raw_starts.assign(starts.begin(), starts.end());
+  auto ends = node_helper.Get("ends", std::vector{0});
+  raw_ends.assign(ends.begin(), ends.end());
+  if (node_helper.HasAttr("axes")) {
+    auto axes = node_helper.Get("axes", std::vector{0});
+    raw_axes.assign(axes.begin(), axes.end());
+  }
+}
+
 // Note: For ONNX Slice operation the expected number of inputs is between 3 and 5
 Status SliceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
                                      const NodeUnit& node_unit,
@@ -79,7 +97,13 @@ Status SliceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
   std::vector input0_shape;

   auto inputs = node_unit.Inputs();
-  for (size_t input_i = 0; input_i < inputs.size(); ++input_i) {
+  auto input_count = inputs.size();
+  // Opset 9, only 1 input, starts, ends, axes are in attribute
+  if (1 == input_count) {
+    GetDataFromAttribute(node_unit, raw_starts, raw_ends, raw_axes);
+  }
+
+  for (size_t input_i = 0; input_i < input_count; ++input_i) {
     auto& input_name = inputs[input_i].node_arg.Name();
     if (input_name.empty()) {
       // Ignore unspecified/unused optional input
@@ -144,7 +168,6 @@ Status SliceOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
         continue;
       }
     }
-    ORT_ENFORCE(input_i == 0, "QNN ReluMinMax operator expects only one input. other inputs, starts, ends, axes and steps are expected to be parameters, ie. initializer inputs in ONNX model");

     input0_shape = input_shape;
     input_names.push_back(input_name);