From 0f8c3a6a5d3413920dbcfd925efdcb2de02ba06a Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Tue, 4 Jun 2024 02:18:17 -0700 Subject: [PATCH] Adopt lintrunner and enable github actions lint checks 1/2 (#16) Signed-off-by: Justin Chu --- .github/workflows/lint.yaml | 84 +++++ .lintrunner.toml | 97 +++++ .pre-commit-config.yaml | 123 ------- .../weight_only/evaluation/accuracy.py | 45 +-- .../weight_only/evaluation/evaluator.py | 111 ++---- .../weight_only/evaluation/models/__init__.py | 1 - .../evaluation/models/huggingface.py | 336 ++++++------------ .../weight_only/evaluation/utils.py | 58 +-- .../llama/quantization/weight_only/main.py | 225 +++++------- .../quantization/weight_only/prepare_model.py | 4 +- .../algorithms/layer_wise/core.py | 6 +- .../algorithms/smoother/calibrator.py | 6 +- .../algorithms/smoother/core.py | 6 +- .../algorithms/weight_only/awq.py | 17 +- .../algorithms/weight_only/gptq.py | 9 +- .../algorithms/weight_only/rtn.py | 8 +- .../algorithms/weight_only/utility.py | 11 +- onnx_neural_compressor/config.py | 11 +- onnx_neural_compressor/onnx_model.py | 5 +- .../quantization/__init__.py | 4 +- .../quantization/algorithm_entry.py | 13 +- .../quantization/matmul_4bits_quantizer.py | 3 +- .../quantization/matmul_nbits_quantizer.py | 9 +- .../quantization/quantize.py | 3 +- onnx_neural_compressor/quantization/tuning.py | 6 +- onnx_neural_compressor/utility.py | 4 +- pyproject.toml | 4 - requirements-lintrunner.txt | 4 + setup.py | 12 +- .../layer_wise/test_layer_wise.py | 7 +- test/quantization/test_autotune.py | 6 +- test/quantization/test_config.py | 7 +- test/quantization/test_smooth_quant.py | 6 +- test/quantization/weight_only/test_awq.py | 10 +- test/quantization/weight_only/test_gptq.py | 10 +- test/quantization/weight_only/test_rtn.py | 9 +- test/utils/test_general.py | 4 +- 37 files changed, 546 insertions(+), 738 deletions(-) create mode 100644 .github/workflows/lint.yaml create mode 100644 .lintrunner.toml delete mode 100644 .pre-commit-config.yaml create mode 100644 requirements-lintrunner.txt diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..9839352d0 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,84 @@ +# Copyright (c) ONNX Neural Compressor Project Contributors +# +# SPDX-License-Identifier: Apache-2.0 + +name: Lint + +on: + push: + branches: + - main + pull_request: + merge_group: + +permissions: # set top-level default permissions as security best practice + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + optional-lint: + name: Optional Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: misspell # Check spellings as well + uses: reviewdog/action-misspell@5bd7be2fc7ae56a517184f5c4bbcf2fd7afe3927 # v1.17.0 + with: + github_token: ${{ secrets.github_token }} + locale: "US" + reporter: github-pr-check + level: info + filter_mode: diff_context + - name: shellcheck # Static check shell scripts + uses: reviewdog/action-shellcheck@72365a51bf6476fe952a117c3ff703eb7775e40a # v1.20.0 + with: + github_token: ${{ secrets.github_token }} + reporter: github-pr-check + level: info + filter_mode: diff_context + + enforce-style: + name: Enforce style + runs-on: ubuntu-latest + permissions: + security-events: write + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: 
Setup Python + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: "3.12" + - name: Install ONNX Neural Compressor + run: | + pip install . + - name: Install dependencies + run: | + python -m pip install lintrunner lintrunner-adapters + lintrunner init + - name: Run lintrunner on all files + run: | + set +e + if ! lintrunner --force-color --all-files --tee-json=lint.json -v; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/onnx/neural-compressor/blob/main/.lintrunner.toml for setup instructions.\e[0m" + exit 1 + fi + - name: Produce SARIF + if: always() + run: | + python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif + - name: Upload SARIF file + # Use always() to always upload SARIF even if lintrunner returns with error code + # To toggle linter comments in the files page, press `i` on the keyboard + if: always() + continue-on-error: true + uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4 + with: + # Path to SARIF file relative to the root of the repository + sarif_file: lintrunner.sarif + category: lintrunner + checkout_path: ${{ github.workspace }} diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 000000000..5a5298134 --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,97 @@ +# Configuration for lintrunner https://github.com/suo/lintrunner +# You can install the dependencies and initialize with +# +# ```sh +# pip install lintrunner lintrunner-adapters +# lintrunner init +# ``` +# +# This will install lintrunner on your system and download all the necessary +# dependencies to run linters locally. +# If you want to see what lintrunner init will install, run +# `lintrunner init --dry-run`. +# +# To lint local changes: +# +# ```bash +# lintrunner +# ``` +# +# To lint all files: +# +# ```bash +# lintrunner --all-files +# ``` +# +# To format files: +# +# ```bash +# lintrunner -a +# ``` +# +# To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). +# To update an existing linting rule or create a new one, modify this file or create a +# new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. 
+merge_base_with = 'main' + +[[linter]] +code = 'RUFF' +include_patterns = [ + '**/*.py', + '**/*.pyi', +] +exclude_patterns = [ + '*_pb2*', + '.setuptools-cmake-build/*', + 'docs/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ruff_linter', + '--config=pyproject.toml', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true + +[[linter]] +code = 'BLACK-ISORT' +include_patterns = [ + '**/*.py', +] +exclude_patterns = [ + '*_pb2*', + '.setuptools-cmake-build/*', + 'cmake/**', + 'docs/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'black_isort_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index a028ec376..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,123 +0,0 @@ -ci: - autofix_prs: true - autoupdate_schedule: quarterly - -exclude: | - (?x)^( - conda_meta/.+| - )$ - -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 - hooks: - - id: end-of-file-fixer - files: (.*\.(py|md|rst|yaml|yml))$ - exclude: | - (?x)^( - examples/.+ - )$ - - id: check-json - exclude: | - (?x)^( - .vscode/settings_recommended.json - )$ - - id: check-yaml - exclude: | - (?x)^( - conda_meta/| - )$ - - id: debug-statements - - id: file-contents-sorter - exclude: | - (?x)^( - examples/.+ - )$ - args: [--unique] - - id: requirements-txt-fixer - exclude: | - (?x)^( - examples/.+ - )$ - - id: trailing-whitespace - files: (.*\.(py|rst|cmake|yaml|yml))$ - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/asottile/yesqa - rev: v1.5.0 - hooks: - - id: yesqa - name: Unused noqa - - - repo: https://github.com/pycqa/isort - rev: 5.13.2 - hooks: - - id: isort - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 - hooks: - - id: docformatter - args: [ - --in-place, - --wrap-summaries=0, # 0 means disable wrap - --wrap-descriptions=0, # 0 means disable wrap - --black, - --style=google, - ] - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/psf/black.git - rev: 24.3.0 - hooks: - - id: black - files: (.*\.py)$ - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/asottile/blacken-docs - rev: 1.16.0 - hooks: - - id: blacken-docs - args: [--line-length=120, --skip-errors] - additional_dependencies: - - black==24.3.0 - exclude: | - (?x)^( - examples/.+| - docs/source-app - )$ - - - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - args: [-w] - additional_dependencies: - - tomli - exclude: | - (?x)^( - examples/.*(txt|patch)| - )$ - - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.5 - hooks: - - id: ruff - args: [--fix, --exit-non-zero-on-fix, --no-cache] - exclude: | - (?x)^( - examples/.+ - )$ diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py index 045d28c8b..5608307f6 100644 --- 
a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py @@ -13,19 +13,18 @@ # limitations under the License. +import glob import json import logging import os import re import sys -import glob from pathlib import Path -import numpy as np import lm_eval.logging_utils import lm_eval.tasks import lm_eval.utils - +import numpy as np from evaluation import evaluator DEFAULT_RESULTS_FILE = "results.json" @@ -52,9 +51,7 @@ def cli_evaluate(args) -> None: if args.predict_only: args.log_samples = True if (args.log_samples or args.predict_only) and not args.output_path: - raise ValueError( - "Specify --output_path if providing --log_samples or --predict_only" - ) + raise ValueError("Specify --output_path if providing --log_samples or --predict_only") if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") @@ -62,17 +59,14 @@ def cli_evaluate(args) -> None: if args.limit: eval_logger.warning( - " --limit SHOULD ONLY BE USED FOR TESTING." - "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." + " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." ) if args.tasks is None: eval_logger.error("Need to specify task to evaluate.") sys.exit() elif args.tasks == "list": - eval_logger.info( - "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)) - ) + eval_logger.info("Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))) sys.exit() else: if os.path.isdir(args.tasks): @@ -99,8 +93,8 @@ def cli_evaluate(args) -> None: f"{lm_eval.utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", ) raise ValueError( - f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks," + \ - " or '--verbosity DEBUG' to troubleshoot task registration issues." + f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks," + + " or '--verbosity DEBUG' to troubleshoot task registration issues." ) if args.output_path: @@ -110,9 +104,7 @@ def cli_evaluate(args) -> None: raise FileExistsError(f"File already exists at {path}") output_path_file = path.joinpath(DEFAULT_RESULTS_FILE) if output_path_file.is_file(): - eval_logger.warning( - f"File {output_path_file} already exists. Results will be overwritten." - ) + eval_logger.warning(f"File {output_path_file} already exists. 
Results will be overwritten.") # if path json then get parent dir elif path.suffix in (".json", ".jsonl"): output_path_file = path @@ -124,17 +116,12 @@ def cli_evaluate(args) -> None: # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args if args.trust_remote_code: os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code) - args.model_args = ( - args.model_args - + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}" - ) + args.model_args = args.model_args + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}" eval_logger.info(f"Selected Tasks: {task_names}") eval_logger.info("Loading selected tasks...") - request_caching_args = evaluator.request_caching_arg_to_dict( - cache_requests=args.cache_requests - ) + request_caching_args = evaluator.request_caching_arg_to_dict(cache_requests=args.cache_requests) results = evaluator.simple_evaluate( model=args.model, @@ -156,17 +143,15 @@ def cli_evaluate(args) -> None: random_seed=args.seed[0], numpy_random_seed=args.seed[1], torch_random_seed=args.seed[2], - user_model=args.user_model, # to validate the model in memory, - tokenizer=args.tokenizer, # to use tokenizer in mem, + user_model=args.user_model, # to validate the model in memory, + tokenizer=args.tokenizer, # to use tokenizer in mem, **request_caching_args, ) if results is not None: if args.log_samples: samples = results.pop("samples") - dumped = json.dumps( - results, indent=2, default=_handle_non_serializable, ensure_ascii=False - ) + dumped = json.dumps(results, indent=2, default=_handle_non_serializable, ensure_ascii=False) if args.show_config: print(dumped) @@ -187,9 +172,7 @@ def cli_evaluate(args) -> None: if args.log_samples: for task_name, config in results["configs"].items(): - output_name = "{}_{}".format( - re.sub("/|=", "__", args.model_args), task_name - ) + output_name = "{}_{}".format(re.sub("/|=", "__", args.model_args), task_name) filename = path.joinpath(f"{output_name}.jsonl") samples_dumped = json.dumps( samples[task_name], diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py index e4a92565b..2b4a8b2d2 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py @@ -13,36 +13,34 @@ # limitations under the License. 
+import collections import itertools import logging import random import time -import collections from typing import TYPE_CHECKING, List, Optional, Union -import numpy as np -import torch - import lm_eval.api.metrics import lm_eval.api.registry -import lm_eval.models import lm_eval.caching.cache import lm_eval.evaluator_utils import lm_eval.logging_utils +import lm_eval.models import lm_eval.utils - +import numpy as np import optimum.onnxruntime - +import torch from evaluation.models import huggingface if TYPE_CHECKING: import lm_eval.api.model import lm_eval.tasks + @lm_eval.utils.positional_deprecated def simple_evaluate( model, - model_args: Optional[Union[str, dict,object]] = None, + model_args: Optional[Union[str, dict, object]] = None, tasks: Optional[List[Union[str, dict, object]]] = None, num_fewshot: Optional[int] = None, batch_size: Optional[int] = None, @@ -152,9 +150,7 @@ def simple_evaluate( if tasks is None: tasks = [] if len(tasks) == 0: - raise ValueError( - "No tasks specified, or no tasks found. Please verify the task names." - ) + raise ValueError("No tasks specified, or no tasks found. Please verify the task names.") if gen_kwargs is not None: gen_kwargs = lm_eval.utils.simple_parse_args_string(gen_kwargs) @@ -181,9 +177,9 @@ def simple_evaluate( model_id = "fxmarty/onnx-tiny-random-gpt2-with-merge" elif isinstance(user_model, optimum.onnxruntime.ORTModelForSeq2SeqLM): model_id = "optimum/t5-small" - lm_eval.utils.eval_logger.info("We use '{}' to build `LM` instance, the actually run model is user_model you passed.".format( - model_id - )) + lm_eval.utils.eval_logger.info( + "We use '{}' to build `LM` instance, the actually run model is user_model you passed.".format(model_id) + ) lm = lm_eval.api.registry.get_model(model).create_from_arg_string( "pretrained=" + model_id, { @@ -244,9 +240,7 @@ def simple_evaluate( if task_obj.get_config("output_type") == "generate_until": if gen_kwargs is not None: - task_obj.set_config( - key="generation_kwargs", value=gen_kwargs, update=True - ) + task_obj.set_config(key="generation_kwargs", value=gen_kwargs, update=True) if predict_only: log_samples = True @@ -261,8 +255,8 @@ def simple_evaluate( if num_fewshot is not None: if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: lm_eval.utils.eval_logger.info( - f"num_fewshot has been set to 0 for {task_name} in its config." + \ - "Manual configuration will be ignored." + f"num_fewshot has been set to 0 for {task_name} in its config." + + "Manual configuration will be ignored." ) else: lm_eval.utils.eval_logger.warning( @@ -302,9 +296,7 @@ def simple_evaluate( "model": model_name, "model_args": model_args, "batch_size": batch_size, - "batch_sizes": ( - list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [] - ), + "batch_sizes": (list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []), "provider": provider, "use_cache": use_cache, "limit": limit, @@ -316,9 +308,7 @@ def simple_evaluate( try: lm_eval.logging_utils.add_env_info(results) # additional environment info to results except: - lm_eval.utils.eval_logger.info( - f"get env info failed." 
- ) + lm_eval.utils.eval_logger.info("get env info failed.") return results else: return None @@ -373,8 +363,7 @@ def evaluate( task_hierarchy, eval_tasks = lm_eval.evaluator_utils.get_task_list(task_dict) if not log_samples: if not all( - "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() - for task_output in eval_tasks + "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() for task_output in eval_tasks ): raise ValueError("log_samples must be True for 'bypass' metric-only tasks") for task_output in eval_tasks: @@ -400,15 +389,9 @@ def evaluate( if lm.world_size > 1: instances_rnk = torch.tensor(len(task._instances), device=torch.device("cpu")) - gathered_item = ( - lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() - ) + gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() # "multiple_choice" task types dispatch (several) "loglikelihood" request types - reqtype = ( - "loglikelihood" - if task.OUTPUT_TYPE == "multiple_choice" - else task.OUTPUT_TYPE - ) + reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) numpad = max(gathered_item) - gathered_item[lm.rank] # todo: may not account for padding in cases like SquadV2 which has multiple req types @@ -457,14 +440,10 @@ def evaluate( instances.sort(key=lambda x: x.idx) # iterate over different filters used for filter_key in task.instances[0].filtered_resps.keys(): - doc_iterator = task.doc_iterator( - rank=RANK, limit=limit, world_size=WORLD_SIZE - ) + doc_iterator = task.doc_iterator(rank=RANK, limit=limit, world_size=WORLD_SIZE) for doc_id, doc in doc_iterator: requests = instances_by_doc_id[doc_id] - metrics = task.process_results( - doc, [req.filtered_resps[filter_key] for req in requests] - ) + metrics = task.process_results(doc, [req.filtered_resps[filter_key] for req in requests]) if log_samples: target = task.doc_to_target(doc) example = { @@ -473,9 +452,7 @@ def evaluate( "target": target, "arguments": [req.args for req in requests], "resps": [req.resps for req in requests], - "filtered_resps": [ - req.filtered_resps[filter_key] for req in requests - ], + "filtered_resps": [req.filtered_resps[filter_key] for req in requests], } example.update(metrics) task_output.logged_samples.append(example) @@ -496,9 +473,7 @@ def evaluate( ) if RANK == 0: - task_output.logged_samples = list( - itertools.chain.from_iterable(full_samples) - ) + task_output.logged_samples = list(itertools.chain.from_iterable(full_samples)) # then collect metrics across all ranks for metrics in task_output.sample_metrics: @@ -509,18 +484,14 @@ def evaluate( dst=0, ) if RANK == 0: - task_output.sample_metrics[metrics] = list( - itertools.chain.from_iterable(metric_list) - ) + task_output.sample_metrics[metrics] = list(itertools.chain.from_iterable(metric_list)) if RANK == 0: ### Aggregate results over all datapoints ### # aggregate results ; run bootstrap CIs for task_output in eval_tasks: task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) - results, samples, configs, versions, num_fewshot = lm_eval.evaluator_utils.consolidate_results( - eval_tasks - ) + results, samples, configs, versions, num_fewshot = lm_eval.evaluator_utils.consolidate_results(eval_tasks) ### Calculate group metrics ### if bool(results): @@ -543,33 +514,17 @@ def evaluate( stderr = "_stderr,".join(metric.split(",")) # gather metrics, sizes, and stderrs from 
subtasks - metrics = [ - results[task][metric] - for task in task_list - if metric in results[task] - ] # TODO: copy? - stderrs = [ - results[task][stderr] - for task in task_list - if stderr in results[task] - ] - sizes = [ - results[task]["samples"] - for task in task_list - if metric in results[task] - ] + metrics = [results[task][metric] for task in task_list if metric in results[task]] # TODO: copy? + stderrs = [results[task][stderr] for task in task_list if stderr in results[task]] + sizes = [results[task]["samples"] for task in task_list if metric in results[task]] # compute group's pooled metric and stderr - results[group][metric] = ( - lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) - ) + results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) # TODO: calculate grouped metric using aggregation fn if "N/A" in stderrs: results[group][stderr] = "N/A" else: - results[group][stderr] = ( - lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) - ) + results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility # To use the old (likely incorrect) variance formula, # comment out the above and uncomment this line: @@ -587,9 +542,7 @@ def evaluate( if len(left_tasks_list) == 0: break - _task_hierarchy = { - k: v for k, v in task_hierarchy.items() if k in left_tasks_list - } + _task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list} _results_agg, _groups_agg = lm_eval.evaluator_utils.prepare_print_tasks(_task_hierarchy, results) results_agg = {**results_agg, **_results_agg} @@ -597,9 +550,7 @@ def evaluate( for group_name, task_list in task_hierarchy.items(): if task_list: - num_fewshot[group_name] = num_fewshot[ - task_list[0] - ] # TODO: validate this + num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this results_dict = { "results": dict(results_agg.items()), diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py index 6a7755e15..8a19e05fd 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py @@ -15,7 +15,6 @@ from evaluation.models import huggingface - # TODO: implement __all__ diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py index 1eb8cd49a..b682e4f47 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py @@ -13,30 +13,29 @@ # limitations under the License. 
-import accelerate import copy -import huggingface_hub import os -import packaging.version import tempfile -import tqdm -import transformers from typing import List, Literal, Optional, Tuple, Union -import torch -import torch.nn.functional as F - -import lm_eval.utils +import accelerate +import huggingface_hub import lm_eval.api.instance import lm_eval.api.model import lm_eval.models.utils - +import lm_eval.utils import onnxruntime -import optimum.version import optimum.onnxruntime +import optimum.version +import packaging.version +import torch +import torch.nn.functional as F +import tqdm +import transformers eval_logger = lm_eval.utils.eval_logger + class HFLM(lm_eval.api.model.TemplateLM): """An abstracted Huggingface model class. Enables usage with both models of `optimum.onnxruntime.ORTModelForCausalLM` and @@ -75,22 +74,21 @@ def __init__( available_providers = onnxruntime.get_available_providers() assert provider in available_providers, "{} is not available.".format(provider) self._provider = provider - self._device = torch.device("cpu") # use cpu to generate torch tensor + self._device = torch.device("cpu") # use cpu to generate torch tensor # optionally: take in an already-initialized ORTModel if not isinstance(pretrained, str): eval_logger.warning( - "`pretrained` model kwarg is not of type `str`. "+ \ - "Many other model arguments may be ignored. " + "`pretrained` model kwarg is not of type `str`. " + "Many other model arguments may be ignored. " ) self._model = pretrained self._config = self._model.config self.model.providers if tokenizer: - assert isinstance( - tokenizer, transformers.PreTrainedTokenizer - ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + assert isinstance(tokenizer, transformers.PreTrainedTokenizer) or isinstance( + tokenizer, transformers.PreTrainedTokenizerFast + ) self.tokenizer = tokenizer else: # Get tokenizer @@ -112,9 +110,7 @@ def __init__( ) # determine which of 'causal' and 'seq2seq' backends to use - self._get_backend( - config=self.config, backend=backend, trust_remote_code=trust_remote_code - ) + self._get_backend(config=self.config, backend=backend, trust_remote_code=trust_remote_code) # if we passed `pretrained` as a string, initialize our model now if isinstance(pretrained, str): @@ -162,8 +158,8 @@ def __init__( if getattr(self.config, "model_type", None) == "gemma": self.add_bos_token = True eval_logger.info( - f"Model type is '{self.config.model_type}', " + \ - "a BOS token will be used as Gemma underperforms without it." + f"Model type is '{self.config.model_type}', " + + "a BOS token will be used as Gemma underperforms without it." ) self._max_length = max_length @@ -182,8 +178,8 @@ def __init__( if not isinstance(pretrained, str): # if a PreTrainedModel was passed into HFLM, we forgo distributed setup. 
eval_logger.warning( - "Passed an already-initialized model through `pretrained`," + \ - " assuming single-process call to evaluate() or custom distributed integration" + "Passed an already-initialized model through `pretrained`," + + " assuming single-process call to evaluate() or custom distributed integration" ) self._rank = 0 self._world_size = 1 @@ -254,9 +250,7 @@ def _get_backend( self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM elif backend == "seq2seq": self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM - eval_logger.info( - f"Overrode HF model backend type, and using type '{backend}'" - ) + eval_logger.info(f"Overrode HF model backend type, and using type '{backend}'") else: # determine and use the default HF backend for this model, based on its config + metadata. if ( @@ -268,8 +262,8 @@ def _get_backend( # these special cases should be treated as seq2seq models. self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM elif ( - getattr(self.config, "model_type") in - transformers.models.auto.modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + getattr(self.config, "model_type") + in transformers.models.auto.modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES ): self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM else: @@ -308,8 +302,8 @@ def _create_model( if not os.path.exists(pretrained): eval_logger.warning("`{}` path does not exist. Will try to download it from huggingface.") try: - local_dir = tempfile.TemporaryDirectory().name - huggingface_hub.snapshot_download(pretrained, local_dir =local_dir ) + local_dir = tempfile.TemporaryDirectory().name + huggingface_hub.snapshot_download(pretrained, local_dir=local_dir) pretrained = local_dir except Exception as e: raise e @@ -317,121 +311,106 @@ def _create_model( if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if ( not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) - and not os.path.exists( - os.path.join(pretrained, "decoder_with_past_model.onnx") - ) - and not os.path.exists( - os.path.join(pretrained, "decoder_model_merged.onnx") - ) + and not os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")) + and not os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")) and not os.path.exists(os.path.join(pretrained, "model.onnx")) ): raise ValueError( - "Couldn't find any ONNX model name in " + \ - "['decoder_model.onnx', 'decoder_with_past_model.onnx', " - "'decoder_model_merged.onnx', 'model.onnx'] in {}.".format( - pretrained - ) + "Couldn't find any ONNX model name in " + "['decoder_model.onnx', 'decoder_with_past_model.onnx', " + "'decoder_model_merged.onnx', 'model.onnx'] in {}.".format(pretrained) ) sess_options = onnxruntime.SessionOptions() - sess_options.graph_optimization_level = ( - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - ) + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL if packaging.version.Version(optimum.version.__version__) >= packaging.version.Version("1.14.0"): if os.path.exists(os.path.join(pretrained, "model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( - os.path.join(pretrained, "model.onnx"), - provider=self.provider, - session_options=sess_options) + os.path.join(pretrained, "model.onnx"), provider=self.provider, session_options=sess_options + ) inputs_names = [input.name for input in session.get_inputs()] key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 - 
self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=True if use_cache else False, - use_io_binding=True if use_cache else False) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + session, + self.config, + use_cache=True if use_cache else False, + use_io_binding=True if use_cache else False, + ) else: if os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model_merged.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM(session, self.config, use_cache=True) elif os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_with_past_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM(session, self.config, use_cache=True) elif os.path.exists(os.path.join(pretrained, "decoder_model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=False, - use_io_binding=False) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + session, self.config, use_cache=False, use_io_binding=False + ) else: if os.path.exists(os.path.join(pretrained, "model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( - os.path.join(pretrained, "model.onnx"), - provider=self.provider, - session_options=sess_options) + os.path.join(pretrained, "model.onnx"), provider=self.provider, session_options=sess_options + ) inputs_names = session.get_inputs() key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 - self._model = optimum.onnxruntime.ORTModelForCausalLM(session[0], - self.config, - pretrained, - use_cache=True if use_cache else False, - use_io_binding=True if use_cache else False,) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + session[0], + self.config, + pretrained, + use_cache=True if use_cache else False, + use_io_binding=True if use_cache else False, + ) else: if os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")): sessions = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model_merged.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(sessions[0], - self.config, - pretrained, - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + sessions[0], self.config, pretrained, use_cache=True + ) elif os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")): sessions = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model.onnx"), os.path.join(pretrained, "decoder_with_past_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = 
optimum.onnxruntime.ORTModelForCausalLM(sessions[0], - self.config, - pretrained, - sessions[1], - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + sessions[0], self.config, pretrained, sessions[1], use_cache=True + ) else: sessions = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(sessions[0], - self.config, - pretrained, - use_cache=False, - use_io_binding=False) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + sessions[0], self.config, pretrained, use_cache=False, use_io_binding=False + ) elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - if not os.path.exists( - os.path.join(pretrained, "encoder_model.onnx") - ) or ( + if not os.path.exists(os.path.join(pretrained, "encoder_model.onnx")) or ( not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) - and not os.path.exists( - os.path.join(pretrained, "decoder_model_merged.onnx") - ) + and not os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")) ): raise ValueError( "Please ensure encoder_model.onnx and " @@ -439,12 +418,8 @@ def _create_model( ) sess_options = onnxruntime.SessionOptions() - sess_options.graph_optimization_level = ( - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - ) - if os.path.exists( - os.path.join(pretrained, "decoder_model_merged.onnx") - ): + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + if os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")): sessions = optimum.onnxruntime.ORTModelForSeq2SeqLM.load_model( os.path.join(pretrained, "encoder_model.onnx"), os.path.join(pretrained, "decoder_model_merged.onnx"), @@ -458,9 +433,7 @@ def _create_model( use_cache=True, ) - elif os.path.exists( - os.path.join(pretrained, "decoder_with_past_model.onnx") - ): + elif os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")): sessions = optimum.onnxruntime.ORTModelForSeq2SeqLM.load_model( os.path.join(pretrained, "encoder_model.onnx"), os.path.join(pretrained, "decoder_model.onnx"), @@ -520,9 +493,9 @@ def _create_tokenizer( use_fast=use_fast_tokenizer, ) else: - assert isinstance( - tokenizer, transformers.PreTrainedTokenizer - ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + assert isinstance(tokenizer, transformers.PreTrainedTokenizer) or isinstance( + tokenizer, transformers.PreTrainedTokenizerFast + ) self.tokenizer = tokenizer else: # Get tokenizer based on 'pretrained' @@ -542,9 +515,7 @@ def _create_tokenizer( def _detect_batch_size(self, requests=None, pos: int = 0): if requests: _, context_enc, continuation_enc = requests[pos] - max_length = len( - (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] - ) + max_length = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]) max_context_enc = len(context_enc[-(self.max_length + 1) :]) max_cont_enc = len(continuation_enc[-(self.max_length + 1) :]) else: @@ -555,9 +526,7 @@ def _detect_batch_size(self, requests=None, pos: int = 0): def forward_batch(batch_size): if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: length = max(max_context_enc, max_cont_enc) - batched_conts = torch.ones( - (batch_size, length), device=self._device - ).long() + batched_conts = torch.ones((batch_size, length), device=self._device).long() 
test_batch = torch.ones((batch_size, length), device=self._device).long() call_kwargs = { "attn_mask": test_batch, @@ -565,13 +534,9 @@ def forward_batch(batch_size): } else: call_kwargs = {} - test_batch = torch.ones( - (batch_size, max_length), device=self._device - ).long() + test_batch = torch.ones((batch_size, max_length), device=self._device).long() for _ in range(5): - out = F.log_softmax( - self._model_call(test_batch, **call_kwargs), dim=-1 - ) + out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) return batch_size @@ -586,9 +551,7 @@ def forward_batch(batch_size): if self.world_size > 1: # if multi-GPU, always take minimum over all selected batch sizes max_rnk_bs = torch.tensor([batch_size], device=self._device) - gathered = ( - self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist() - ) + gathered = self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist() batch_size = min(gathered) lm_eval.models.utils.clear_torch_cache() return batch_size @@ -596,9 +559,7 @@ def forward_batch(batch_size): lm_eval.models.utils.clear_torch_cache() return batch_size - def tok_encode( - self, string: str, left_truncate_len=None, add_special_tokens=None - ) -> List[int]: + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: if add_special_tokens is None: if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: add_special_tokens = False or self.add_bos_token @@ -639,22 +600,16 @@ def tok_batch_encode( ) if left_truncate_len: encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] - encoding["attention_mask"] = encoding["attention_mask"][ - :, -left_truncate_len: - ] + encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:] self.tokenizer.padding_side = old_padding_side return encoding["input_ids"], encoding["attention_mask"] def tok_decode(self, tokens, skip_special_tokens=True): if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - return self.tokenizer.decode( - tokens, skip_special_tokens=skip_special_tokens - ) + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - return self.tokenizer.decode( - tokens, skip_special_tokens=skip_special_tokens - ) + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) def _model_call(self, inps, attn_mask=None, labels=None): """Call model to get logits results. 
@@ -684,9 +639,7 @@ def _model_call(self, inps, attn_mask=None, labels=None): shifted_input_ids = labels.new_zeros(labels.shape) shifted_input_ids[..., 1:] = labels[..., :-1].clone() shifted_input_ids[..., 0] = decoder_start_token_id - shifted_input_ids.masked_fill_( - shifted_input_ids == -100, pad_token_id - ) + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) return self.model( inps, attention_mask=attn_mask, @@ -695,32 +648,27 @@ def _model_call(self, inps, attn_mask=None, labels=None): ).logits else: assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM - if hasattr(self.model, "config") and hasattr(self.model.config, "auto_map") and \ - "chatglm2" in self.model.config.auto_map["AutoConfig"]: + if ( + hasattr(self.model, "config") + and hasattr(self.model.config, "auto_map") + and "chatglm2" in self.model.config.auto_map["AutoConfig"] + ): input_bs, input_len = inps.shape bos = torch.tensor([64790, 64792]).repeat(input_bs, 1) inps = torch.cat((bos, inps), 1) - inputs_names = [ - input.name for input in self.model.model.get_inputs() - ] + inputs_names = [input.name for input in self.model.model.get_inputs()] if "position_ids" in inputs_names: # model is exported with optimum >= 1.14.0 with new input 'position_ids' input_shape = inps.shape - position_ids = ( - torch.arange(0, input_shape[-1], dtype=torch.long) - .unsqueeze(0) - .view(-1, input_shape[-1]) - ) + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) output = self.model( inps, torch.ones(inps.shape, dtype=torch.int64), position_ids, ).logits else: - output = self.model( - inps, torch.ones(inps.shape, dtype=torch.int64) - ).logits + output = self.model(inps, torch.ones(inps.shape, dtype=torch.int64)).logits return output def _model_generate(self, context, max_length, stop, **generation_kwargs): @@ -750,20 +698,14 @@ def _model_generate(self, context, max_length, stop, **generation_kwargs): **generation_kwargs, ) - def _select_cont_toks( - self, logits: torch.Tensor, contlen: int = None, inplen: int = None - ) -> torch.Tensor: + def _select_cont_toks(self, logits: torch.Tensor, contlen: int = None, inplen: int = None) -> torch.Tensor: if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - assert ( - contlen and inplen - ), "Must pass input len and cont. len to select scored logits for causal LM" + assert contlen and inplen, "Must pass input len and cont. len to select scored logits for causal LM" # discard right-padding. # also discard the input/context tokens. we'll only score continuations. logits = logits[inplen - contlen : inplen] elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - assert ( - contlen and not inplen - ), "Selecting scored logits for Seq2SeqLM requires only cont. len" + assert contlen and not inplen, "Selecting scored logits for Seq2SeqLM requires only cont. len" # only discard right-padding. # the logits input to this fn only contain decoder-side tokens. 
logits = logits[:contlen] @@ -783,9 +725,7 @@ def loglikelihood_rolling( print(f"Determined Largest batch size: {batch_size}") adaptive_batch_size = batch_size - for (string,) in tqdm.tqdm( - [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0)) - ): + for (string,) in tqdm.tqdm([req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))): rolling_token_windows = list( map( lm_eval.utils.make_disjoint_window, @@ -806,9 +746,7 @@ def loglikelihood_rolling( if self.world_size > 1: # We pad out the external document-level iterator so the inner iterator doesn't hang mytensor = torch.tensor(len(rolling_token_windows), device=self._device) - gathered = ( - self.accelerator.gather(mytensor).cpu().detach().numpy().tolist() - ) + gathered = self.accelerator.gather(mytensor).cpu().detach().numpy().tolist() pad_amnt = max(gathered) - gathered[self.rank] if pad_amnt > 0: @@ -835,15 +773,11 @@ def _batch_scheduler(self, pos, n_reordered_requests): sched = pos // int(len(n_reordered_requests) / self.batch_schedule) if sched in self.batch_sizes: return self.batch_sizes[sched] - if (len(self.batch_sizes) > 1) and ( - self.batch_sizes[sched - 1] == self.max_batch_size - ): + if (len(self.batch_sizes) > 1) and (self.batch_sizes[sched - 1] == self.max_batch_size): # if previous batch size is already maximal, skip recomputation self.batch_sizes[sched] = self.max_batch_size return self.batch_sizes[sched] - print( - f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size" - ) + print(f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size") self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos) print(f"Determined largest batch size: {self.batch_sizes[sched]}") return self.batch_sizes[sched] @@ -882,9 +816,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): requests, sort_fn=_collate, group_by=( - "contexts" - if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM and self.logits_cache - else None + "contexts" if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM and self.logits_cache else None ), group_fn=_lookup_one_token_cont, ) @@ -892,16 +824,10 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): # automatic (variable) batch size detection for vectorization # pull longest context sample from request n_reordered_requests = len(re_ord) - batch_size = ( - self.batch_size - if self.batch_size != "auto" - else override_bs if override_bs is not None else 0 - ) + batch_size = self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0 batch_fn = ( self._batch_scheduler - if self.batch_size == "auto" - and n_reordered_requests > 0 - and not override_bs + if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs else None ) @@ -968,17 +894,9 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): conts.append(cont) - padding_len_cont = ( - max(padding_len_cont, contlen) - if padding_len_cont is not None - else contlen - ) + padding_len_cont = max(padding_len_cont, contlen) if padding_len_cont is not None else contlen - padding_len_inp = ( - max(padding_len_inp, inplen) - if padding_len_inp is not None - else inplen - ) + padding_len_inp = max(padding_len_inp, inplen) if padding_len_inp is not None else inplen inps.append(inp) # [1, inp_length] cont_toks_list.append(continuation_enc) @@ -992,9 +910,7 @@ def _lookup_one_token_cont(req: 
Tuple[Tuple[str, str], List[int], List[int]]): ) # [batch, padding_len_inp] elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: # TODO: left-pad encoder inps and mask? - batched_inps = lm_eval.models.utils.pad_and_concat( - padding_len_inp, inps - ) # [batch, padding_len_inp] + batched_inps = lm_eval.models.utils.pad_and_concat(padding_len_inp, inps) # [batch, padding_len_inp] batched_conts = lm_eval.models.utils.pad_and_concat( padding_len_cont, conts ) # [batch, padding_len_cont] @@ -1040,18 +956,12 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): cont_toks=cont_toks, logits=logits, ): - cont_toks = torch.tensor( - cont_toks, dtype=torch.long, device=self._device - ).unsqueeze( - 0 - ) # [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long, device=self._device).unsqueeze(0) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() # Obtain log-probs at the corresponding continuation token indices # last_token_slice = logits[:, -1, :].squeeze(0).tolist() - logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( - -1 - ) # [1, seq] + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] # Answer: (log prob, is-exact-match) answer = (float(logits.sum()), bool(max_equal)) @@ -1065,9 +975,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): return re_ord.get_original(res) - def generate_until( - self, requests: List[lm_eval.api.instance.Instance], disable_tqdm: bool = False - ) -> List[str]: + def generate_until(self, requests: List[lm_eval.api.instance.Instance], disable_tqdm: bool = False) -> List[str]: res = [] def _collate(req: Tuple[str, dict]): @@ -1099,11 +1007,7 @@ def _collate(req: Tuple[str, dict]): if self.batch_size != "auto" else adaptive_batch_size if adaptive_batch_size is not None else 0 ) - batch_fn = ( - self._batch_scheduler - if self.batch_size == "auto" and not adaptive_batch_size - else None - ) + batch_fn = self._batch_scheduler if self.batch_size == "auto" and not adaptive_batch_size else None # we group requests by their generation_kwargs, # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling @@ -1130,13 +1034,9 @@ def _collate(req: Tuple[str, dict]): if isinstance(until, str): until = [kwargs] elif not isinstance(until, list): - raise ValueError( - f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" - ) + raise ValueError(f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}") else: - raise ValueError( - f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" - ) + raise ValueError(f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}") # add EOS token to stop sequences eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False) if not until: diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py index 9bc338917..a9845eb41 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py @@ -12,38 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ class LMEvalParser: - def __init__(self, - model="hf", - tasks="lambada_openai", - model_args="", - user_model=None, - tokenizer=None, - num_fewshot=None, - batch_size=1, - max_batch_size=None, - provider=None, - output_path=None, - limit=None, - use_cache=None, - cache_requests=None, - check_integrity=False, - write_out=False, - log_samples=False, - show_config=False, - include_path=None, - gen_kwargs=None, - verbosity="INFO", - wandb_args="", - predict_only=False, - seed=[0, 1234, 1234], - trust_remote_code=False - ): + def __init__( + self, + model="hf", + tasks="lambada_openai", + model_args="", + user_model=None, + tokenizer=None, + num_fewshot=None, + batch_size=1, + max_batch_size=None, + provider=None, + output_path=None, + limit=None, + use_cache=None, + cache_requests=None, + check_integrity=False, + write_out=False, + log_samples=False, + show_config=False, + include_path=None, + gen_kwargs=None, + verbosity="INFO", + wandb_args="", + predict_only=False, + seed=[0, 1234, 1234], + trust_remote_code=False, + ): self.model = model self.tasks = tasks self.model_args = model_args - self.user_model=user_model - self.tokenizer=tokenizer + self.user_model = user_model + self.tokenizer = tokenizer self.num_fewshot = num_fewshot self.batch_size = batch_size self.max_batch_size = max_batch_size diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index 0dac48ba9..9cafe62d3 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -15,74 +15,52 @@ # specific language governing permissions and limitations # under the License. 
# pylint:disable=redefined-outer-name,logging-format-interpolation -import os -import onnx -import time +import argparse import json -import random -import torch import logging -import argparse +import os import random -import numpy as np +import time + import datasets +import evaluation +import numpy as np +import onnx import onnxruntime as ort +import torch import transformers -import evaluation +from optimum import onnxruntime as optimum_ort from torch.nn import functional from torch.utils import data -from optimum import onnxruntime as optimum_ort -from onnx_neural_compressor.quantization import matmul_nbits_quantizer -from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import tuning -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import utility - -logger = logging.getLogger(__name__) + +from onnx_neural_compressor import config, data_reader, logger, utility +from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning + logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.WARN) - -parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--model_path", - type=str, - help="Folder path of pre-trained onnx model") -parser.add_argument( - "--benchmark", - action="store_true", \ - default=False -) -parser.add_argument( - "--tune", - action="store_true", \ - default=False, - help="whether quantize the model" + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN ) -parser.add_argument("--output_model", - type=str, - default=None, - help="output model path") + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--model_path", type=str, help="Folder path of pre-trained onnx model") +parser.add_argument("--benchmark", action="store_true", default=False) +parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") +parser.add_argument("--output_model", type=str, default=None, help="output model path") parser.add_argument( "--batch_size", default=1, type=int, ) -parser.add_argument("--tokenizer", - type=str, - help="pretrained model name or path of tokenizer files", - default="meta-llama/Llama-2-7b-hf") -parser.add_argument("--workspace", - type=str, - help="workspace to save intermediate files", - default="nc_workspace") -parser.add_argument("--algorithm", - type=str, - default="WOQ_TUNE", - choices=["WOQ_TUNE", "RTN", "AWQ", "GPTQ"], - help="weight only algorithm") +parser.add_argument( + "--tokenizer", type=str, help="pretrained model name or path of tokenizer files", default="meta-llama/Llama-2-7b-hf" +) +parser.add_argument("--workspace", type=str, help="workspace to save intermediate files", default="nc_workspace") +parser.add_argument( + "--algorithm", + type=str, + default="WOQ_TUNE", + choices=["WOQ_TUNE", "RTN", "AWQ", "GPTQ"], + help="weight only algorithm", +) parser.add_argument( "--pad_max", default=196, @@ -96,18 +74,22 @@ parser.add_argument( "--tasks", nargs="+", - default=["winogrande", "copa", "piqa", "rte", "hellaswag", "openbookqa", \ - "lambada_openai", "lambada_standard", "wikitext"], + default=[ + "winogrande", + "copa", + "piqa", + "rte", + "hellaswag", + "openbookqa", + "lambada_openai", + "lambada_standard", + "wikitext", + ], type=str, - help="tasks list for 
accuracy validation" + help="tasks list for accuracy validation", ) -parser.add_argument("--dataset", - nargs="?", - default="NeelNanda/pile-10k", - const="NeelNanda/pile-10k") -parser.add_argument('--mode', - type=str, - help="benchmark mode of performance or accuracy") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") parser.add_argument("--intra_op_num_threads", type=int, default=24) parser.add_argument("--trust_remote_code", type=bool, default=False) args = parser.parse_args() @@ -130,7 +112,7 @@ def replace_architectures(json_path): data = json.load(file) data["architectures"] = ["LlamaForCausalLM"] - with open(json_path, 'w') as file: + with open(json_path, "w") as file: json.dump(data, file, indent=4) @@ -145,7 +127,7 @@ def eval_func(model): model="hf", model_args="pretrained=" + model_dir + ",tokenizer=" + args.tokenizer, batch_size=args.batch_size, - tasks=','.join(args.tasks), + tasks=",".join(args.tasks), provider="CPUExecutionProvider", trust_remote_code=args.trust_remote_code, ) @@ -154,12 +136,10 @@ def eval_func(model): eval_acc = 0 for task_name in args.tasks: if task_name == "wikitext": - print("Accuracy for %s is: %s" % - (task_name, results["results"][task_name]["word_perplexity,none"])) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"])) eval_acc += results["results"][task_name]["word_perplexity,none"] else: - print("Accuracy for %s is: %s" % - (task_name, results["results"][task_name]["acc,none"])) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"])) eval_acc += results["results"][task_name]["acc,none"] if len(args.tasks) != 0: @@ -173,14 +153,10 @@ def benchmark(model): sess_options.intra_op_num_threads = args.intra_op_num_threads session = optimum_ort.ORTModelForCausalLM.load_model( # pylint: disable=E1123 - os.path.join(model, "model.onnx"), - session_options=sess_options) + os.path.join(model, "model.onnx"), session_options=sess_options + ) inputs_names = session.get_inputs() - key_value_input_names = [ - key.name - for key in inputs_names - if (".key" in key.name) or (".value" in key.name) - ] + key_value_input_names = [key.name for key in inputs_names if (".key" in key.name) or (".value" in key.name)] use_cache = len(key_value_input_names) > 0 model = optimum_ort.ORTModelForCausalLM( @@ -222,19 +198,13 @@ def benchmark(model): class AWQDataloader(data_reader.CalibrationDataReader): - def __init__(self, - model_path, - pad_max=196, - batch_size=1, - sub_folder='train', - calibration_sampling_size=8): + def __init__(self, model_path, pad_max=196, batch_size=1, sub_folder="train", calibration_sampling_size=8): self.encoded_list = [] self.pad_max = pad_max self.batch_size = batch_size dataset = datasets.load_dataset(args.dataset, split=sub_folder) dataset = dataset.map(tokenize_function, batched=True) - dataset.set_format(type="torch", - columns=["input_ids", "attention_mask"]) + dataset.set_format(type="torch", columns=["input_ids", "attention_mask"]) dataloader = data.DataLoader( dataset, batch_size=self.batch_size, @@ -243,9 +213,7 @@ def __init__(self, ) model = onnx.load(model_path, load_external_data=False) inputs_names = [input.name for input in model.graph.input] - key_value_input_names = [ - key for key in inputs_names if (".key" in key) or (".value" in key) - ] + key_value_input_names = [key for key in inputs_names if (".key" 
in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 self.batch_size = batch_size @@ -253,20 +221,16 @@ def __init__(self, if idx + 1 > calibration_sampling_size: break ort_input = {} - ort_input["input_ids"] = input_ids[:, :-1].detach().cpu().numpy( - ).astype("int64") - ort_input["attention_mask"] = attention_mask[:, :-1].detach().cpu( - ).numpy().astype("int64") + ort_input["input_ids"] = input_ids[:, :-1].detach().cpu().numpy().astype("int64") + ort_input["attention_mask"] = attention_mask[:, :-1].detach().cpu().numpy().astype("int64") position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - ort_input["position_ids"] = position_ids[:, :-1].detach().cpu( - ).numpy().astype("int64") + ort_input["position_ids"] = position_ids[:, :-1].detach().cpu().numpy().astype("int64") if use_cache: # Create dummy past_key_values for decoder num_attention_heads = model_config.num_key_value_heads embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads - shape = (self.batch_size, num_attention_heads, 0, - embed_size_per_head) + shape = (self.batch_size, num_attention_heads, 0, embed_size_per_head) key_or_value = np.zeros(shape, dtype=np.float32) for key_value_input_name in key_value_input_names: ort_input[key_value_input_name] = key_or_value @@ -287,8 +251,7 @@ def collate_batch(self, batch): attention_mask = functional.pad(attention_mask, (0, pad_len), value=0) input_ids_padded.append(input_ids) attention_mask_padded.append(attention_mask) - return torch.vstack(input_ids_padded), torch.vstack( - attention_mask_padded) + return torch.vstack(input_ids_padded), torch.vstack(attention_mask_padded) def get_next(self): return next(self.iter_next, None) @@ -299,12 +262,7 @@ def rewind(self): class GPTQDataloader(data_reader.CalibrationDataReader): - def __init__(self, - model_path, - batch_size=1, - seqlen=2048, - sub_folder="train", - calibration_sampling_size=8): + def __init__(self, model_path, batch_size=1, seqlen=2048, sub_folder="train", calibration_sampling_size=8): # large `calibration_sampling_size` may result in long GPTQ running time # recommend to use smaller `calibration_sampling_size` value random.seed(0) @@ -313,14 +271,11 @@ def __init__(self, self.batch_size = batch_size traindata = datasets.load_dataset(args.dataset, split=sub_folder) traindata = traindata.map(tokenize_function, batched=True) - traindata.set_format(type="torch", - columns=["input_ids", "attention_mask"]) + traindata.set_format(type="torch", columns=["input_ids", "attention_mask"]) session = ort.InferenceSession(model_path) inputs_names = [input.name for input in session.get_inputs()] - key_value_input_names = [ - key for key in inputs_names if (".key" in key) or (".value" in key) - ] + key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 for i in range(calibration_sampling_size): @@ -336,19 +291,15 @@ def __init__(self, ort_input = {} ort_input["input_ids"] = inp.detach().cpu().numpy().astype("int64") - ort_input["attention_mask"] = mask.detach().cpu().numpy().astype( - "int64") + ort_input["attention_mask"] = mask.detach().cpu().numpy().astype("int64") input_shape = ort_input["input_ids"].shape - position_ids = torch.arange(0, input_shape[-1], - dtype=torch.long).unsqueeze(0).view( - -1, input_shape[-1]) + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) ort_input["position_ids"] = 
position_ids.numpy() if use_cache: # create dummy past_key_values for decoder first generation step num_attention_heads = model_config.num_key_value_heads embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads - shape = (self.batch_size, num_attention_heads, 0, - embed_size_per_head) + shape = (self.batch_size, num_attention_heads, 0, embed_size_per_head) key_or_value = np.zeros(shape, dtype=np.float32) for key_value_input_name in key_value_input_names: ort_input[key_value_input_name] = key_or_value @@ -369,9 +320,9 @@ def rewind(self): os.mkdir(args.workspace) if args.benchmark: - if args.mode == 'performance': + if args.mode == "performance": benchmark(args.model_path) - elif args.mode == 'accuracy': + elif args.mode == "accuracy": acc_result = eval_func(args.model_path) print("Batch size = %d" % args.batch_size) print("Accuracy: %.5f" % acc_result) @@ -384,17 +335,12 @@ def rewind(self): logger.info("Start graph optimization...") sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = os.path.join( - args.workspace, "Optimized_model.onnx") + sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx") sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_file_name", - "Optimized_model.onnx_data") - sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_min_size_in_bytes", - "1024") - sess = ort.InferenceSession(model_path, - sess_options, - providers=["CPUExecutionProvider"]) + "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data" + ) + sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") + sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"]) logger.info("Graph optimization done.") best_model = None @@ -411,12 +357,10 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "AWQ": - calibration_data_reader = AWQDataloader(model_path, - pad_max=args.pad_max, - batch_size=1) + calibration_data_reader = AWQDataloader(model_path, pad_max=args.pad_max, batch_size=1) algo_config = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader, - enable_mse_search=False) + calibration_data_reader=calibration_data_reader, enable_mse_search=False + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( sess_options.optimized_model_filepath, n_bits=4, @@ -428,11 +372,10 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "GPTQ": - calibration_data_reader = GPTQDataloader(model_path, - seqlen=args.seqlen, - batch_size=1) + calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader,) + calibration_data_reader=calibration_data_reader, + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( sess_options.optimized_model_filepath, n_bits=4, @@ -444,12 +387,9 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "WOQ_TUNE": - calibration_data_reader = GPTQDataloader(model_path, - seqlen=args.seqlen, - batch_size=1) + calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) # set tolerable_loss to 0.5% for test, default is 1% - custom_tune_config = tuning.TuningConfig( - 
config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) + custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) best_model = tuning.autotune( model_input=model_path, tune_config=custom_tune_config, @@ -463,5 +403,4 @@ def rewind(self): os.path.join(args.output_model, model_name), save_as_external_data=True, ) - model_config.to_json_file(os.path.join(args.output_model, "config.json"), - use_diff=False) + model_config.to_json_file(os.path.join(args.output_model, "config.json"), use_diff=False) diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py index 188f02a5b..3af820943 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py @@ -1,6 +1,7 @@ import argparse import os import subprocess + import optimum.version from packaging import version @@ -16,7 +17,8 @@ def parse_arguments(): type=str, required=False, default="text-generation-with-past", - choices=["text-generation-with-past", "text-generation"]) + choices=["text-generation-with-past", "text-generation"], + ) return parser.parse_args() diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 1b5cb680e..2e381cfdb 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -23,10 +23,8 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility + +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from typing import Callable, List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/smoother/calibrator.py b/onnx_neural_compressor/algorithms/smoother/calibrator.py index 7fddd2cc9..fe0a862cc 100644 --- a/onnx_neural_compressor/algorithms/smoother/calibrator.py +++ b/onnx_neural_compressor/algorithms/smoother/calibrator.py @@ -22,10 +22,8 @@ import numpy as np import onnx import onnxruntime -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility + +from onnx_neural_compressor import data_reader, logger, onnx_model, utility class Calibrator: diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index d30f78003..d21641482 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -20,10 +20,8 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility + +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.algorithms.smoother import calibrator from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index 6ee1f7c9c..30d9e8442 100644 --- 
a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -22,15 +22,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility +from packaging import version + +from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model, utility from onnx_neural_compressor.algorithms.weight_only import rtn from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging import version from typing import List, Union # isort: skip @@ -66,8 +62,9 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, weight = [] org_out = [] for node in nodes: - if (node.name, node.op_type) in weight_config and \ - weight_config.get((node.name, node.op_type), "fp32") != "fp32": + if (node.name, node.op_type) in weight_config and weight_config.get( + (node.name, node.op_type), "fp32" + ) != "fp32": num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" @@ -128,7 +125,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, for node in nodes: weight_config.setdefault((node.name, node.op_type), {}).update({"weight_bits": num_bits}) weight_config.setdefault((node.name, node.op_type), {}).update({"weight_group_size": group_size}) - weight_config.setdefault((node.name, node.op_type), {}).update({"weight_sym": scheme=="sym"}) + weight_config.setdefault((node.name, node.op_type), {}).update({"weight_sym": scheme == "sym"}) init_share_num = model.get_initializer_share_num(node.input[1]) weight_tensor = model.get_initializer(node.input[1]) diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 07cc4cd1f..5016a2780 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -22,14 +22,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility +from packaging.version import Version + +from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging.version import Version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 8deb39f14..619c055e1 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -24,13 +24,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility +from packaging 
import version + +from onnx_neural_compressor import config, constants, onnx_model, utility from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging import version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/utility.py b/onnx_neural_compressor/algorithms/weight_only/utility.py index 6fee4cfc4..ddb5f990d 100644 --- a/onnx_neural_compressor/algorithms/weight_only/utility.py +++ b/onnx_neural_compressor/algorithms/weight_only/utility.py @@ -25,10 +25,10 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility from packaging import version +from onnx_neural_compressor import constants, utility + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover import onnxruntime_extensions @@ -119,8 +119,8 @@ def make_matmul_weight_only_node( even_idx = idx[::2] odd_idx = idx[1::2] # vectorized operation for even and odd indices - packed_zp[even_idx // 2] = ((packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()) - packed_zp[odd_idx // 2] = ((packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)) + packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() + packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) zp_tensor = onnx.helper.make_tensor( name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True @@ -282,7 +282,7 @@ def quant_tensor( max_range = np.maximum(np.abs(rmin), np.abs(rmax)) scale = np.ones(rmax.shape) - mask = (max_range > 0) + mask = max_range > 0 scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq) zero_point = ( np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) @@ -305,6 +305,7 @@ def quant_tensor( return q_weight, scale, zero_point + def qdq_tensor( data: np.array, num_bits: int = 4, diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/config.py index 61ab8fc67..b6fad923a 100644 --- a/onnx_neural_compressor/config.py +++ b/onnx_neural_compressor/config.py @@ -23,19 +23,16 @@ import json import pathlib import re -from abc import ABC -from abc import abstractmethod +from abc import ABC, abstractmethod import numpy as np import onnx import pydantic -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility from onnxruntime import quantization from typing_extensions import Self +from onnx_neural_compressor import constants, data_reader, logger, utility + from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip @@ -1239,4 +1236,4 @@ def generate_nc_sq_config(quant_config: quantization.StaticQuantConfig): quant_config.extra_options["SmoothQuant"] = False quant_config_dict = quant_config.to_dict() nc_sq_config = SmoothQuantConfig(**quant_kwargs, **quant_config_dict) - return nc_sq_config \ No newline at end of file + return nc_sq_config diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 5e8921bd2..061f7cad8 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ 
-21,11 +21,10 @@ import onnx import transformers -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility from onnxruntime.quantization import onnx_model +from onnx_neural_compressor import constants, logger, utility + class ONNXModel(onnx_model.ONNXModel): """Build ONNX model.""" diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 7245f8724..7ef91659a 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. -from onnxruntime.quantization.quant_utils import QuantFormat -from onnxruntime.quantization.quant_utils import QuantType +from onnxruntime.quantization.quant_utils import QuantFormat, QuantType + from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 982ea3a14..cd079932c 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -17,17 +17,12 @@ from typing import Union import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility -from onnx_neural_compressor.algorithms.smoother import core -from onnx_neural_compressor.algorithms.weight_only import awq -from onnx_neural_compressor.algorithms.weight_only import gptq -from onnx_neural_compressor.algorithms.weight_only import rtn from onnxruntime import quantization +from onnx_neural_compressor import config, constants, data_reader, logger, utility +from onnx_neural_compressor.algorithms.smoother import core +from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn + ###################### SmoothQuant Entry ################################## @utility.register_algo(name=constants.SMOOTH_QUANT) diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index ea1cf62a9..62a671fba 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,9 +15,10 @@ from typing import List, Union # isort: skip import onnx -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from onnxruntime.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import matmul_nbits_quantizer + RTNWeightOnlyQuantConfig = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig AWQWeightOnlyQuantConfig = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig GPTQWeightOnlyQuantConfig = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index 1b6b3f1c7..0d00bbbc5 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -15,14 +15,11 @@ from typing import List, Union # isort: skip import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility 
-from onnx_neural_compressor.quantization import algorithm_entry as algos from onnxruntime.quantization import matmul_4bits_quantizer +from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility +from onnx_neural_compressor.quantization import algorithm_entry as algos + class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index f586655dc..7e388e3aa 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -16,9 +16,10 @@ from typing import Union import onnx +from onnxruntime.quantization.quantize import QuantConfig + from onnx_neural_compressor import config from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnxruntime.quantization.quantize import QuantConfig # ORT-like user-facing API diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index 91e7eae14..a6743ad7a 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -19,10 +19,8 @@ import uuid import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility + +from onnx_neural_compressor import config, data_reader, logger, utility from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index 0cb7b1335..cc36b6e8a 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -23,10 +23,10 @@ import numpy as np import onnx import psutil -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger from onnxruntime.quantization import onnx_model +from onnx_neural_compressor import constants, logger + from typing import Callable, Dict, List, Tuple, Union # isort: skip # Dictionary to store a mapping between algorithm names and corresponding algo implementation(function) diff --git a/pyproject.toml b/pyproject.toml index 06b02dfe1..9d46c3db1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,11 @@ [tool.isort] profile = "black" line_length = 120 -known_first_party = ["neural_compressor"] extend_skip_glob = ["**/__init__.py"] -force_single_line = true - [tool.black] line-length = 120 - [tool.codespell] skip = '*.po,*.ts,*.js,*.map,*.js.map,*.css.map,.azure-pipelines/scripts/codeScan/codespell/inc_dict.txt' count = '' diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt new file mode 100644 index 000000000..e37a0353a --- /dev/null +++ b/requirements-lintrunner.txt @@ -0,0 +1,4 @@ +lintrunner_adapters +ruff==0.4.5 +black==24.3.0 +isort==5.13.2 diff --git a/setup.py b/setup.py index cdc3d0479..c80178535 100644 --- a/setup.py +++ b/setup.py @@ -49,8 +49,16 @@ def get_build_version(): url="", packages=setuptools.find_packages(), include_package_data=True, - install_requires=["onnx", "onnxruntime", "onnxruntime-extensions", "psutil", "numpy", - "py-cpuinfo", "pydantic", "transformers"], + install_requires=[ + "onnx", + "onnxruntime", + "onnxruntime-extensions", + "psutil", + "numpy", + "py-cpuinfo", + "pydantic", + "transformers", + ], python_requires=">=3.8.0", classifiers=[ "Intended Audience :: Science/Research", diff --git 
a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index 994387eb4..af0bca3e4 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -8,12 +8,11 @@ import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer import torch import transformers -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import config, data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path): diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 031b8369e..0e86c64b9 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -24,11 +24,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor.quantization import tuning from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, data_reader +from onnx_neural_compressor.quantization import tuning + from typing import Callable, Dict, List, Optional, Union # isort: skip diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index a7e142978..50ffc74d0 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -5,12 +5,11 @@ import numpy as np import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility -from onnx_neural_compressor.quantization import algorithm_entry as algos from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, logger, utility +from onnx_neural_compressor.quantization import algorithm_entry as algos + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 56962af85..fed59e142 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -21,12 +21,12 @@ import numpy as np import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import config, data_reader from onnx_neural_compressor.quantization import QuantType from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import quantize -from optimum.exporters.onnx import main_export class DataReader(data_reader.CalibrationDataReader): diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index 82a003791..2d918cc61 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -6,14 +6,12 @@ import torch import transformers -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from 
onnx_neural_compressor.quantization import matmul_nbits_quantizer from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index cc5df2cf9..133e11fd1 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -6,14 +6,12 @@ import torch import transformers -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py index 7f75edc41..86b3c49a3 100644 --- a/test/quantization/weight_only/test_rtn.py +++ b/test/quantization/weight_only/test_rtn.py @@ -4,13 +4,12 @@ import shutil import unittest -from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, logger +from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/utils/test_general.py b/test/utils/test_general.py index e1c89b142..d24392438 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -2,9 +2,7 @@ import unittest -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger +from onnx_neural_compressor import config, constants, logger from onnx_neural_compressor.quantization import tuning from typing import Any, Callable, List, Optional, Tuple, Union # isort: skip
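
The import rewrites in the hunks above all converge on the layout that isort produces with the `black` profile and 120-column limit configured in `pyproject.toml`, now that `force_single_line` has been dropped so names from the same module are combined onto one line. The snippet below is an illustrative sketch only, not part of the patch; the module names are taken from the hunks above and it assumes those packages are installed.

```python
# Sketch of the import grouping enforced by isort (profile = "black", line_length = 120):
# standard-library imports first, then third-party, then first-party,
# each group separated by a blank line, with same-module names combined.
import json
import os

import numpy as np
import onnx
import onnxruntime as ort

from onnx_neural_compressor import config, data_reader, logger, utility
from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning
```

With `known_first_party` removed from `pyproject.toml`, isort falls back to its own first-party detection, which appears sufficient here for `onnx_neural_compressor` imports to land in the final group, while `optimum`, `torch`, and other installed dependencies stay in the third-party block.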