From 0f8c3a6a5d3413920dbcfd925efdcb2de02ba06a Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Tue, 4 Jun 2024 02:18:17 -0700 Subject: [PATCH] Adopt lintrunner and enable github actions lint checks 1/2 (#16) Signed-off-by: Justin Chu --- .github/workflows/lint.yaml | 84 +++++ .lintrunner.toml | 97 +++++ .pre-commit-config.yaml | 123 ------- .../weight_only/evaluation/accuracy.py | 45 +-- .../weight_only/evaluation/evaluator.py | 111 ++---- .../weight_only/evaluation/models/__init__.py | 1 - .../evaluation/models/huggingface.py | 336 ++++++------------ .../weight_only/evaluation/utils.py | 58 +-- .../llama/quantization/weight_only/main.py | 225 +++++------- .../quantization/weight_only/prepare_model.py | 4 +- .../algorithms/layer_wise/core.py | 6 +- .../algorithms/smoother/calibrator.py | 6 +- .../algorithms/smoother/core.py | 6 +- .../algorithms/weight_only/awq.py | 17 +- .../algorithms/weight_only/gptq.py | 9 +- .../algorithms/weight_only/rtn.py | 8 +- .../algorithms/weight_only/utility.py | 11 +- onnx_neural_compressor/config.py | 11 +- onnx_neural_compressor/onnx_model.py | 5 +- .../quantization/__init__.py | 4 +- .../quantization/algorithm_entry.py | 13 +- .../quantization/matmul_4bits_quantizer.py | 3 +- .../quantization/matmul_nbits_quantizer.py | 9 +- .../quantization/quantize.py | 3 +- onnx_neural_compressor/quantization/tuning.py | 6 +- onnx_neural_compressor/utility.py | 4 +- pyproject.toml | 4 - requirements-lintrunner.txt | 4 + setup.py | 12 +- .../layer_wise/test_layer_wise.py | 7 +- test/quantization/test_autotune.py | 6 +- test/quantization/test_config.py | 7 +- test/quantization/test_smooth_quant.py | 6 +- test/quantization/weight_only/test_awq.py | 10 +- test/quantization/weight_only/test_gptq.py | 10 +- test/quantization/weight_only/test_rtn.py | 9 +- test/utils/test_general.py | 4 +- 37 files changed, 546 insertions(+), 738 deletions(-) create mode 100644 .github/workflows/lint.yaml create mode 100644 .lintrunner.toml delete mode 100644 .pre-commit-config.yaml create mode 100644 requirements-lintrunner.txt diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 000000000..9839352d0 --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,84 @@ +# Copyright (c) ONNX Neural Compressor Project Contributors +# +# SPDX-License-Identifier: Apache-2.0 + +name: Lint + +on: + push: + branches: + - main + pull_request: + merge_group: + +permissions: # set top-level default permissions as security best practice + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + optional-lint: + name: Optional Lint + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: misspell # Check spellings as well + uses: reviewdog/action-misspell@5bd7be2fc7ae56a517184f5c4bbcf2fd7afe3927 # v1.17.0 + with: + github_token: ${{ secrets.github_token }} + locale: "US" + reporter: github-pr-check + level: info + filter_mode: diff_context + - name: shellcheck # Static check shell scripts + uses: reviewdog/action-shellcheck@72365a51bf6476fe952a117c3ff703eb7775e40a # v1.20.0 + with: + github_token: ${{ secrets.github_token }} + reporter: github-pr-check + level: info + filter_mode: diff_context + + enforce-style: + name: Enforce style + runs-on: ubuntu-latest + permissions: + security-events: write + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: 
Setup Python + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: "3.12" + - name: Install ONNX Neural Compressor + run: | + pip install . + - name: Install dependencies + run: | + python -m pip install lintrunner lintrunner-adapters + lintrunner init + - name: Run lintrunner on all files + run: | + set +e + if ! lintrunner --force-color --all-files --tee-json=lint.json -v; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`.\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/onnx/neural-compressor/blob/main/.lintrunner.toml for setup instructions.\e[0m" + exit 1 + fi + - name: Produce SARIF + if: always() + run: | + python -m lintrunner_adapters to-sarif lint.json lintrunner.sarif + - name: Upload SARIF file + # Use always() to always upload SARIF even if lintrunner returns with error code + # To toggle linter comments in the files page, press `i` on the keyboard + if: always() + continue-on-error: true + uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4 + with: + # Path to SARIF file relative to the root of the repository + sarif_file: lintrunner.sarif + category: lintrunner + checkout_path: ${{ github.workspace }} diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 000000000..5a5298134 --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,97 @@ +# Configuration for lintrunner https://github.com/suo/lintrunner +# You can install the dependencies and initialize with +# +# ```sh +# pip install lintrunner lintrunner-adapters +# lintrunner init +# ``` +# +# This will install lintrunner on your system and download all the necessary +# dependencies to run linters locally. +# If you want to see what lintrunner init will install, run +# `lintrunner init --dry-run`. +# +# To lint local changes: +# +# ```bash +# lintrunner +# ``` +# +# To lint all files: +# +# ```bash +# lintrunner --all-files +# ``` +# +# To format files: +# +# ```bash +# lintrunner -a +# ``` +# +# To read more about lintrunner, see [wiki](https://github.com/pytorch/pytorch/wiki/lintrunner). +# To update an existing linting rule or create a new one, modify this file or create a +# new adapter following examples in https://github.com/justinchuby/lintrunner-adapters. 
+merge_base_with = 'main' + +[[linter]] +code = 'RUFF' +include_patterns = [ + '**/*.py', + '**/*.pyi', +] +exclude_patterns = [ + '*_pb2*', + '.setuptools-cmake-build/*', + 'docs/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'ruff_linter', + '--config=pyproject.toml', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true + +[[linter]] +code = 'BLACK-ISORT' +include_patterns = [ + '**/*.py', +] +exclude_patterns = [ + '*_pb2*', + '.setuptools-cmake-build/*', + 'cmake/**', + 'docs/**', +] +command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'black_isort_linter', + '--', + '@{{PATHSFILE}}' +] +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + '--requirement=requirements-lintrunner.txt', +] +is_formatter = true diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index a028ec376..000000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,123 +0,0 @@ -ci: - autofix_prs: true - autoupdate_schedule: quarterly - -exclude: | - (?x)^( - conda_meta/.+| - )$ - -repos: - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 - hooks: - - id: end-of-file-fixer - files: (.*\.(py|md|rst|yaml|yml))$ - exclude: | - (?x)^( - examples/.+ - )$ - - id: check-json - exclude: | - (?x)^( - .vscode/settings_recommended.json - )$ - - id: check-yaml - exclude: | - (?x)^( - conda_meta/| - )$ - - id: debug-statements - - id: file-contents-sorter - exclude: | - (?x)^( - examples/.+ - )$ - args: [--unique] - - id: requirements-txt-fixer - exclude: | - (?x)^( - examples/.+ - )$ - - id: trailing-whitespace - files: (.*\.(py|rst|cmake|yaml|yml))$ - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/asottile/yesqa - rev: v1.5.0 - hooks: - - id: yesqa - name: Unused noqa - - - repo: https://github.com/pycqa/isort - rev: 5.13.2 - hooks: - - id: isort - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.5 - hooks: - - id: docformatter - args: [ - --in-place, - --wrap-summaries=0, # 0 means disable wrap - --wrap-descriptions=0, # 0 means disable wrap - --black, - --style=google, - ] - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/psf/black.git - rev: 24.3.0 - hooks: - - id: black - files: (.*\.py)$ - exclude: | - (?x)^( - examples/.+ - )$ - - - repo: https://github.com/asottile/blacken-docs - rev: 1.16.0 - hooks: - - id: blacken-docs - args: [--line-length=120, --skip-errors] - additional_dependencies: - - black==24.3.0 - exclude: | - (?x)^( - examples/.+| - docs/source-app - )$ - - - repo: https://github.com/codespell-project/codespell - rev: v2.2.6 - hooks: - - id: codespell - args: [-w] - additional_dependencies: - - tomli - exclude: | - (?x)^( - examples/.*(txt|patch)| - )$ - - - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.5 - hooks: - - id: ruff - args: [--fix, --exit-non-zero-on-fix, --no-cache] - exclude: | - (?x)^( - examples/.+ - )$ diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py index 045d28c8b..5608307f6 100644 --- 
a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/accuracy.py @@ -13,19 +13,18 @@ # limitations under the License. +import glob import json import logging import os import re import sys -import glob from pathlib import Path -import numpy as np import lm_eval.logging_utils import lm_eval.tasks import lm_eval.utils - +import numpy as np from evaluation import evaluator DEFAULT_RESULTS_FILE = "results.json" @@ -52,9 +51,7 @@ def cli_evaluate(args) -> None: if args.predict_only: args.log_samples = True if (args.log_samples or args.predict_only) and not args.output_path: - raise ValueError( - "Specify --output_path if providing --log_samples or --predict_only" - ) + raise ValueError("Specify --output_path if providing --log_samples or --predict_only") if args.include_path is not None: eval_logger.info(f"Including path: {args.include_path}") @@ -62,17 +59,14 @@ def cli_evaluate(args) -> None: if args.limit: eval_logger.warning( - " --limit SHOULD ONLY BE USED FOR TESTING." - "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." + " --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." ) if args.tasks is None: eval_logger.error("Need to specify task to evaluate.") sys.exit() elif args.tasks == "list": - eval_logger.info( - "Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks)) - ) + eval_logger.info("Available Tasks:\n - {}".format("\n - ".join(task_manager.all_tasks))) sys.exit() else: if os.path.isdir(args.tasks): @@ -99,8 +93,8 @@ def cli_evaluate(args) -> None: f"{lm_eval.utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", ) raise ValueError( - f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks," + \ - " or '--verbosity DEBUG' to troubleshoot task registration issues." + f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks," + + " or '--verbosity DEBUG' to troubleshoot task registration issues." ) if args.output_path: @@ -110,9 +104,7 @@ def cli_evaluate(args) -> None: raise FileExistsError(f"File already exists at {path}") output_path_file = path.joinpath(DEFAULT_RESULTS_FILE) if output_path_file.is_file(): - eval_logger.warning( - f"File {output_path_file} already exists. Results will be overwritten." - ) + eval_logger.warning(f"File {output_path_file} already exists. 
Results will be overwritten.") # if path json then get parent dir elif path.suffix in (".json", ".jsonl"): output_path_file = path @@ -124,17 +116,12 @@ def cli_evaluate(args) -> None: # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args if args.trust_remote_code: os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code) - args.model_args = ( - args.model_args - + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}" - ) + args.model_args = args.model_args + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}" eval_logger.info(f"Selected Tasks: {task_names}") eval_logger.info("Loading selected tasks...") - request_caching_args = evaluator.request_caching_arg_to_dict( - cache_requests=args.cache_requests - ) + request_caching_args = evaluator.request_caching_arg_to_dict(cache_requests=args.cache_requests) results = evaluator.simple_evaluate( model=args.model, @@ -156,17 +143,15 @@ def cli_evaluate(args) -> None: random_seed=args.seed[0], numpy_random_seed=args.seed[1], torch_random_seed=args.seed[2], - user_model=args.user_model, # to validate the model in memory, - tokenizer=args.tokenizer, # to use tokenizer in mem, + user_model=args.user_model, # to validate the model in memory, + tokenizer=args.tokenizer, # to use tokenizer in mem, **request_caching_args, ) if results is not None: if args.log_samples: samples = results.pop("samples") - dumped = json.dumps( - results, indent=2, default=_handle_non_serializable, ensure_ascii=False - ) + dumped = json.dumps(results, indent=2, default=_handle_non_serializable, ensure_ascii=False) if args.show_config: print(dumped) @@ -187,9 +172,7 @@ def cli_evaluate(args) -> None: if args.log_samples: for task_name, config in results["configs"].items(): - output_name = "{}_{}".format( - re.sub("/|=", "__", args.model_args), task_name - ) + output_name = "{}_{}".format(re.sub("/|=", "__", args.model_args), task_name) filename = path.joinpath(f"{output_name}.jsonl") samples_dumped = json.dumps( samples[task_name], diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py index e4a92565b..2b4a8b2d2 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/evaluator.py @@ -13,36 +13,34 @@ # limitations under the License. 
+import collections import itertools import logging import random import time -import collections from typing import TYPE_CHECKING, List, Optional, Union -import numpy as np -import torch - import lm_eval.api.metrics import lm_eval.api.registry -import lm_eval.models import lm_eval.caching.cache import lm_eval.evaluator_utils import lm_eval.logging_utils +import lm_eval.models import lm_eval.utils - +import numpy as np import optimum.onnxruntime - +import torch from evaluation.models import huggingface if TYPE_CHECKING: import lm_eval.api.model import lm_eval.tasks + @lm_eval.utils.positional_deprecated def simple_evaluate( model, - model_args: Optional[Union[str, dict,object]] = None, + model_args: Optional[Union[str, dict, object]] = None, tasks: Optional[List[Union[str, dict, object]]] = None, num_fewshot: Optional[int] = None, batch_size: Optional[int] = None, @@ -152,9 +150,7 @@ def simple_evaluate( if tasks is None: tasks = [] if len(tasks) == 0: - raise ValueError( - "No tasks specified, or no tasks found. Please verify the task names." - ) + raise ValueError("No tasks specified, or no tasks found. Please verify the task names.") if gen_kwargs is not None: gen_kwargs = lm_eval.utils.simple_parse_args_string(gen_kwargs) @@ -181,9 +177,9 @@ def simple_evaluate( model_id = "fxmarty/onnx-tiny-random-gpt2-with-merge" elif isinstance(user_model, optimum.onnxruntime.ORTModelForSeq2SeqLM): model_id = "optimum/t5-small" - lm_eval.utils.eval_logger.info("We use '{}' to build `LM` instance, the actually run model is user_model you passed.".format( - model_id - )) + lm_eval.utils.eval_logger.info( + "We use '{}' to build `LM` instance, the actually run model is user_model you passed.".format(model_id) + ) lm = lm_eval.api.registry.get_model(model).create_from_arg_string( "pretrained=" + model_id, { @@ -244,9 +240,7 @@ def simple_evaluate( if task_obj.get_config("output_type") == "generate_until": if gen_kwargs is not None: - task_obj.set_config( - key="generation_kwargs", value=gen_kwargs, update=True - ) + task_obj.set_config(key="generation_kwargs", value=gen_kwargs, update=True) if predict_only: log_samples = True @@ -261,8 +255,8 @@ def simple_evaluate( if num_fewshot is not None: if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: lm_eval.utils.eval_logger.info( - f"num_fewshot has been set to 0 for {task_name} in its config." + \ - "Manual configuration will be ignored." + f"num_fewshot has been set to 0 for {task_name} in its config." + + "Manual configuration will be ignored." ) else: lm_eval.utils.eval_logger.warning( @@ -302,9 +296,7 @@ def simple_evaluate( "model": model_name, "model_args": model_args, "batch_size": batch_size, - "batch_sizes": ( - list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [] - ), + "batch_sizes": (list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []), "provider": provider, "use_cache": use_cache, "limit": limit, @@ -316,9 +308,7 @@ def simple_evaluate( try: lm_eval.logging_utils.add_env_info(results) # additional environment info to results except: - lm_eval.utils.eval_logger.info( - f"get env info failed." 
- ) + lm_eval.utils.eval_logger.info("get env info failed.") return results else: return None @@ -373,8 +363,7 @@ def evaluate( task_hierarchy, eval_tasks = lm_eval.evaluator_utils.get_task_list(task_dict) if not log_samples: if not all( - "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() - for task_output in eval_tasks + "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() for task_output in eval_tasks ): raise ValueError("log_samples must be True for 'bypass' metric-only tasks") for task_output in eval_tasks: @@ -400,15 +389,9 @@ def evaluate( if lm.world_size > 1: instances_rnk = torch.tensor(len(task._instances), device=torch.device("cpu")) - gathered_item = ( - lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() - ) + gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() # "multiple_choice" task types dispatch (several) "loglikelihood" request types - reqtype = ( - "loglikelihood" - if task.OUTPUT_TYPE == "multiple_choice" - else task.OUTPUT_TYPE - ) + reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) numpad = max(gathered_item) - gathered_item[lm.rank] # todo: may not account for padding in cases like SquadV2 which has multiple req types @@ -457,14 +440,10 @@ def evaluate( instances.sort(key=lambda x: x.idx) # iterate over different filters used for filter_key in task.instances[0].filtered_resps.keys(): - doc_iterator = task.doc_iterator( - rank=RANK, limit=limit, world_size=WORLD_SIZE - ) + doc_iterator = task.doc_iterator(rank=RANK, limit=limit, world_size=WORLD_SIZE) for doc_id, doc in doc_iterator: requests = instances_by_doc_id[doc_id] - metrics = task.process_results( - doc, [req.filtered_resps[filter_key] for req in requests] - ) + metrics = task.process_results(doc, [req.filtered_resps[filter_key] for req in requests]) if log_samples: target = task.doc_to_target(doc) example = { @@ -473,9 +452,7 @@ def evaluate( "target": target, "arguments": [req.args for req in requests], "resps": [req.resps for req in requests], - "filtered_resps": [ - req.filtered_resps[filter_key] for req in requests - ], + "filtered_resps": [req.filtered_resps[filter_key] for req in requests], } example.update(metrics) task_output.logged_samples.append(example) @@ -496,9 +473,7 @@ def evaluate( ) if RANK == 0: - task_output.logged_samples = list( - itertools.chain.from_iterable(full_samples) - ) + task_output.logged_samples = list(itertools.chain.from_iterable(full_samples)) # then collect metrics across all ranks for metrics in task_output.sample_metrics: @@ -509,18 +484,14 @@ def evaluate( dst=0, ) if RANK == 0: - task_output.sample_metrics[metrics] = list( - itertools.chain.from_iterable(metric_list) - ) + task_output.sample_metrics[metrics] = list(itertools.chain.from_iterable(metric_list)) if RANK == 0: ### Aggregate results over all datapoints ### # aggregate results ; run bootstrap CIs for task_output in eval_tasks: task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) - results, samples, configs, versions, num_fewshot = lm_eval.evaluator_utils.consolidate_results( - eval_tasks - ) + results, samples, configs, versions, num_fewshot = lm_eval.evaluator_utils.consolidate_results(eval_tasks) ### Calculate group metrics ### if bool(results): @@ -543,33 +514,17 @@ def evaluate( stderr = "_stderr,".join(metric.split(",")) # gather metrics, sizes, and stderrs from 
subtasks - metrics = [ - results[task][metric] - for task in task_list - if metric in results[task] - ] # TODO: copy? - stderrs = [ - results[task][stderr] - for task in task_list - if stderr in results[task] - ] - sizes = [ - results[task]["samples"] - for task in task_list - if metric in results[task] - ] + metrics = [results[task][metric] for task in task_list if metric in results[task]] # TODO: copy? + stderrs = [results[task][stderr] for task in task_list if stderr in results[task]] + sizes = [results[task]["samples"] for task in task_list if metric in results[task]] # compute group's pooled metric and stderr - results[group][metric] = ( - lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) - ) + results[group][metric] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) # TODO: calculate grouped metric using aggregation fn if "N/A" in stderrs: results[group][stderr] = "N/A" else: - results[group][stderr] = ( - lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) - ) + results[group][stderr] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility # To use the old (likely incorrect) variance formula, # comment out the above and uncomment this line: @@ -587,9 +542,7 @@ def evaluate( if len(left_tasks_list) == 0: break - _task_hierarchy = { - k: v for k, v in task_hierarchy.items() if k in left_tasks_list - } + _task_hierarchy = {k: v for k, v in task_hierarchy.items() if k in left_tasks_list} _results_agg, _groups_agg = lm_eval.evaluator_utils.prepare_print_tasks(_task_hierarchy, results) results_agg = {**results_agg, **_results_agg} @@ -597,9 +550,7 @@ def evaluate( for group_name, task_list in task_hierarchy.items(): if task_list: - num_fewshot[group_name] = num_fewshot[ - task_list[0] - ] # TODO: validate this + num_fewshot[group_name] = num_fewshot[task_list[0]] # TODO: validate this results_dict = { "results": dict(results_agg.items()), diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py index 6a7755e15..8a19e05fd 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/__init__.py @@ -15,7 +15,6 @@ from evaluation.models import huggingface - # TODO: implement __all__ diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py index 1eb8cd49a..b682e4f47 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/models/huggingface.py @@ -13,30 +13,29 @@ # limitations under the License. 
-import accelerate import copy -import huggingface_hub import os -import packaging.version import tempfile -import tqdm -import transformers from typing import List, Literal, Optional, Tuple, Union -import torch -import torch.nn.functional as F - -import lm_eval.utils +import accelerate +import huggingface_hub import lm_eval.api.instance import lm_eval.api.model import lm_eval.models.utils - +import lm_eval.utils import onnxruntime -import optimum.version import optimum.onnxruntime +import optimum.version +import packaging.version +import torch +import torch.nn.functional as F +import tqdm +import transformers eval_logger = lm_eval.utils.eval_logger + class HFLM(lm_eval.api.model.TemplateLM): """An abstracted Huggingface model class. Enables usage with both models of `optimum.onnxruntime.ORTModelForCausalLM` and @@ -75,22 +74,21 @@ def __init__( available_providers = onnxruntime.get_available_providers() assert provider in available_providers, "{} is not available.".format(provider) self._provider = provider - self._device = torch.device("cpu") # use cpu to generate torch tensor + self._device = torch.device("cpu") # use cpu to generate torch tensor # optionally: take in an already-initialized ORTModel if not isinstance(pretrained, str): eval_logger.warning( - "`pretrained` model kwarg is not of type `str`. "+ \ - "Many other model arguments may be ignored. " + "`pretrained` model kwarg is not of type `str`. " + "Many other model arguments may be ignored. " ) self._model = pretrained self._config = self._model.config self.model.providers if tokenizer: - assert isinstance( - tokenizer, transformers.PreTrainedTokenizer - ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + assert isinstance(tokenizer, transformers.PreTrainedTokenizer) or isinstance( + tokenizer, transformers.PreTrainedTokenizerFast + ) self.tokenizer = tokenizer else: # Get tokenizer @@ -112,9 +110,7 @@ def __init__( ) # determine which of 'causal' and 'seq2seq' backends to use - self._get_backend( - config=self.config, backend=backend, trust_remote_code=trust_remote_code - ) + self._get_backend(config=self.config, backend=backend, trust_remote_code=trust_remote_code) # if we passed `pretrained` as a string, initialize our model now if isinstance(pretrained, str): @@ -162,8 +158,8 @@ def __init__( if getattr(self.config, "model_type", None) == "gemma": self.add_bos_token = True eval_logger.info( - f"Model type is '{self.config.model_type}', " + \ - "a BOS token will be used as Gemma underperforms without it." + f"Model type is '{self.config.model_type}', " + + "a BOS token will be used as Gemma underperforms without it." ) self._max_length = max_length @@ -182,8 +178,8 @@ def __init__( if not isinstance(pretrained, str): # if a PreTrainedModel was passed into HFLM, we forgo distributed setup. 
eval_logger.warning( - "Passed an already-initialized model through `pretrained`," + \ - " assuming single-process call to evaluate() or custom distributed integration" + "Passed an already-initialized model through `pretrained`," + + " assuming single-process call to evaluate() or custom distributed integration" ) self._rank = 0 self._world_size = 1 @@ -254,9 +250,7 @@ def _get_backend( self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM elif backend == "seq2seq": self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM - eval_logger.info( - f"Overrode HF model backend type, and using type '{backend}'" - ) + eval_logger.info(f"Overrode HF model backend type, and using type '{backend}'") else: # determine and use the default HF backend for this model, based on its config + metadata. if ( @@ -268,8 +262,8 @@ def _get_backend( # these special cases should be treated as seq2seq models. self.AUTO_MODEL_CLASS = transformers.AutoModelForSeq2SeqLM elif ( - getattr(self.config, "model_type") in - transformers.models.auto.modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + getattr(self.config, "model_type") + in transformers.models.auto.modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES ): self.AUTO_MODEL_CLASS = transformers.AutoModelForCausalLM else: @@ -308,8 +302,8 @@ def _create_model( if not os.path.exists(pretrained): eval_logger.warning("`{}` path does not exist. Will try to download it from huggingface.") try: - local_dir = tempfile.TemporaryDirectory().name - huggingface_hub.snapshot_download(pretrained, local_dir =local_dir ) + local_dir = tempfile.TemporaryDirectory().name + huggingface_hub.snapshot_download(pretrained, local_dir=local_dir) pretrained = local_dir except Exception as e: raise e @@ -317,121 +311,106 @@ def _create_model( if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: if ( not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) - and not os.path.exists( - os.path.join(pretrained, "decoder_with_past_model.onnx") - ) - and not os.path.exists( - os.path.join(pretrained, "decoder_model_merged.onnx") - ) + and not os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")) + and not os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")) and not os.path.exists(os.path.join(pretrained, "model.onnx")) ): raise ValueError( - "Couldn't find any ONNX model name in " + \ - "['decoder_model.onnx', 'decoder_with_past_model.onnx', " - "'decoder_model_merged.onnx', 'model.onnx'] in {}.".format( - pretrained - ) + "Couldn't find any ONNX model name in " + "['decoder_model.onnx', 'decoder_with_past_model.onnx', " + "'decoder_model_merged.onnx', 'model.onnx'] in {}.".format(pretrained) ) sess_options = onnxruntime.SessionOptions() - sess_options.graph_optimization_level = ( - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - ) + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL if packaging.version.Version(optimum.version.__version__) >= packaging.version.Version("1.14.0"): if os.path.exists(os.path.join(pretrained, "model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( - os.path.join(pretrained, "model.onnx"), - provider=self.provider, - session_options=sess_options) + os.path.join(pretrained, "model.onnx"), provider=self.provider, session_options=sess_options + ) inputs_names = [input.name for input in session.get_inputs()] key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 - 
self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=True if use_cache else False, - use_io_binding=True if use_cache else False) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + session, + self.config, + use_cache=True if use_cache else False, + use_io_binding=True if use_cache else False, + ) else: if os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model_merged.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM(session, self.config, use_cache=True) elif os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_with_past_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM(session, self.config, use_cache=True) elif os.path.exists(os.path.join(pretrained, "decoder_model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(session, - self.config, - use_cache=False, - use_io_binding=False) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + session, self.config, use_cache=False, use_io_binding=False + ) else: if os.path.exists(os.path.join(pretrained, "model.onnx")): session = optimum.onnxruntime.ORTModelForCausalLM.load_model( - os.path.join(pretrained, "model.onnx"), - provider=self.provider, - session_options=sess_options) + os.path.join(pretrained, "model.onnx"), provider=self.provider, session_options=sess_options + ) inputs_names = session.get_inputs() key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 - self._model = optimum.onnxruntime.ORTModelForCausalLM(session[0], - self.config, - pretrained, - use_cache=True if use_cache else False, - use_io_binding=True if use_cache else False,) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + session[0], + self.config, + pretrained, + use_cache=True if use_cache else False, + use_io_binding=True if use_cache else False, + ) else: if os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")): sessions = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model_merged.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(sessions[0], - self.config, - pretrained, - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + sessions[0], self.config, pretrained, use_cache=True + ) elif os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")): sessions = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model.onnx"), os.path.join(pretrained, "decoder_with_past_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = 
optimum.onnxruntime.ORTModelForCausalLM(sessions[0], - self.config, - pretrained, - sessions[1], - use_cache=True) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + sessions[0], self.config, pretrained, sessions[1], use_cache=True + ) else: sessions = optimum.onnxruntime.ORTModelForCausalLM.load_model( os.path.join(pretrained, "decoder_model.onnx"), provider=self.provider, - session_options=sess_options) - self._model = optimum.onnxruntime.ORTModelForCausalLM(sessions[0], - self.config, - pretrained, - use_cache=False, - use_io_binding=False) + session_options=sess_options, + ) + self._model = optimum.onnxruntime.ORTModelForCausalLM( + sessions[0], self.config, pretrained, use_cache=False, use_io_binding=False + ) elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - if not os.path.exists( - os.path.join(pretrained, "encoder_model.onnx") - ) or ( + if not os.path.exists(os.path.join(pretrained, "encoder_model.onnx")) or ( not os.path.exists(os.path.join(pretrained, "decoder_model.onnx")) - and not os.path.exists( - os.path.join(pretrained, "decoder_model_merged.onnx") - ) + and not os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")) ): raise ValueError( "Please ensure encoder_model.onnx and " @@ -439,12 +418,8 @@ def _create_model( ) sess_options = onnxruntime.SessionOptions() - sess_options.graph_optimization_level = ( - onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL - ) - if os.path.exists( - os.path.join(pretrained, "decoder_model_merged.onnx") - ): + sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL + if os.path.exists(os.path.join(pretrained, "decoder_model_merged.onnx")): sessions = optimum.onnxruntime.ORTModelForSeq2SeqLM.load_model( os.path.join(pretrained, "encoder_model.onnx"), os.path.join(pretrained, "decoder_model_merged.onnx"), @@ -458,9 +433,7 @@ def _create_model( use_cache=True, ) - elif os.path.exists( - os.path.join(pretrained, "decoder_with_past_model.onnx") - ): + elif os.path.exists(os.path.join(pretrained, "decoder_with_past_model.onnx")): sessions = optimum.onnxruntime.ORTModelForSeq2SeqLM.load_model( os.path.join(pretrained, "encoder_model.onnx"), os.path.join(pretrained, "decoder_model.onnx"), @@ -520,9 +493,9 @@ def _create_tokenizer( use_fast=use_fast_tokenizer, ) else: - assert isinstance( - tokenizer, transformers.PreTrainedTokenizer - ) or isinstance(tokenizer, transformers.PreTrainedTokenizerFast) + assert isinstance(tokenizer, transformers.PreTrainedTokenizer) or isinstance( + tokenizer, transformers.PreTrainedTokenizerFast + ) self.tokenizer = tokenizer else: # Get tokenizer based on 'pretrained' @@ -542,9 +515,7 @@ def _create_tokenizer( def _detect_batch_size(self, requests=None, pos: int = 0): if requests: _, context_enc, continuation_enc = requests[pos] - max_length = len( - (context_enc + continuation_enc)[-(self.max_length + 1) :][:-1] - ) + max_length = len((context_enc + continuation_enc)[-(self.max_length + 1) :][:-1]) max_context_enc = len(context_enc[-(self.max_length + 1) :]) max_cont_enc = len(continuation_enc[-(self.max_length + 1) :]) else: @@ -555,9 +526,7 @@ def _detect_batch_size(self, requests=None, pos: int = 0): def forward_batch(batch_size): if self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: length = max(max_context_enc, max_cont_enc) - batched_conts = torch.ones( - (batch_size, length), device=self._device - ).long() + batched_conts = torch.ones((batch_size, length), device=self._device).long() 
test_batch = torch.ones((batch_size, length), device=self._device).long() call_kwargs = { "attn_mask": test_batch, @@ -565,13 +534,9 @@ def forward_batch(batch_size): } else: call_kwargs = {} - test_batch = torch.ones( - (batch_size, max_length), device=self._device - ).long() + test_batch = torch.ones((batch_size, max_length), device=self._device).long() for _ in range(5): - out = F.log_softmax( - self._model_call(test_batch, **call_kwargs), dim=-1 - ) + out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1) return batch_size @@ -586,9 +551,7 @@ def forward_batch(batch_size): if self.world_size > 1: # if multi-GPU, always take minimum over all selected batch sizes max_rnk_bs = torch.tensor([batch_size], device=self._device) - gathered = ( - self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist() - ) + gathered = self.accelerator.gather(max_rnk_bs).cpu().detach().numpy().tolist() batch_size = min(gathered) lm_eval.models.utils.clear_torch_cache() return batch_size @@ -596,9 +559,7 @@ def forward_batch(batch_size): lm_eval.models.utils.clear_torch_cache() return batch_size - def tok_encode( - self, string: str, left_truncate_len=None, add_special_tokens=None - ) -> List[int]: + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: if add_special_tokens is None: if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: add_special_tokens = False or self.add_bos_token @@ -639,22 +600,16 @@ def tok_batch_encode( ) if left_truncate_len: encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:] - encoding["attention_mask"] = encoding["attention_mask"][ - :, -left_truncate_len: - ] + encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:] self.tokenizer.padding_side = old_padding_side return encoding["input_ids"], encoding["attention_mask"] def tok_decode(self, tokens, skip_special_tokens=True): if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - return self.tokenizer.decode( - tokens, skip_special_tokens=skip_special_tokens - ) + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - return self.tokenizer.decode( - tokens, skip_special_tokens=skip_special_tokens - ) + return self.tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens) def _model_call(self, inps, attn_mask=None, labels=None): """Call model to get logits results. 
@@ -684,9 +639,7 @@ def _model_call(self, inps, attn_mask=None, labels=None): shifted_input_ids = labels.new_zeros(labels.shape) shifted_input_ids[..., 1:] = labels[..., :-1].clone() shifted_input_ids[..., 0] = decoder_start_token_id - shifted_input_ids.masked_fill_( - shifted_input_ids == -100, pad_token_id - ) + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) return self.model( inps, attention_mask=attn_mask, @@ -695,32 +648,27 @@ def _model_call(self, inps, attn_mask=None, labels=None): ).logits else: assert self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM - if hasattr(self.model, "config") and hasattr(self.model.config, "auto_map") and \ - "chatglm2" in self.model.config.auto_map["AutoConfig"]: + if ( + hasattr(self.model, "config") + and hasattr(self.model.config, "auto_map") + and "chatglm2" in self.model.config.auto_map["AutoConfig"] + ): input_bs, input_len = inps.shape bos = torch.tensor([64790, 64792]).repeat(input_bs, 1) inps = torch.cat((bos, inps), 1) - inputs_names = [ - input.name for input in self.model.model.get_inputs() - ] + inputs_names = [input.name for input in self.model.model.get_inputs()] if "position_ids" in inputs_names: # model is exported with optimum >= 1.14.0 with new input 'position_ids' input_shape = inps.shape - position_ids = ( - torch.arange(0, input_shape[-1], dtype=torch.long) - .unsqueeze(0) - .view(-1, input_shape[-1]) - ) + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) output = self.model( inps, torch.ones(inps.shape, dtype=torch.int64), position_ids, ).logits else: - output = self.model( - inps, torch.ones(inps.shape, dtype=torch.int64) - ).logits + output = self.model(inps, torch.ones(inps.shape, dtype=torch.int64)).logits return output def _model_generate(self, context, max_length, stop, **generation_kwargs): @@ -750,20 +698,14 @@ def _model_generate(self, context, max_length, stop, **generation_kwargs): **generation_kwargs, ) - def _select_cont_toks( - self, logits: torch.Tensor, contlen: int = None, inplen: int = None - ) -> torch.Tensor: + def _select_cont_toks(self, logits: torch.Tensor, contlen: int = None, inplen: int = None) -> torch.Tensor: if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM: - assert ( - contlen and inplen - ), "Must pass input len and cont. len to select scored logits for causal LM" + assert contlen and inplen, "Must pass input len and cont. len to select scored logits for causal LM" # discard right-padding. # also discard the input/context tokens. we'll only score continuations. logits = logits[inplen - contlen : inplen] elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: - assert ( - contlen and not inplen - ), "Selecting scored logits for Seq2SeqLM requires only cont. len" + assert contlen and not inplen, "Selecting scored logits for Seq2SeqLM requires only cont. len" # only discard right-padding. # the logits input to this fn only contain decoder-side tokens. 
logits = logits[:contlen] @@ -783,9 +725,7 @@ def loglikelihood_rolling( print(f"Determined Largest batch size: {batch_size}") adaptive_batch_size = batch_size - for (string,) in tqdm.tqdm( - [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0)) - ): + for (string,) in tqdm.tqdm([req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))): rolling_token_windows = list( map( lm_eval.utils.make_disjoint_window, @@ -806,9 +746,7 @@ def loglikelihood_rolling( if self.world_size > 1: # We pad out the external document-level iterator so the inner iterator doesn't hang mytensor = torch.tensor(len(rolling_token_windows), device=self._device) - gathered = ( - self.accelerator.gather(mytensor).cpu().detach().numpy().tolist() - ) + gathered = self.accelerator.gather(mytensor).cpu().detach().numpy().tolist() pad_amnt = max(gathered) - gathered[self.rank] if pad_amnt > 0: @@ -835,15 +773,11 @@ def _batch_scheduler(self, pos, n_reordered_requests): sched = pos // int(len(n_reordered_requests) / self.batch_schedule) if sched in self.batch_sizes: return self.batch_sizes[sched] - if (len(self.batch_sizes) > 1) and ( - self.batch_sizes[sched - 1] == self.max_batch_size - ): + if (len(self.batch_sizes) > 1) and (self.batch_sizes[sched - 1] == self.max_batch_size): # if previous batch size is already maximal, skip recomputation self.batch_sizes[sched] = self.max_batch_size return self.batch_sizes[sched] - print( - f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size" - ) + print(f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size") self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos) print(f"Determined largest batch size: {self.batch_sizes[sched]}") return self.batch_sizes[sched] @@ -882,9 +816,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): requests, sort_fn=_collate, group_by=( - "contexts" - if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM and self.logits_cache - else None + "contexts" if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM and self.logits_cache else None ), group_fn=_lookup_one_token_cont, ) @@ -892,16 +824,10 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): # automatic (variable) batch size detection for vectorization # pull longest context sample from request n_reordered_requests = len(re_ord) - batch_size = ( - self.batch_size - if self.batch_size != "auto" - else override_bs if override_bs is not None else 0 - ) + batch_size = self.batch_size if self.batch_size != "auto" else override_bs if override_bs is not None else 0 batch_fn = ( self._batch_scheduler - if self.batch_size == "auto" - and n_reordered_requests > 0 - and not override_bs + if self.batch_size == "auto" and n_reordered_requests > 0 and not override_bs else None ) @@ -968,17 +894,9 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): conts.append(cont) - padding_len_cont = ( - max(padding_len_cont, contlen) - if padding_len_cont is not None - else contlen - ) + padding_len_cont = max(padding_len_cont, contlen) if padding_len_cont is not None else contlen - padding_len_inp = ( - max(padding_len_inp, inplen) - if padding_len_inp is not None - else inplen - ) + padding_len_inp = max(padding_len_inp, inplen) if padding_len_inp is not None else inplen inps.append(inp) # [1, inp_length] cont_toks_list.append(continuation_enc) @@ -992,9 +910,7 @@ def _lookup_one_token_cont(req: 
Tuple[Tuple[str, str], List[int], List[int]]): ) # [batch, padding_len_inp] elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM: # TODO: left-pad encoder inps and mask? - batched_inps = lm_eval.models.utils.pad_and_concat( - padding_len_inp, inps - ) # [batch, padding_len_inp] + batched_inps = lm_eval.models.utils.pad_and_concat(padding_len_inp, inps) # [batch, padding_len_inp] batched_conts = lm_eval.models.utils.pad_and_concat( padding_len_cont, conts ) # [batch, padding_len_cont] @@ -1040,18 +956,12 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): cont_toks=cont_toks, logits=logits, ): - cont_toks = torch.tensor( - cont_toks, dtype=torch.long, device=self._device - ).unsqueeze( - 0 - ) # [1, seq] + cont_toks = torch.tensor(cont_toks, dtype=torch.long, device=self._device).unsqueeze(0) # [1, seq] max_equal = (greedy_tokens == cont_toks).all() # Obtain log-probs at the corresponding continuation token indices # last_token_slice = logits[:, -1, :].squeeze(0).tolist() - logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze( - -1 - ) # [1, seq] + logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [1, seq] # Answer: (log prob, is-exact-match) answer = (float(logits.sum()), bool(max_equal)) @@ -1065,9 +975,7 @@ def _lookup_one_token_cont(req: Tuple[Tuple[str, str], List[int], List[int]]): return re_ord.get_original(res) - def generate_until( - self, requests: List[lm_eval.api.instance.Instance], disable_tqdm: bool = False - ) -> List[str]: + def generate_until(self, requests: List[lm_eval.api.instance.Instance], disable_tqdm: bool = False) -> List[str]: res = [] def _collate(req: Tuple[str, dict]): @@ -1099,11 +1007,7 @@ def _collate(req: Tuple[str, dict]): if self.batch_size != "auto" else adaptive_batch_size if adaptive_batch_size is not None else 0 ) - batch_fn = ( - self._batch_scheduler - if self.batch_size == "auto" and not adaptive_batch_size - else None - ) + batch_fn = self._batch_scheduler if self.batch_size == "auto" and not adaptive_batch_size else None # we group requests by their generation_kwargs, # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling @@ -1130,13 +1034,9 @@ def _collate(req: Tuple[str, dict]): if isinstance(until, str): until = [kwargs] elif not isinstance(until, list): - raise ValueError( - f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}" - ) + raise ValueError(f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}") else: - raise ValueError( - f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}" - ) + raise ValueError(f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}") # add EOS token to stop sequences eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False) if not until: diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py index 9bc338917..a9845eb41 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/evaluation/utils.py @@ -12,38 +12,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ class LMEvalParser: - def __init__(self, - model="hf", - tasks="lambada_openai", - model_args="", - user_model=None, - tokenizer=None, - num_fewshot=None, - batch_size=1, - max_batch_size=None, - provider=None, - output_path=None, - limit=None, - use_cache=None, - cache_requests=None, - check_integrity=False, - write_out=False, - log_samples=False, - show_config=False, - include_path=None, - gen_kwargs=None, - verbosity="INFO", - wandb_args="", - predict_only=False, - seed=[0, 1234, 1234], - trust_remote_code=False - ): + def __init__( + self, + model="hf", + tasks="lambada_openai", + model_args="", + user_model=None, + tokenizer=None, + num_fewshot=None, + batch_size=1, + max_batch_size=None, + provider=None, + output_path=None, + limit=None, + use_cache=None, + cache_requests=None, + check_integrity=False, + write_out=False, + log_samples=False, + show_config=False, + include_path=None, + gen_kwargs=None, + verbosity="INFO", + wandb_args="", + predict_only=False, + seed=[0, 1234, 1234], + trust_remote_code=False, + ): self.model = model self.tasks = tasks self.model_args = model_args - self.user_model=user_model - self.tokenizer=tokenizer + self.user_model = user_model + self.tokenizer = tokenizer self.num_fewshot = num_fewshot self.batch_size = batch_size self.max_batch_size = max_batch_size diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py index 0dac48ba9..9cafe62d3 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/main.py @@ -15,74 +15,52 @@ # specific language governing permissions and limitations # under the License. 
# pylint:disable=redefined-outer-name,logging-format-interpolation -import os -import onnx -import time +import argparse import json -import random -import torch import logging -import argparse +import os import random -import numpy as np +import time + import datasets +import evaluation +import numpy as np +import onnx import onnxruntime as ort +import torch import transformers -import evaluation +from optimum import onnxruntime as optimum_ort from torch.nn import functional from torch.utils import data -from optimum import onnxruntime as optimum_ort -from onnx_neural_compressor.quantization import matmul_nbits_quantizer -from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import tuning -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import utility - -logger = logging.getLogger(__name__) + +from onnx_neural_compressor import config, data_reader, logger, utility +from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning + logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - level=logging.WARN) - -parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument("--model_path", - type=str, - help="Folder path of pre-trained onnx model") -parser.add_argument( - "--benchmark", - action="store_true", \ - default=False -) -parser.add_argument( - "--tune", - action="store_true", \ - default=False, - help="whether quantize the model" + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.WARN ) -parser.add_argument("--output_model", - type=str, - default=None, - help="output model path") + +parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument("--model_path", type=str, help="Folder path of pre-trained onnx model") +parser.add_argument("--benchmark", action="store_true", default=False) +parser.add_argument("--tune", action="store_true", default=False, help="whether quantize the model") +parser.add_argument("--output_model", type=str, default=None, help="output model path") parser.add_argument( "--batch_size", default=1, type=int, ) -parser.add_argument("--tokenizer", - type=str, - help="pretrained model name or path of tokenizer files", - default="meta-llama/Llama-2-7b-hf") -parser.add_argument("--workspace", - type=str, - help="workspace to save intermediate files", - default="nc_workspace") -parser.add_argument("--algorithm", - type=str, - default="WOQ_TUNE", - choices=["WOQ_TUNE", "RTN", "AWQ", "GPTQ"], - help="weight only algorithm") +parser.add_argument( + "--tokenizer", type=str, help="pretrained model name or path of tokenizer files", default="meta-llama/Llama-2-7b-hf" +) +parser.add_argument("--workspace", type=str, help="workspace to save intermediate files", default="nc_workspace") +parser.add_argument( + "--algorithm", + type=str, + default="WOQ_TUNE", + choices=["WOQ_TUNE", "RTN", "AWQ", "GPTQ"], + help="weight only algorithm", +) parser.add_argument( "--pad_max", default=196, @@ -96,18 +74,22 @@ parser.add_argument( "--tasks", nargs="+", - default=["winogrande", "copa", "piqa", "rte", "hellaswag", "openbookqa", \ - "lambada_openai", "lambada_standard", "wikitext"], + default=[ + "winogrande", + "copa", + "piqa", + "rte", + "hellaswag", + "openbookqa", + "lambada_openai", + "lambada_standard", + "wikitext", + ], type=str, - help="tasks list for 
accuracy validation" + help="tasks list for accuracy validation", ) -parser.add_argument("--dataset", - nargs="?", - default="NeelNanda/pile-10k", - const="NeelNanda/pile-10k") -parser.add_argument('--mode', - type=str, - help="benchmark mode of performance or accuracy") +parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") +parser.add_argument("--mode", type=str, help="benchmark mode of performance or accuracy") parser.add_argument("--intra_op_num_threads", type=int, default=24) parser.add_argument("--trust_remote_code", type=bool, default=False) args = parser.parse_args() @@ -130,7 +112,7 @@ def replace_architectures(json_path): data = json.load(file) data["architectures"] = ["LlamaForCausalLM"] - with open(json_path, 'w') as file: + with open(json_path, "w") as file: json.dump(data, file, indent=4) @@ -145,7 +127,7 @@ def eval_func(model): model="hf", model_args="pretrained=" + model_dir + ",tokenizer=" + args.tokenizer, batch_size=args.batch_size, - tasks=','.join(args.tasks), + tasks=",".join(args.tasks), provider="CPUExecutionProvider", trust_remote_code=args.trust_remote_code, ) @@ -154,12 +136,10 @@ def eval_func(model): eval_acc = 0 for task_name in args.tasks: if task_name == "wikitext": - print("Accuracy for %s is: %s" % - (task_name, results["results"][task_name]["word_perplexity,none"])) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity,none"])) eval_acc += results["results"][task_name]["word_perplexity,none"] else: - print("Accuracy for %s is: %s" % - (task_name, results["results"][task_name]["acc,none"])) + print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc,none"])) eval_acc += results["results"][task_name]["acc,none"] if len(args.tasks) != 0: @@ -173,14 +153,10 @@ def benchmark(model): sess_options.intra_op_num_threads = args.intra_op_num_threads session = optimum_ort.ORTModelForCausalLM.load_model( # pylint: disable=E1123 - os.path.join(model, "model.onnx"), - session_options=sess_options) + os.path.join(model, "model.onnx"), session_options=sess_options + ) inputs_names = session.get_inputs() - key_value_input_names = [ - key.name - for key in inputs_names - if (".key" in key.name) or (".value" in key.name) - ] + key_value_input_names = [key.name for key in inputs_names if (".key" in key.name) or (".value" in key.name)] use_cache = len(key_value_input_names) > 0 model = optimum_ort.ORTModelForCausalLM( @@ -222,19 +198,13 @@ def benchmark(model): class AWQDataloader(data_reader.CalibrationDataReader): - def __init__(self, - model_path, - pad_max=196, - batch_size=1, - sub_folder='train', - calibration_sampling_size=8): + def __init__(self, model_path, pad_max=196, batch_size=1, sub_folder="train", calibration_sampling_size=8): self.encoded_list = [] self.pad_max = pad_max self.batch_size = batch_size dataset = datasets.load_dataset(args.dataset, split=sub_folder) dataset = dataset.map(tokenize_function, batched=True) - dataset.set_format(type="torch", - columns=["input_ids", "attention_mask"]) + dataset.set_format(type="torch", columns=["input_ids", "attention_mask"]) dataloader = data.DataLoader( dataset, batch_size=self.batch_size, @@ -243,9 +213,7 @@ def __init__(self, ) model = onnx.load(model_path, load_external_data=False) inputs_names = [input.name for input in model.graph.input] - key_value_input_names = [ - key for key in inputs_names if (".key" in key) or (".value" in key) - ] + key_value_input_names = [key for key in inputs_names if (".key" 
in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 self.batch_size = batch_size @@ -253,20 +221,16 @@ def __init__(self, if idx + 1 > calibration_sampling_size: break ort_input = {} - ort_input["input_ids"] = input_ids[:, :-1].detach().cpu().numpy( - ).astype("int64") - ort_input["attention_mask"] = attention_mask[:, :-1].detach().cpu( - ).numpy().astype("int64") + ort_input["input_ids"] = input_ids[:, :-1].detach().cpu().numpy().astype("int64") + ort_input["attention_mask"] = attention_mask[:, :-1].detach().cpu().numpy().astype("int64") position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) - ort_input["position_ids"] = position_ids[:, :-1].detach().cpu( - ).numpy().astype("int64") + ort_input["position_ids"] = position_ids[:, :-1].detach().cpu().numpy().astype("int64") if use_cache: # Create dummy past_key_values for decoder num_attention_heads = model_config.num_key_value_heads embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads - shape = (self.batch_size, num_attention_heads, 0, - embed_size_per_head) + shape = (self.batch_size, num_attention_heads, 0, embed_size_per_head) key_or_value = np.zeros(shape, dtype=np.float32) for key_value_input_name in key_value_input_names: ort_input[key_value_input_name] = key_or_value @@ -287,8 +251,7 @@ def collate_batch(self, batch): attention_mask = functional.pad(attention_mask, (0, pad_len), value=0) input_ids_padded.append(input_ids) attention_mask_padded.append(attention_mask) - return torch.vstack(input_ids_padded), torch.vstack( - attention_mask_padded) + return torch.vstack(input_ids_padded), torch.vstack(attention_mask_padded) def get_next(self): return next(self.iter_next, None) @@ -299,12 +262,7 @@ def rewind(self): class GPTQDataloader(data_reader.CalibrationDataReader): - def __init__(self, - model_path, - batch_size=1, - seqlen=2048, - sub_folder="train", - calibration_sampling_size=8): + def __init__(self, model_path, batch_size=1, seqlen=2048, sub_folder="train", calibration_sampling_size=8): # large `calibration_sampling_size` may result in long GPTQ running time # recommend to use smaller `calibration_sampling_size` value random.seed(0) @@ -313,14 +271,11 @@ def __init__(self, self.batch_size = batch_size traindata = datasets.load_dataset(args.dataset, split=sub_folder) traindata = traindata.map(tokenize_function, batched=True) - traindata.set_format(type="torch", - columns=["input_ids", "attention_mask"]) + traindata.set_format(type="torch", columns=["input_ids", "attention_mask"]) session = ort.InferenceSession(model_path) inputs_names = [input.name for input in session.get_inputs()] - key_value_input_names = [ - key for key in inputs_names if (".key" in key) or (".value" in key) - ] + key_value_input_names = [key for key in inputs_names if (".key" in key) or (".value" in key)] use_cache = len(key_value_input_names) > 0 for i in range(calibration_sampling_size): @@ -336,19 +291,15 @@ def __init__(self, ort_input = {} ort_input["input_ids"] = inp.detach().cpu().numpy().astype("int64") - ort_input["attention_mask"] = mask.detach().cpu().numpy().astype( - "int64") + ort_input["attention_mask"] = mask.detach().cpu().numpy().astype("int64") input_shape = ort_input["input_ids"].shape - position_ids = torch.arange(0, input_shape[-1], - dtype=torch.long).unsqueeze(0).view( - -1, input_shape[-1]) + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) ort_input["position_ids"] = 
position_ids.numpy() if use_cache: # create dummy past_key_values for decoder first generation step num_attention_heads = model_config.num_key_value_heads embed_size_per_head = model_config.hidden_size // model_config.num_attention_heads - shape = (self.batch_size, num_attention_heads, 0, - embed_size_per_head) + shape = (self.batch_size, num_attention_heads, 0, embed_size_per_head) key_or_value = np.zeros(shape, dtype=np.float32) for key_value_input_name in key_value_input_names: ort_input[key_value_input_name] = key_or_value @@ -369,9 +320,9 @@ def rewind(self): os.mkdir(args.workspace) if args.benchmark: - if args.mode == 'performance': + if args.mode == "performance": benchmark(args.model_path) - elif args.mode == 'accuracy': + elif args.mode == "accuracy": acc_result = eval_func(args.model_path) print("Batch size = %d" % args.batch_size) print("Accuracy: %.5f" % acc_result) @@ -384,17 +335,12 @@ def rewind(self): logger.info("Start graph optimization...") sess_options = ort.SessionOptions() sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED - sess_options.optimized_model_filepath = os.path.join( - args.workspace, "Optimized_model.onnx") + sess_options.optimized_model_filepath = os.path.join(args.workspace, "Optimized_model.onnx") sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_file_name", - "Optimized_model.onnx_data") - sess_options.add_session_config_entry( - "session.optimized_model_external_initializers_min_size_in_bytes", - "1024") - sess = ort.InferenceSession(model_path, - sess_options, - providers=["CPUExecutionProvider"]) + "session.optimized_model_external_initializers_file_name", "Optimized_model.onnx_data" + ) + sess_options.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "1024") + sess = ort.InferenceSession(model_path, sess_options, providers=["CPUExecutionProvider"]) logger.info("Graph optimization done.") best_model = None @@ -411,12 +357,10 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "AWQ": - calibration_data_reader = AWQDataloader(model_path, - pad_max=args.pad_max, - batch_size=1) + calibration_data_reader = AWQDataloader(model_path, pad_max=args.pad_max, batch_size=1) algo_config = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader, - enable_mse_search=False) + calibration_data_reader=calibration_data_reader, enable_mse_search=False + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( sess_options.optimized_model_filepath, n_bits=4, @@ -428,11 +372,10 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "GPTQ": - calibration_data_reader = GPTQDataloader(model_path, - seqlen=args.seqlen, - batch_size=1) + calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) algo_config = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig( - calibration_data_reader=calibration_data_reader,) + calibration_data_reader=calibration_data_reader, + ) quant = matmul_nbits_quantizer.MatMulNBitsQuantizer( sess_options.optimized_model_filepath, n_bits=4, @@ -444,12 +387,9 @@ def rewind(self): best_model = quant.model elif args.algorithm.upper() == "WOQ_TUNE": - calibration_data_reader = GPTQDataloader(model_path, - seqlen=args.seqlen, - batch_size=1) + calibration_data_reader = GPTQDataloader(model_path, seqlen=args.seqlen, batch_size=1) # set tolerable_loss to 0.5% for test, default is 1% - custom_tune_config = tuning.TuningConfig( - 
config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) + custom_tune_config = tuning.TuningConfig(config_set=config.get_woq_tuning_config(), tolerable_loss=0.005) best_model = tuning.autotune( model_input=model_path, tune_config=custom_tune_config, @@ -463,5 +403,4 @@ def rewind(self): os.path.join(args.output_model, model_name), save_as_external_data=True, ) - model_config.to_json_file(os.path.join(args.output_model, "config.json"), - use_diff=False) + model_config.to_json_file(os.path.join(args.output_model, "config.json"), use_diff=False) diff --git a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py index 188f02a5b..3af820943 100644 --- a/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py +++ b/examples/nlp/huggingface_model/text_generation/llama/quantization/weight_only/prepare_model.py @@ -1,6 +1,7 @@ import argparse import os import subprocess + import optimum.version from packaging import version @@ -16,7 +17,8 @@ def parse_arguments(): type=str, required=False, default="text-generation-with-past", - choices=["text-generation-with-past", "text-generation"]) + choices=["text-generation-with-past", "text-generation"], + ) return parser.parse_args() diff --git a/onnx_neural_compressor/algorithms/layer_wise/core.py b/onnx_neural_compressor/algorithms/layer_wise/core.py index 1b5cb680e..2e381cfdb 100644 --- a/onnx_neural_compressor/algorithms/layer_wise/core.py +++ b/onnx_neural_compressor/algorithms/layer_wise/core.py @@ -23,10 +23,8 @@ import onnx import onnxruntime as ort -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility + +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from typing import Callable, List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/smoother/calibrator.py b/onnx_neural_compressor/algorithms/smoother/calibrator.py index 7fddd2cc9..fe0a862cc 100644 --- a/onnx_neural_compressor/algorithms/smoother/calibrator.py +++ b/onnx_neural_compressor/algorithms/smoother/calibrator.py @@ -22,10 +22,8 @@ import numpy as np import onnx import onnxruntime -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility + +from onnx_neural_compressor import data_reader, logger, onnx_model, utility class Calibrator: diff --git a/onnx_neural_compressor/algorithms/smoother/core.py b/onnx_neural_compressor/algorithms/smoother/core.py index d30f78003..d21641482 100644 --- a/onnx_neural_compressor/algorithms/smoother/core.py +++ b/onnx_neural_compressor/algorithms/smoother/core.py @@ -20,10 +20,8 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility + +from onnx_neural_compressor import data_reader, logger, onnx_model, utility from onnx_neural_compressor.algorithms.smoother import calibrator from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/awq.py b/onnx_neural_compressor/algorithms/weight_only/awq.py index 6ee1f7c9c..30d9e8442 100644 --- 
a/onnx_neural_compressor/algorithms/weight_only/awq.py +++ b/onnx_neural_compressor/algorithms/weight_only/awq.py @@ -22,15 +22,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility +from packaging import version + +from onnx_neural_compressor import config, constants, data_reader, logger, onnx_model, utility from onnx_neural_compressor.algorithms.weight_only import rtn from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging import version from typing import List, Union # isort: skip @@ -66,8 +62,9 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, weight = [] org_out = [] for node in nodes: - if (node.name, node.op_type) in weight_config and \ - weight_config.get((node.name, node.op_type), "fp32") != "fp32": + if (node.name, node.op_type) in weight_config and weight_config.get( + (node.name, node.op_type), "fp32" + ) != "fp32": num_bits = weight_config[(node.name, node.op_type)].get("weight_bits", 4) group_size = weight_config[(node.name, node.op_type)].get("weight_group_size", 32) scheme = "sym" if weight_config[(node.name, node.op_type)].get("weight_sym", True) else "asym" @@ -128,7 +125,7 @@ def _apply_awq_scale(model, weight_config, absorb_pairs, output_dicts, num_bits, for node in nodes: weight_config.setdefault((node.name, node.op_type), {}).update({"weight_bits": num_bits}) weight_config.setdefault((node.name, node.op_type), {}).update({"weight_group_size": group_size}) - weight_config.setdefault((node.name, node.op_type), {}).update({"weight_sym": scheme=="sym"}) + weight_config.setdefault((node.name, node.op_type), {}).update({"weight_sym": scheme == "sym"}) init_share_num = model.get_initializer_share_num(node.input[1]) weight_tensor = model.get_initializer(node.input[1]) diff --git a/onnx_neural_compressor/algorithms/weight_only/gptq.py b/onnx_neural_compressor/algorithms/weight_only/gptq.py index 07cc4cd1f..5016a2780 100644 --- a/onnx_neural_compressor/algorithms/weight_only/gptq.py +++ b/onnx_neural_compressor/algorithms/weight_only/gptq.py @@ -22,14 +22,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility +from packaging.version import Version + +from onnx_neural_compressor import config, constants, data_reader, onnx_model, utility from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging.version import Version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/rtn.py b/onnx_neural_compressor/algorithms/weight_only/rtn.py index 8deb39f14..619c055e1 100644 --- a/onnx_neural_compressor/algorithms/weight_only/rtn.py +++ b/onnx_neural_compressor/algorithms/weight_only/rtn.py @@ -24,13 +24,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility +from packaging 
import version + +from onnx_neural_compressor import config, constants, onnx_model, utility from onnx_neural_compressor.algorithms.layer_wise import core from onnx_neural_compressor.algorithms.weight_only import utility as woq_utility -from packaging import version from typing import List, Union # isort: skip diff --git a/onnx_neural_compressor/algorithms/weight_only/utility.py b/onnx_neural_compressor/algorithms/weight_only/utility.py index 6fee4cfc4..ddb5f990d 100644 --- a/onnx_neural_compressor/algorithms/weight_only/utility.py +++ b/onnx_neural_compressor/algorithms/weight_only/utility.py @@ -25,10 +25,10 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import constants -from onnx_neural_compressor import utility from packaging import version +from onnx_neural_compressor import constants, utility + if sys.version_info < (3, 11) and util.find_spec("onnxruntime_extensions"): # pragma: no cover import onnxruntime_extensions @@ -119,8 +119,8 @@ def make_matmul_weight_only_node( even_idx = idx[::2] odd_idx = idx[1::2] # vectorized operation for even and odd indices - packed_zp[even_idx // 2] = ((packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel()) - packed_zp[odd_idx // 2] = ((packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4)) + packed_zp[even_idx // 2] = (packed_zp[even_idx // 2] & 0xF0) | zero_point[even_idx].ravel() + packed_zp[odd_idx // 2] = (packed_zp[odd_idx // 2] & 0x0F) | (zero_point[odd_idx].ravel() << 4) zp_tensor = onnx.helper.make_tensor( name=node.input[1] + "_zp", data_type=2, dims=packed_zp.shape, vals=packed_zp.tobytes(), raw=True @@ -282,7 +282,7 @@ def quant_tensor( max_range = np.maximum(np.abs(rmin), np.abs(rmax)) scale = np.ones(rmax.shape) - mask = (max_range > 0) + mask = max_range > 0 scale[mask] = (max_range[mask] * 2.0).astype(np.float64) / (maxq - minq) zero_point = ( np.zeros(scale.shape) if dtype == "int" else np.ones(rmax.shape, dtype="uint8") * (1 << (num_bits - 1)) @@ -305,6 +305,7 @@ def quant_tensor( return q_weight, scale, zero_point + def qdq_tensor( data: np.array, num_bits: int = 4, diff --git a/onnx_neural_compressor/config.py b/onnx_neural_compressor/config.py index 61ab8fc67..b6fad923a 100644 --- a/onnx_neural_compressor/config.py +++ b/onnx_neural_compressor/config.py @@ -23,19 +23,16 @@ import json import pathlib import re -from abc import ABC -from abc import abstractmethod +from abc import ABC, abstractmethod import numpy as np import onnx import pydantic -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility from onnxruntime import quantization from typing_extensions import Self +from onnx_neural_compressor import constants, data_reader, logger, utility + from collections import OrderedDict # isort: skip from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple, Type, Union, _GenericAlias # isort: skip @@ -1239,4 +1236,4 @@ def generate_nc_sq_config(quant_config: quantization.StaticQuantConfig): quant_config.extra_options["SmoothQuant"] = False quant_config_dict = quant_config.to_dict() nc_sq_config = SmoothQuantConfig(**quant_kwargs, **quant_config_dict) - return nc_sq_config \ No newline at end of file + return nc_sq_config diff --git a/onnx_neural_compressor/onnx_model.py b/onnx_neural_compressor/onnx_model.py index 5e8921bd2..061f7cad8 100644 --- a/onnx_neural_compressor/onnx_model.py +++ b/onnx_neural_compressor/onnx_model.py @@ 
-21,11 +21,10 @@ import onnx import transformers -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility from onnxruntime.quantization import onnx_model +from onnx_neural_compressor import constants, logger, utility + class ONNXModel(onnx_model.ONNXModel): """Build ONNX model.""" diff --git a/onnx_neural_compressor/quantization/__init__.py b/onnx_neural_compressor/quantization/__init__.py index 7245f8724..7ef91659a 100644 --- a/onnx_neural_compressor/quantization/__init__.py +++ b/onnx_neural_compressor/quantization/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. -from onnxruntime.quantization.quant_utils import QuantFormat -from onnxruntime.quantization.quant_utils import QuantType +from onnxruntime.quantization.quant_utils import QuantFormat, QuantType + from onnx_neural_compressor.quantization.quantize import quantize diff --git a/onnx_neural_compressor/quantization/algorithm_entry.py b/onnx_neural_compressor/quantization/algorithm_entry.py index 982ea3a14..cd079932c 100644 --- a/onnx_neural_compressor/quantization/algorithm_entry.py +++ b/onnx_neural_compressor/quantization/algorithm_entry.py @@ -17,17 +17,12 @@ from typing import Union import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility -from onnx_neural_compressor.algorithms.smoother import core -from onnx_neural_compressor.algorithms.weight_only import awq -from onnx_neural_compressor.algorithms.weight_only import gptq -from onnx_neural_compressor.algorithms.weight_only import rtn from onnxruntime import quantization +from onnx_neural_compressor import config, constants, data_reader, logger, utility +from onnx_neural_compressor.algorithms.smoother import core +from onnx_neural_compressor.algorithms.weight_only import awq, gptq, rtn + ###################### SmoothQuant Entry ################################## @utility.register_algo(name=constants.SMOOTH_QUANT) diff --git a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py index ea1cf62a9..62a671fba 100644 --- a/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_4bits_quantizer.py @@ -15,9 +15,10 @@ from typing import List, Union # isort: skip import onnx -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from onnxruntime.quantization import matmul_4bits_quantizer +from onnx_neural_compressor.quantization import matmul_nbits_quantizer + RTNWeightOnlyQuantConfig = matmul_nbits_quantizer.RTNWeightOnlyQuantConfig AWQWeightOnlyQuantConfig = matmul_nbits_quantizer.AWQWeightOnlyQuantConfig GPTQWeightOnlyQuantConfig = matmul_nbits_quantizer.GPTQWeightOnlyQuantConfig diff --git a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py index 1b6b3f1c7..0d00bbbc5 100644 --- a/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py +++ b/onnx_neural_compressor/quantization/matmul_nbits_quantizer.py @@ -15,14 +15,11 @@ from typing import List, Union # isort: skip import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import onnx_model -from onnx_neural_compressor import utility 
-from onnx_neural_compressor.quantization import algorithm_entry as algos from onnxruntime.quantization import matmul_4bits_quantizer +from onnx_neural_compressor import config, data_reader, logger, onnx_model, utility +from onnx_neural_compressor.quantization import algorithm_entry as algos + class RTNWeightOnlyQuantConfig(matmul_4bits_quantizer.RTNWeightOnlyQuantConfig): diff --git a/onnx_neural_compressor/quantization/quantize.py b/onnx_neural_compressor/quantization/quantize.py index f586655dc..7e388e3aa 100644 --- a/onnx_neural_compressor/quantization/quantize.py +++ b/onnx_neural_compressor/quantization/quantize.py @@ -16,9 +16,10 @@ from typing import Union import onnx +from onnxruntime.quantization.quantize import QuantConfig + from onnx_neural_compressor import config from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnxruntime.quantization.quantize import QuantConfig # ORT-like user-facing API diff --git a/onnx_neural_compressor/quantization/tuning.py b/onnx_neural_compressor/quantization/tuning.py index 91e7eae14..a6743ad7a 100644 --- a/onnx_neural_compressor/quantization/tuning.py +++ b/onnx_neural_compressor/quantization/tuning.py @@ -19,10 +19,8 @@ import uuid import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility + +from onnx_neural_compressor import config, data_reader, logger, utility from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Sized, Tuple, Union # isort: skip diff --git a/onnx_neural_compressor/utility.py b/onnx_neural_compressor/utility.py index 0cb7b1335..cc36b6e8a 100644 --- a/onnx_neural_compressor/utility.py +++ b/onnx_neural_compressor/utility.py @@ -23,10 +23,10 @@ import numpy as np import onnx import psutil -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger from onnxruntime.quantization import onnx_model +from onnx_neural_compressor import constants, logger + from typing import Callable, Dict, List, Tuple, Union # isort: skip # Dictionary to store a mapping between algorithm names and corresponding algo implementation(function) diff --git a/pyproject.toml b/pyproject.toml index 06b02dfe1..9d46c3db1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,11 @@ [tool.isort] profile = "black" line_length = 120 -known_first_party = ["neural_compressor"] extend_skip_glob = ["**/__init__.py"] -force_single_line = true - [tool.black] line-length = 120 - [tool.codespell] skip = '*.po,*.ts,*.js,*.map,*.js.map,*.css.map,.azure-pipelines/scripts/codeScan/codespell/inc_dict.txt' count = '' diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt new file mode 100644 index 000000000..e37a0353a --- /dev/null +++ b/requirements-lintrunner.txt @@ -0,0 +1,4 @@ +lintrunner_adapters +ruff==0.4.5 +black==24.3.0 +isort==5.13.2 diff --git a/setup.py b/setup.py index cdc3d0479..c80178535 100644 --- a/setup.py +++ b/setup.py @@ -49,8 +49,16 @@ def get_build_version(): url="", packages=setuptools.find_packages(), include_package_data=True, - install_requires=["onnx", "onnxruntime", "onnxruntime-extensions", "psutil", "numpy", - "py-cpuinfo", "pydantic", "transformers"], + install_requires=[ + "onnx", + "onnxruntime", + "onnxruntime-extensions", + "psutil", + "numpy", + "py-cpuinfo", + "pydantic", + "transformers", + ], python_requires=">=3.8.0", classifiers=[ "Intended Audience :: Science/Research", diff --git 
a/test/quantization/layer_wise/test_layer_wise.py b/test/quantization/layer_wise/test_layer_wise.py index 994387eb4..af0bca3e4 100644 --- a/test/quantization/layer_wise/test_layer_wise.py +++ b/test/quantization/layer_wise/test_layer_wise.py @@ -8,12 +8,11 @@ import onnxruntime.tools.symbolic_shape_infer as symbolic_shape_infer import torch import transformers -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import config, data_reader, logger from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from optimum.exporters.onnx import main_export def find_onnx_file(folder_path): diff --git a/test/quantization/test_autotune.py b/test/quantization/test_autotune.py index 031b8369e..0e86c64b9 100644 --- a/test/quantization/test_autotune.py +++ b/test/quantization/test_autotune.py @@ -24,11 +24,11 @@ import numpy as np import onnx import onnxruntime as ort -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor.quantization import tuning from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, data_reader +from onnx_neural_compressor.quantization import tuning + from typing import Callable, Dict, List, Optional, Union # isort: skip diff --git a/test/quantization/test_config.py b/test/quantization/test_config.py index a7e142978..50ffc74d0 100644 --- a/test/quantization/test_config.py +++ b/test/quantization/test_config.py @@ -5,12 +5,11 @@ import numpy as np import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor import utility -from onnx_neural_compressor.quantization import algorithm_entry as algos from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, logger, utility +from onnx_neural_compressor.quantization import algorithm_entry as algos + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/quantization/test_smooth_quant.py b/test/quantization/test_smooth_quant.py index 56962af85..fed59e142 100644 --- a/test/quantization/test_smooth_quant.py +++ b/test/quantization/test_smooth_quant.py @@ -21,12 +21,12 @@ import numpy as np import onnx -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader +from optimum.exporters.onnx import main_export + +from onnx_neural_compressor import config, data_reader from onnx_neural_compressor.quantization import QuantType from onnx_neural_compressor.quantization import algorithm_entry as algos from onnx_neural_compressor.quantization import quantize -from optimum.exporters.onnx import main_export class DataReader(data_reader.CalibrationDataReader): diff --git a/test/quantization/weight_only/test_awq.py b/test/quantization/weight_only/test_awq.py index 82a003791..2d918cc61 100644 --- a/test/quantization/weight_only/test_awq.py +++ b/test/quantization/weight_only/test_awq.py @@ -6,14 +6,12 @@ import torch import transformers -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from 
onnx_neural_compressor.quantization import matmul_nbits_quantizer from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/quantization/weight_only/test_gptq.py b/test/quantization/weight_only/test_gptq.py index cc5df2cf9..133e11fd1 100644 --- a/test/quantization/weight_only/test_gptq.py +++ b/test/quantization/weight_only/test_gptq.py @@ -6,14 +6,12 @@ import torch import transformers -from onnx_neural_compressor import config -from onnx_neural_compressor import data_reader -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, data_reader, logger +from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/quantization/weight_only/test_rtn.py b/test/quantization/weight_only/test_rtn.py index 7f75edc41..86b3c49a3 100644 --- a/test/quantization/weight_only/test_rtn.py +++ b/test/quantization/weight_only/test_rtn.py @@ -4,13 +4,12 @@ import shutil import unittest -from onnx_neural_compressor import config -from onnx_neural_compressor import logger -from onnx_neural_compressor.quantization import algorithm_entry as algos -from onnx_neural_compressor.quantization import matmul_4bits_quantizer -from onnx_neural_compressor.quantization import matmul_nbits_quantizer from optimum.exporters.onnx import main_export +from onnx_neural_compressor import config, logger +from onnx_neural_compressor.quantization import algorithm_entry as algos +from onnx_neural_compressor.quantization import matmul_4bits_quantizer, matmul_nbits_quantizer + def find_onnx_file(folder_path): # return first .onnx file path in folder_path diff --git a/test/utils/test_general.py b/test/utils/test_general.py index e1c89b142..d24392438 100644 --- a/test/utils/test_general.py +++ b/test/utils/test_general.py @@ -2,9 +2,7 @@ import unittest -from onnx_neural_compressor import config -from onnx_neural_compressor import constants -from onnx_neural_compressor import logger +from onnx_neural_compressor import config, constants, logger from onnx_neural_compressor.quantization import tuning from typing import Any, Callable, List, Optional, Tuple, Union # isort: skip
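
The import rewrites in the hunks above all converge on the layout that isort produces with the `black` profile and 120-column limit configured in `pyproject.toml`, now that `force_single_line` has been dropped so names from the same module are combined onto one line. The snippet below is an illustrative sketch only, not part of the patch; the module names are taken from the hunks above and it assumes those packages are installed.

```python
# Sketch of the import grouping enforced by isort (profile = "black", line_length = 120):
# standard-library imports first, then third-party, then first-party,
# each group separated by a blank line, with same-module names combined.
import json
import os

import numpy as np
import onnx
import onnxruntime as ort

from onnx_neural_compressor import config, data_reader, logger, utility
from onnx_neural_compressor.quantization import matmul_nbits_quantizer, tuning
```

With `known_first_party` removed from `pyproject.toml`, isort falls back to its own first-party detection, which appears sufficient here for `onnx_neural_compressor` imports to land in the final group, while `optimum`, `torch`, and other installed dependencies stay in the third-party block.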