From 73dc12cac5a7587ee5ca0085d9053a7b81ca8cfa Mon Sep 17 00:00:00 2001
From: xin3he
Date: Thu, 18 Jan 2024 16:15:28 +0800
Subject: [PATCH 1/3] Rename RTNWeightQuantConfig to RTNConfig

Signed-off-by: xin3he
---
 .../quantization/llm/run_clm_no_trainer.py | 8 +-
 neural_compressor/common/utility.py | 2 +-
 neural_compressor/tensorflow/utils.py | 2 +-
 neural_compressor/torch/__init__.py | 2 +-
 .../torch/algorithms/weight_only/rtn.py | 4 +-
 .../torch/algorithms/weight_only_algos.py | 8 +-
 neural_compressor/torch/autotune.py | 4 +-
 .../torch/quantization/__init__.py | 2 +-
 .../torch/quantization/autotune.py | 67 +++++++++++++
 .../torch/quantization/config.py | 24 ++---
 neural_compressor/torch/utils/utility.py | 2 +-
 .../quantization/weight_only/test_rtn.py | 18 ++--
 test/3x/torch/test_autotune.py | 12 +--
 test/3x/torch/test_config.py | 96 +++++++++----------
 14 files changed, 155 insertions(+), 96 deletions(-)
 create mode 100644 neural_compressor/torch/quantization/autotune.py

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 06818699f5a..8235594a8d2 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -230,7 +230,7 @@ def get_user_model():
 # 3.x api
 if args.approach == 'weight_only':
- from neural_compressor.torch import RTNWeightQuantConfig, GPTQConfig, quantize
+ from neural_compressor.torch import RTNConfig, GPTQConfig, quantize
 from neural_compressor.torch.utils.utility import get_double_quant_config
 weight_sym = True if args.woq_scheme == "sym" else False
 double_quant_config_dict = get_double_quant_config(args.double_quant_type, weight_sym=weight_sym)
@@ -243,9 +243,9 @@ def get_user_model():
 "enable_mse_search": args.woq_enable_mse_search,
 }
 )
- quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict)
+ quant_config = RTNConfig.from_dict(double_quant_config_dict)
 else:
- quant_config = RTNWeightQuantConfig(
+ quant_config = RTNConfig(
 weight_dtype=args.woq_dtype,
 weight_bits=args.woq_bits,
 weight_group_size=args.woq_group_size,
@@ -257,7 +257,7 @@ def get_user_model():
 double_quant_sym=args.double_quant_sym,
 double_quant_group_size=args.double_quant_group_size,
 )
- quant_config.set_local("lm_head", RTNWeightQuantConfig(weight_dtype="fp32"))
+ quant_config.set_local("lm_head", RTNConfig(weight_dtype="fp32"))
 user_model = quantize(
 model=user_model, quant_config=quant_config
 )
diff --git a/neural_compressor/common/utility.py b/neural_compressor/common/utility.py
index 7761a173d7d..42f6e445b9a 100644
--- a/neural_compressor/common/utility.py
+++ b/neural_compressor/common/utility.py
@@ -27,7 +27,7 @@
 # config name
 BASE_CONFIG = "base_config"
 COMPOSABLE_CONFIG = "composable_config"
-RTN_WEIGHT_ONLY_QUANT = "rtn_weight_only_quant"
+RTN = "rtn"
 STATIC_QUANT = "static_quant"
 GPTQ = "gptq"
 FP8_QUANT = "fp8_quant"
diff --git a/neural_compressor/tensorflow/utils.py b/neural_compressor/tensorflow/utils.py
index 6f65f79fbc1..4497c1e9a7a 100644
--- a/neural_compressor/tensorflow/utils.py
+++ b/neural_compressor/tensorflow/utils.py
@@ -35,7 +35,7 @@ def register_algo(name):
 Usage example:
 @register_algo(name=example_algo)
- def example_algo(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module:
+ def 
example_algo(model: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module: ... Args: name (str): The name under which the algorithm function will be registered. diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index c50e60103ea..46f4082de67 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -17,7 +17,7 @@ from neural_compressor.torch.quantization import ( quantize, - RTNWeightQuantConfig, + RTNConfig, get_default_rtn_config, GPTQConfig, get_default_gptq_config, diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 1f5949946c3..8d5e57c4f7f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -580,10 +580,10 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1, dtype="int"): return int_weight -from neural_compressor.torch.quantization.config import RTNWeightQuantConfig +from neural_compressor.torch.quantization.config import RTNConfig -def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: +def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module: # TODO (Yi) remove it enable_full_range = quant_config.enable_full_range enable_mse_search = quant_config.enable_mse_search diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index e3cef82d213..368f00c74b6 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -18,17 +18,17 @@ import torch from neural_compressor.common.logger import Logger -from neural_compressor.common.utility import GPTQ, RTN_WEIGHT_ONLY_QUANT -from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig +from neural_compressor.common.utility import GPTQ, RTN +from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig from neural_compressor.torch.utils.utility import fetch_module, register_algo, set_module logger = Logger().get_logger() ###################### RTN Algo Entry ################################## -@register_algo(name=RTN_WEIGHT_ONLY_QUANT) +@register_algo(name=RTN) def rtn_quantize_entry( - model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNWeightQuantConfig], *args, **kwargs + model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs ) -> torch.nn.Module: """The main entry to apply rtn quantization.""" from .weight_only.rtn import apply_rtn_on_single_module diff --git a/neural_compressor/torch/autotune.py b/neural_compressor/torch/autotune.py index cded26ebc48..73dda619d22 100644 --- a/neural_compressor/torch/autotune.py +++ b/neural_compressor/torch/autotune.py @@ -20,7 +20,7 @@ from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning from neural_compressor.common.logger import Logger from neural_compressor.torch import quantize -from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig +from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig logger = Logger().get_logger() @@ -33,7 +33,7 @@ def get_default_tune_config() -> TuningConfig: # TODO use the registered default tuning config in the next PR - return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), 
RTNWeightQuantConfig(weight_bits=[4, 8])]) + return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])]) def autotune( diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py index d54393a21fc..c78a7b0552e 100644 --- a/neural_compressor/torch/quantization/__init__.py +++ b/neural_compressor/torch/quantization/__init__.py @@ -14,7 +14,7 @@ from neural_compressor.torch.quantization.quantize import quantize, quantize_dynamic from neural_compressor.torch.quantization.config import ( - RTNWeightQuantConfig, + RTNConfig, get_default_rtn_config, GPTQConfig, get_default_gptq_config, diff --git a/neural_compressor/torch/quantization/autotune.py b/neural_compressor/torch/quantization/autotune.py new file mode 100644 index 00000000000..73dda619d22 --- /dev/null +++ b/neural_compressor/torch/quantization/autotune.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Union + +import torch + +from neural_compressor.common.base_config import BaseConfig +from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning +from neural_compressor.common.logger import Logger +from neural_compressor.torch import quantize +from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig + +logger = Logger().get_logger() + + +__all__ = [ + "get_default_tune_config", + "autotune", +] + + +def get_default_tune_config() -> TuningConfig: + # TODO use the registered default tuning config in the next PR + return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])]) + + +def autotune( + model: torch.nn.Module, + tune_config: TuningConfig, + eval_fns: Optional[Union[Dict, List[Dict]]] = None, + run_fn=None, + run_args=None, +) -> Optional[torch.nn.Module]: + """The main entry of auto-tune.""" + best_quant_model = None + evaluator.set_eval_fn_registry(eval_fns) + evaluator.self_check() + config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) + tuning_logger.tuning_start() + for trial_index, quant_config in enumerate(config_loader): + tuning_logger.trial_start(trial_index=trial_index) + tuning_logger.quantization_start() + q_model = quantize(model, quant_config=quant_config, run_fn=run_fn, run_args=run_args) + tuning_logger.quantization_end() + tuning_logger.evaluation_start() + eval_result: float = evaluator.evaluate(q_model) + tuning_logger.evaluation_end() + tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) + if tuning_monitor.need_stop(): + best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() + quantize(model, quant_config=best_quant_config, run_fn=run_fn, run_args=run_args, inplace=True) + best_quant_model = model # quantize model inplace + tuning_logger.trial_end(trial_index) + tuning_logger.tuning_end() + return best_quant_model diff --git a/neural_compressor/torch/quantization/config.py 
b/neural_compressor/torch/quantization/config.py index 11dc239d050..07dc85db5e1 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -24,13 +24,7 @@ import torch from neural_compressor.common.base_config import BaseConfig, config_registry, register_config -from neural_compressor.common.utility import ( - DEFAULT_WHITE_LIST, - FP8_QUANT, - GPTQ, - OP_NAME_OR_MODULE_TYPE, - RTN_WEIGHT_ONLY_QUANT, -) +from neural_compressor.common.utility import DEFAULT_WHITE_LIST, FP8_QUANT, GPTQ, OP_NAME_OR_MODULE_TYPE, RTN from neural_compressor.torch.utils.constants import PRIORITY_GPTQ, PRIORITY_RTN from neural_compressor.torch.utils.utility import is_hpex_avaliable, logger @@ -60,8 +54,8 @@ class OperatorConfig(NamedTuple): ######################## RNT Config ############################### -@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN_WEIGHT_ONLY_QUANT, priority=PRIORITY_RTN) -class RTNWeightQuantConfig(BaseConfig): +@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN, priority=PRIORITY_RTN) +class RTNConfig(BaseConfig): """Config class for round-to-nearest weight-only quantization.""" supported_configs: List[OperatorConfig] = [] @@ -80,7 +74,7 @@ class RTNWeightQuantConfig(BaseConfig): "double_quant_sym", "double_quant_group_size", ] - name = RTN_WEIGHT_ONLY_QUANT + name = RTN def __init__( self, @@ -137,12 +131,12 @@ def to_dict(self): @classmethod def from_dict(cls, config_dict): - return super(RTNWeightQuantConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator) + return super(RTNConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator) @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: supported_configs = [] - linear_rtn_config = RTNWeightQuantConfig( + linear_rtn_config = RTNConfig( weight_dtype=["int", "int8", "int4", "nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"], weight_bits=[4, 1, 2, 3, 5, 6, 7, 8], weight_group_size=[32, -1, 1, 4, 8, 16, 64, 128, 256, 512, 1024], @@ -173,16 +167,16 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: # TODO(Yi) run `register_supported_configs` for all registered config. -RTNWeightQuantConfig.register_supported_configs() +RTNConfig.register_supported_configs() -def get_default_rtn_config() -> RTNWeightQuantConfig: +def get_default_rtn_config() -> RTNConfig: """Generate the default rtn config. Returns: the default rtn config. """ - return RTNWeightQuantConfig() + return RTNConfig() ######################## GPTQ Config ############################### diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py index b1748d059eb..6abde03318d 100644 --- a/neural_compressor/torch/utils/utility.py +++ b/neural_compressor/torch/utils/utility.py @@ -33,7 +33,7 @@ def register_algo(name): Usage example: @register_algo(name=example_algo) - def example_algo(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: + def example_algo(model: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module: ... 
Args: diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 00159b55828..a53671c8bb7 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -60,7 +60,7 @@ def _apply_rtn(self, quant_config): return qmodel def test_rtn(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig # some tests were skipped to accelerate the CI rnt_options = { @@ -76,7 +76,7 @@ def test_rtn(self): } from itertools import product - keys = RTNWeightQuantConfig.params_list + keys = RTNConfig.params_list for value in product(*rnt_options.values()): d = dict(zip(keys, value)) if (d["weight_dtype"] == "int" and d["weight_bits"] != 8) or ( @@ -85,26 +85,26 @@ def test_rtn(self): or (d["return_int"] and (d["group_dim"] != 1 or d["weight_bits"] != 8)) ): continue - quant_config = RTNWeightQuantConfig(**d) + quant_config = RTNConfig(**d) self._apply_rtn(quant_config) def test_rtn_return_type(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig for return_int in [True, False]: - quant_config = RTNWeightQuantConfig(return_int=return_int) + quant_config = RTNConfig(return_int=return_int) qmodel = self._apply_rtn(quant_config) def test_rtn_mse_search(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - quant_config = RTNWeightQuantConfig(enable_mse_search=True) + quant_config = RTNConfig(enable_mse_search=True) qmodel = self._apply_rtn(quant_config) def test_rtn_recover(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - quant_config = RTNWeightQuantConfig(return_int=True) + quant_config = RTNConfig(return_int=True) qmodel = self._apply_rtn(quant_config) input = torch.randn(4, 8) # test forward diff --git a/test/3x/torch/test_autotune.py b/test/3x/torch/test_autotune.py index 7e67436e87c..326da168dc8 100644 --- a/test/3x/torch/test_autotune.py +++ b/test/3x/torch/test_autotune.py @@ -62,12 +62,12 @@ def setUp(self): def test_autotune_api(self): logger.info("test_autotune_api") from neural_compressor.common.base_tuning import evaluator - from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune + from neural_compressor.torch import RTNConfig, TuningConfig, autotune def eval_acc_fn(model) -> float: return 1.0 - custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2) + custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2) best_model = autotune( model=build_simple_torch_model(), tune_config=custom_tune_config, eval_fns=[{"eval_fn": eval_acc_fn}] ) @@ -78,7 +78,7 @@ def eval_acc_fn(model) -> float: def test_autotune_api_2(self): logger.info("test_autotune_api") from neural_compressor.common.base_tuning import evaluator - from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune + from neural_compressor.torch import RTNConfig, TuningConfig, autotune def eval_acc_fn(model) -> float: return 1.0 @@ -94,7 +94,7 @@ def eval_perf_fn(model) -> float: }, ] - custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2) + custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2) best_model = autotune(model=build_simple_torch_model(), 
tune_config=custom_tune_config, eval_fns=eval_fns) self.assertIsNotNone(best_model) self.assertEqual(len(evaluator.eval_fn_registry), 2) @@ -102,9 +102,9 @@ def eval_perf_fn(model) -> float: @reset_tuning_target def test_autotune_not_eval_func(self): logger.info("test_autotune_api") - from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune + from neural_compressor.torch import RTNConfig, TuningConfig, autotune - custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2) + custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2) # Use assertRaises to check that an AssertionError is raised with self.assertRaises(AssertionError) as context: diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py index 0e0925685c0..3004bd97c48 100644 --- a/test/3x/torch/test_config.py +++ b/test/3x/torch/test_config.py @@ -57,7 +57,7 @@ def test_quantize_rtn_from_dict_beginner(self): from neural_compressor.torch import quantize quant_config = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, @@ -68,20 +68,20 @@ def test_quantize_rtn_from_dict_beginner(self): self.assertIsNotNone(qmodel) def test_quantize_rtn_from_class_beginner(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32) + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32) fp32_model = build_simple_torch_model() qmodel = quantize(fp32_model, quant_config) self.assertIsNotNone(qmodel) def test_quantize_rtndq_from_class_beginner(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - fp32_config = RTNWeightQuantConfig(weight_dtype="fp32") + fp32_config = RTNConfig(weight_dtype="fp32") fp32_model = copy.deepcopy(self.gptj) - quant_config = RTNWeightQuantConfig( + quant_config = RTNConfig( weight_bits=4, weight_dtype="int", weight_sym=False, @@ -96,7 +96,7 @@ def test_quantize_rtndq_from_class_beginner(self): from neural_compressor.torch.utils.utility import get_double_quant_config double_quant_config_dict = get_double_quant_config("GGML_TYPE_Q4_K", weight_sym=False) - quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict) + quant_config = RTNConfig.from_dict(double_quant_config_dict) quant_config.set_local("lm_head", fp32_config) qmodel = quantize(fp32_model, quant_config) out3 = qmodel(self.lm_input) @@ -104,7 +104,7 @@ def test_quantize_rtndq_from_class_beginner(self): fp32_model = copy.deepcopy(self.gptj) - quant_config = RTNWeightQuantConfig( + quant_config = RTNConfig( weight_bits=4, weight_dtype="nf4", weight_group_size=32, @@ -116,7 +116,7 @@ def test_quantize_rtndq_from_class_beginner(self): fp32_model = copy.deepcopy(self.gptj) # bitsandbytes double quant setting double_quant_config_dict = get_double_quant_config("BNB") - quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict) + quant_config = RTNConfig.from_dict(double_quant_config_dict) quant_config.set_local("lm_head", fp32_config) qmodel = quantize(fp32_model, quant_config) out5 = qmodel(self.lm_input) @@ -127,7 +127,7 @@ def test_quantize_rtn_from_dict_advance(self): fp32_model = build_simple_torch_model() quant_config = { - "rtn_weight_only_quant": { + "RTN": { "global": { "weight_dtype": 
"nf4", "weight_bits": 4, @@ -145,11 +145,11 @@ def test_quantize_rtn_from_dict_advance(self): self.assertIsNotNone(qmodel) def test_quantize_rtn_from_class_advance(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8") + fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8") quant_config.set_local("model.fc1", fc1_config) # get model and quantize fp32_model = build_simple_torch_model() @@ -157,23 +157,23 @@ def test_quantize_rtn_from_class_advance(self): self.assertIsNotNone(qmodel) def test_config_white_lst(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - global_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + global_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8", white_list=["model.fc1"]) + fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8", white_list=["model.fc1"]) # get model and quantize fp32_model = build_simple_torch_model() qmodel = quantize(fp32_model, quant_config=global_config + fc1_config) self.assertIsNotNone(qmodel) def test_config_white_lst2(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig from neural_compressor.torch.utils.utility import get_model_info - global_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + global_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=6, weight_dtype="int8", white_list=["fc1"]) + fc1_config = RTNConfig(weight_bits=6, weight_dtype="int8", white_list=["fc1"]) quant_config = global_config + fc1_config # get model and quantize fp32_model = build_simple_torch_model() @@ -185,10 +185,10 @@ def test_config_white_lst2(self): self.assertTrue(configs_mapping[("fc2", torch.nn.Linear)].weight_bits == 4) def test_config_from_dict(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig quant_config = { - "rtn_weight_only_quant": { + "RTN": { "global": { "weight_dtype": "nf4", "weight_bits": 4, @@ -202,32 +202,32 @@ def test_config_from_dict(self): }, } } - config = RTNWeightQuantConfig.from_dict(quant_config["rtn_weight_only_quant"]) + config = RTNConfig.from_dict(quant_config["RTN"]) self.assertIsNotNone(config.local_config) def test_config_to_dict(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") - fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8") + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4") + fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8") quant_config.set_local("model.fc1", fc1_config) config_dict = quant_config.to_dict() self.assertIn("global", config_dict) self.assertIn("local", config_dict) def test_same_type_configs_addition(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig quant_config1 = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, 
"weight_group_size": 32, }, } - q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"]) + q_config = RTNConfig.from_dict(quant_config1["RTN"]) quant_config2 = { - "rtn_weight_only_quant": { + "RTN": { "global": { "weight_bits": 8, "weight_group_size": 32, @@ -240,61 +240,59 @@ def test_same_type_configs_addition(self): }, } } - q_config2 = RTNWeightQuantConfig.from_dict(quant_config2["rtn_weight_only_quant"]) + q_config2 = RTNConfig.from_dict(quant_config2["RTN"]) q_config3 = q_config + q_config2 q3_dict = q_config3.to_dict() - for op_name, op_config in quant_config2["rtn_weight_only_quant"]["local"].items(): + for op_name, op_config in quant_config2["RTN"]["local"].items(): for attr, val in op_config.items(): self.assertEqual(q3_dict["local"][op_name][attr], val) - self.assertNotEqual( - q3_dict["global"]["weight_bits"], quant_config2["rtn_weight_only_quant"]["global"]["weight_bits"] - ) + self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["RTN"]["global"]["weight_bits"]) def test_diff_types_configs_addition(self): - from neural_compressor.torch import GPTQConfig, RTNWeightQuantConfig + from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"]) + q_config = RTNConfig.from_dict(quant_config1["RTN"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("rtn_weight_only_quant" in combined_config_d) + self.assertTrue("RTN" in combined_config_d) self.assertIn("gptq", combined_config_d) def test_composable_config_addition(self): - from neural_compressor.torch import GPTQConfig, RTNWeightQuantConfig + from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"]) + q_config = RTNConfig.from_dict(quant_config1["RTN"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("rtn_weight_only_quant" in combined_config_d) + self.assertTrue("RTN" in combined_config_d) self.assertIn("gptq", combined_config_d) combined_config2 = combined_config + d_config combined_config3 = combined_config + combined_config2 def test_config_mapping(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig from neural_compressor.torch.utils.utility import get_model_info - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=6, weight_dtype="int8") + fc1_config = RTNConfig(weight_bits=6, weight_dtype="int8") quant_config.set_local("fc1", fc1_config) # get model and quantize fp32_model = build_simple_torch_model() @@ -305,7 +303,7 @@ def test_config_mapping(self): self.assertTrue(configs_mapping[("fc1", torch.nn.Linear)].weight_bits == 6) self.assertTrue(configs_mapping[("fc2", torch.nn.Linear)].weight_bits == 4) # test regular matching - fc_config = RTNWeightQuantConfig(weight_bits=5, weight_dtype="int8") + fc_config = 
RTNConfig(weight_bits=5, weight_dtype="int8") quant_config.set_local("fc", fc_config) configs_mapping = quant_config.to_config_mapping(model_info=model_info) logger.info(configs_mapping) @@ -327,10 +325,10 @@ def test_gptq_config(self): class TestQuantConfigForAutotune(unittest.TestCase): def test_expand_config(self): # test the expand functionalities, the user is not aware it - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - tune_config = RTNWeightQuantConfig(weight_bits=[4, 6]) - expand_config_list = RTNWeightQuantConfig.expand(tune_config) + tune_config = RTNConfig(weight_bits=[4, 6]) + expand_config_list = RTNConfig.expand(tune_config) self.assertEqual(expand_config_list[0].weight_bits, 4) self.assertEqual(expand_config_list[1].weight_bits, 6) From 0cbc8e6bc955074d7facc97fba9a88167f1b8156 Mon Sep 17 00:00:00 2001 From: xin3he Date: Thu, 18 Jan 2024 16:58:10 +0800 Subject: [PATCH 2/3] fix name bug Signed-off-by: xin3he --- test/3x/torch/test_config.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py index 3004bd97c48..642f6de9cf5 100644 --- a/test/3x/torch/test_config.py +++ b/test/3x/torch/test_config.py @@ -57,7 +57,7 @@ def test_quantize_rtn_from_dict_beginner(self): from neural_compressor.torch import quantize quant_config = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, @@ -127,7 +127,7 @@ def test_quantize_rtn_from_dict_advance(self): fp32_model = build_simple_torch_model() quant_config = { - "RTN": { + "rtn": { "global": { "weight_dtype": "nf4", "weight_bits": 4, @@ -188,7 +188,7 @@ def test_config_from_dict(self): from neural_compressor.torch import RTNConfig quant_config = { - "RTN": { + "rtn": { "global": { "weight_dtype": "nf4", "weight_bits": 4, @@ -202,7 +202,7 @@ def test_config_from_dict(self): }, } } - config = RTNConfig.from_dict(quant_config["RTN"]) + config = RTNConfig.from_dict(quant_config["rtn"]) self.assertIsNotNone(config.local_config) def test_config_to_dict(self): @@ -219,15 +219,15 @@ def test_same_type_configs_addition(self): from neural_compressor.torch import RTNConfig quant_config1 = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNConfig.from_dict(quant_config1["RTN"]) + q_config = RTNConfig.from_dict(quant_config1["rtn"]) quant_config2 = { - "RTN": { + "rtn": { "global": { "weight_bits": 8, "weight_group_size": 32, @@ -240,48 +240,48 @@ def test_same_type_configs_addition(self): }, } } - q_config2 = RTNConfig.from_dict(quant_config2["RTN"]) + q_config2 = RTNConfig.from_dict(quant_config2["rtn"]) q_config3 = q_config + q_config2 q3_dict = q_config3.to_dict() - for op_name, op_config in quant_config2["RTN"]["local"].items(): + for op_name, op_config in quant_config2["rtn"]["local"].items(): for attr, val in op_config.items(): self.assertEqual(q3_dict["local"][op_name][attr], val) - self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["RTN"]["global"]["weight_bits"]) + self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["rtn"]["global"]["weight_bits"]) def test_diff_types_configs_addition(self): from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNConfig.from_dict(quant_config1["RTN"]) + q_config = 
RTNConfig.from_dict(quant_config1["rtn"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("RTN" in combined_config_d) + self.assertTrue("rtn" in combined_config_d) self.assertIn("gptq", combined_config_d) def test_composable_config_addition(self): from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNConfig.from_dict(quant_config1["RTN"]) + q_config = RTNConfig.from_dict(quant_config1["rtn"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("RTN" in combined_config_d) + self.assertTrue("rtn" in combined_config_d) self.assertIn("gptq", combined_config_d) combined_config2 = combined_config + d_config combined_config3 = combined_config + combined_config2 From 876e8960e4113c2de45992eff3ae63c4f3fd4d17 Mon Sep 17 00:00:00 2001 From: xin3he Date: Thu, 18 Jan 2024 17:00:43 +0800 Subject: [PATCH 3/3] change autotune path Signed-off-by: xin3he --- neural_compressor/torch/__init__.py | 2 +- neural_compressor/torch/autotune.py | 67 ----------------------------- 2 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 neural_compressor/torch/autotune.py diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index 46f4082de67..81f131ca114 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -24,4 +24,4 @@ ) from neural_compressor.common.base_tuning import TuningConfig -from neural_compressor.torch.autotune import autotune, get_default_tune_config +from neural_compressor.torch.quantization.autotune import autotune, get_default_tune_config diff --git a/neural_compressor/torch/autotune.py b/neural_compressor/torch/autotune.py deleted file mode 100644 index 73dda619d22..00000000000 --- a/neural_compressor/torch/autotune.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, List, Optional, Union - -import torch - -from neural_compressor.common.base_config import BaseConfig -from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning -from neural_compressor.common.logger import Logger -from neural_compressor.torch import quantize -from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig - -logger = Logger().get_logger() - - -__all__ = [ - "get_default_tune_config", - "autotune", -] - - -def get_default_tune_config() -> TuningConfig: - # TODO use the registered default tuning config in the next PR - return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])]) - - -def autotune( - model: torch.nn.Module, - tune_config: TuningConfig, - eval_fns: Optional[Union[Dict, List[Dict]]] = None, - run_fn=None, - run_args=None, -) -> Optional[torch.nn.Module]: - """The main entry of auto-tune.""" - best_quant_model = None - evaluator.set_eval_fn_registry(eval_fns) - evaluator.self_check() - config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) - tuning_logger.tuning_start() - for trial_index, quant_config in enumerate(config_loader): - tuning_logger.trial_start(trial_index=trial_index) - tuning_logger.quantization_start() - q_model = quantize(model, quant_config=quant_config, run_fn=run_fn, run_args=run_args) - tuning_logger.quantization_end() - tuning_logger.evaluation_start() - eval_result: float = evaluator.evaluate(q_model) - tuning_logger.evaluation_end() - tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) - if tuning_monitor.need_stop(): - best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() - quantize(model, quant_config=best_quant_config, run_fn=run_fn, run_args=run_args, inplace=True) - best_quant_model = model # quantize model inplace - tuning_logger.trial_end(trial_index) - tuning_logger.tuning_end() - return best_quant_model
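
A minimal usage sketch of the renamed API after this series. The toy model and
its layer sizes are hypothetical stand-ins for build_simple_torch_model() from
the tests; the RTNConfig arguments, the set_local() per-operator override, and
the quantize() call mirror test/3x/torch/test_config.py:

    import torch
    from neural_compressor.torch import RTNConfig, quantize

    # Hypothetical two-layer model standing in for build_simple_torch_model().
    class ToyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = torch.nn.Linear(32, 16)
            self.fc2 = torch.nn.Linear(16, 8)

        def forward(self, x):
            return self.fc2(self.fc1(x))

    # Global 4-bit NF4 round-to-nearest config, plus a per-operator override
    # for fc1, as exercised by test_config_mapping.
    quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
    quant_config.set_local("fc1", RTNConfig(weight_bits=6, weight_dtype="int8"))
    q_model = quantize(ToyModel(), quant_config)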
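
A similar sketch for the relocated autotune entry point (after PATCH 3/3 it
lives in neural_compressor/torch/quantization/autotune.py and is re-exported
from neural_compressor.torch). The constant eval function is a placeholder;
the eval_fns format and TuningConfig arguments follow
test/3x/torch/test_autotune.py:

    from neural_compressor.torch import RTNConfig, TuningConfig, autotune

    def eval_acc_fn(model) -> float:
        # Placeholder metric; a real harness would score a validation set.
        return 1.0

    # Search 4- and 6-bit RTN weight quantization, stopping after two trials.
    custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2)
    best_model = autotune(
        model=ToyModel(),  # the hypothetical toy model from the sketch above
        tune_config=custom_tune_config,
        eval_fns=[{"eval_fn": eval_acc_fn}],
    )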