From 941fed327d0d3f1ea7535f59e789089ce70f7ce0 Mon Sep 17 00:00:00 2001
From: xinhe
Date: Fri, 19 Jan 2024 10:54:50 +0800
Subject: [PATCH] Rename RTNWeightQuantConfig to RTNConfig (#1551)

* Rename RTNWeightQuantConfig to RTNConfig

Signed-off-by: xin3he
---
 .../quantization/llm/run_clm_no_trainer.py    |  8 +-
 neural_compressor/common/utility.py           |  2 +-
 neural_compressor/tensorflow/utils.py         |  2 +-
 neural_compressor/torch/__init__.py           |  4 +-
 .../torch/algorithms/weight_only/rtn.py       |  4 +-
 .../torch/algorithms/weight_only_algos.py     |  8 +-
 .../torch/quantization/__init__.py            |  2 +-
 .../torch/{ => quantization}/autotune.py      |  4 +-
 .../torch/quantization/config.py              | 24 ++---
 neural_compressor/torch/utils/utility.py      |  2 +-
 .../quantization/weight_only/test_rtn.py      | 18 ++--
 test/3x/torch/test_autotune.py                | 12 +--
 test/3x/torch/test_config.py                  | 96 +++++++++----------
 13 files changed, 89 insertions(+), 97 deletions(-)
 rename neural_compressor/torch/{ => quantization}/autotune.py (97%)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 06818699f5a..8235594a8d2 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -230,7 +230,7 @@ def get_user_model():
 
     # 3.x api
     if args.approach == 'weight_only':
-        from neural_compressor.torch import RTNWeightQuantConfig, GPTQConfig, quantize
+        from neural_compressor.torch import RTNConfig, GPTQConfig, quantize
         from neural_compressor.torch.utils.utility import get_double_quant_config
         weight_sym = True if args.woq_scheme == "sym" else False
         double_quant_config_dict = get_double_quant_config(args.double_quant_type, weight_sym=weight_sym)
@@ -243,9 +243,9 @@ def get_user_model():
                     "enable_mse_search": args.woq_enable_mse_search,
                 }
             )
-            quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict)
+            quant_config = RTNConfig.from_dict(double_quant_config_dict)
         else:
-            quant_config = RTNWeightQuantConfig(
+            quant_config = RTNConfig(
                 weight_dtype=args.woq_dtype,
                 weight_bits=args.woq_bits,
                 weight_group_size=args.woq_group_size,
@@ -257,7 +257,7 @@ def get_user_model():
                 double_quant_sym=args.double_quant_sym,
                 double_quant_group_size=args.double_quant_group_size,
             )
-        quant_config.set_local("lm_head", RTNWeightQuantConfig(weight_dtype="fp32"))
+        quant_config.set_local("lm_head", RTNConfig(weight_dtype="fp32"))
         user_model = quantize(
             model=user_model, quant_config=quant_config
         )
diff --git a/neural_compressor/common/utility.py b/neural_compressor/common/utility.py
index 7761a173d7d..42f6e445b9a 100644
--- a/neural_compressor/common/utility.py
+++ b/neural_compressor/common/utility.py
@@ -27,7 +27,7 @@
 # config name
 BASE_CONFIG = "base_config"
 COMPOSABLE_CONFIG = "composable_config"
-RTN_WEIGHT_ONLY_QUANT = "rtn_weight_only_quant"
+RTN = "rtn"
 STATIC_QUANT = "static_quant"
 GPTQ = "gptq"
 FP8_QUANT = "fp8_quant"
diff --git a/neural_compressor/tensorflow/utils.py b/neural_compressor/tensorflow/utils.py
index 6f65f79fbc1..4497c1e9a7a 100644
--- a/neural_compressor/tensorflow/utils.py
+++ b/neural_compressor/tensorflow/utils.py
@@ -35,7 +35,7 @@ def register_algo(name):
     Usage example:
         @register_algo(name=example_algo)
-        def example_algo(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module:
+        def example_algo(model: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module:
             ...
 
     Args:
         name (str): The name under which the algorithm function will be registered.
diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py
index c50e60103ea..81f131ca114 100644
--- a/neural_compressor/torch/__init__.py
+++ b/neural_compressor/torch/__init__.py
@@ -17,11 +17,11 @@
 from neural_compressor.torch.quantization import (
     quantize,
-    RTNWeightQuantConfig,
+    RTNConfig,
     get_default_rtn_config,
     GPTQConfig,
     get_default_gptq_config,
 )
 
 from neural_compressor.common.base_tuning import TuningConfig
-from neural_compressor.torch.autotune import autotune, get_default_tune_config
+from neural_compressor.torch.quantization.autotune import autotune, get_default_tune_config
diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py
index 1f5949946c3..8d5e57c4f7f 100644
--- a/neural_compressor/torch/algorithms/weight_only/rtn.py
+++ b/neural_compressor/torch/algorithms/weight_only/rtn.py
@@ -580,10 +580,10 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1, dtype="int"):
     return int_weight
 
 
-from neural_compressor.torch.quantization.config import RTNWeightQuantConfig
+from neural_compressor.torch.quantization.config import RTNConfig
 
 
-def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module:
+def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module:
     # TODO (Yi) remove it
     enable_full_range = quant_config.enable_full_range
     enable_mse_search = quant_config.enable_mse_search
diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py
index e3cef82d213..368f00c74b6 100644
--- a/neural_compressor/torch/algorithms/weight_only_algos.py
+++ b/neural_compressor/torch/algorithms/weight_only_algos.py
@@ -18,17 +18,17 @@
 import torch
 
 from neural_compressor.common.logger import Logger
-from neural_compressor.common.utility import GPTQ, RTN_WEIGHT_ONLY_QUANT
-from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig
+from neural_compressor.common.utility import GPTQ, RTN
+from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig
 from neural_compressor.torch.utils.utility import fetch_module, register_algo, set_module
 
 logger = Logger().get_logger()
 
 
 ###################### RTN Algo Entry ##################################
-@register_algo(name=RTN_WEIGHT_ONLY_QUANT)
+@register_algo(name=RTN)
 def rtn_quantize_entry(
-    model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNWeightQuantConfig], *args, **kwargs
+    model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs
 ) -> torch.nn.Module:
     """The main entry to apply rtn quantization."""
     from .weight_only.rtn import apply_rtn_on_single_module
diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py
index d54393a21fc..c78a7b0552e 100644
--- a/neural_compressor/torch/quantization/__init__.py
+++ b/neural_compressor/torch/quantization/__init__.py
@@ -14,7 +14,7 @@
 
 from neural_compressor.torch.quantization.quantize import quantize, quantize_dynamic
 from neural_compressor.torch.quantization.config import (
-    RTNWeightQuantConfig,
+    RTNConfig,
     get_default_rtn_config,
     GPTQConfig,
     get_default_gptq_config,
diff --git a/neural_compressor/torch/autotune.py b/neural_compressor/torch/quantization/autotune.py
similarity index 97%
rename from neural_compressor/torch/autotune.py
rename to neural_compressor/torch/quantization/autotune.py
index cded26ebc48..73dda619d22 100644
--- a/neural_compressor/torch/autotune.py
+++ b/neural_compressor/torch/quantization/autotune.py
@@ -20,7 +20,7 @@
 from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning
 from neural_compressor.common.logger import Logger
 from neural_compressor.torch import quantize
-from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig
+from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig
 
 logger = Logger().get_logger()
 
@@ -33,7 +33,7 @@
 
 def get_default_tune_config() -> TuningConfig:
     # TODO use the registered default tuning config in the next PR
-    return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNWeightQuantConfig(weight_bits=[4, 8])])
+    return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])])
 
 
 def autotune(
diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py
index 11dc239d050..07dc85db5e1 100644
--- a/neural_compressor/torch/quantization/config.py
+++ b/neural_compressor/torch/quantization/config.py
@@ -24,13 +24,7 @@
 import torch
 
 from neural_compressor.common.base_config import BaseConfig, config_registry, register_config
-from neural_compressor.common.utility import (
-    DEFAULT_WHITE_LIST,
-    FP8_QUANT,
-    GPTQ,
-    OP_NAME_OR_MODULE_TYPE,
-    RTN_WEIGHT_ONLY_QUANT,
-)
+from neural_compressor.common.utility import DEFAULT_WHITE_LIST, FP8_QUANT, GPTQ, OP_NAME_OR_MODULE_TYPE, RTN
 from neural_compressor.torch.utils.constants import PRIORITY_GPTQ, PRIORITY_RTN
 from neural_compressor.torch.utils.utility import is_hpex_avaliable, logger
@@ -60,8 +54,8 @@ class OperatorConfig(NamedTuple):
 
 
 ######################## RNT Config ###############################
-@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN_WEIGHT_ONLY_QUANT, priority=PRIORITY_RTN)
-class RTNWeightQuantConfig(BaseConfig):
+@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN, priority=PRIORITY_RTN)
+class RTNConfig(BaseConfig):
     """Config class for round-to-nearest weight-only quantization."""
 
     supported_configs: List[OperatorConfig] = []
@@ -80,7 +74,7 @@ class RTNWeightQuantConfig(BaseConfig):
         "double_quant_sym",
         "double_quant_group_size",
     ]
-    name = RTN_WEIGHT_ONLY_QUANT
+    name = RTN
 
     def __init__(
         self,
@@ -137,12 +131,12 @@ def to_dict(self):
 
     @classmethod
     def from_dict(cls, config_dict):
-        return super(RTNWeightQuantConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator)
+        return super(RTNConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator)
 
     @classmethod
     def register_supported_configs(cls) -> List[OperatorConfig]:
         supported_configs = []
-        linear_rtn_config = RTNWeightQuantConfig(
+        linear_rtn_config = RTNConfig(
             weight_dtype=["int", "int8", "int4", "nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"],
             weight_bits=[4, 1, 2, 3, 5, 6, 7, 8],
             weight_group_size=[32, -1, 1, 4, 8, 16, 64, 128, 256, 512, 1024],
@@ -173,16 +167,16 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]:
 
 
 # TODO(Yi) run `register_supported_configs` for all registered config.
-RTNWeightQuantConfig.register_supported_configs()
+RTNConfig.register_supported_configs()
 
 
-def get_default_rtn_config() -> RTNWeightQuantConfig:
+def get_default_rtn_config() -> RTNConfig:
     """Generate the default rtn config.
 
     Returns:
         the default rtn config.
     """
-    return RTNWeightQuantConfig()
+    return RTNConfig()
 
 
 ######################## GPTQ Config ###############################
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index b1748d059eb..6abde03318d 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -33,7 +33,7 @@ def register_algo(name):
     Usage example:
         @register_algo(name=example_algo)
-        def example_algo(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module:
+        def example_algo(model: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module:
             ...
 
     Args:
diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py
index 00159b55828..a53671c8bb7 100644
--- a/test/3x/torch/quantization/weight_only/test_rtn.py
+++ b/test/3x/torch/quantization/weight_only/test_rtn.py
@@ -60,7 +60,7 @@ def _apply_rtn(self, quant_config):
         return qmodel
 
     def test_rtn(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
         # some tests were skipped to accelerate the CI
         rnt_options = {
@@ -76,7 +76,7 @@ def test_rtn(self):
         }
         from itertools import product
 
-        keys = RTNWeightQuantConfig.params_list
+        keys = RTNConfig.params_list
         for value in product(*rnt_options.values()):
             d = dict(zip(keys, value))
             if (d["weight_dtype"] == "int" and d["weight_bits"] != 8) or (
@@ -85,26 +85,26 @@ def test_rtn(self):
                 or (d["return_int"] and (d["group_dim"] != 1 or d["weight_bits"] != 8))
             ):
                 continue
-            quant_config = RTNWeightQuantConfig(**d)
+            quant_config = RTNConfig(**d)
             self._apply_rtn(quant_config)
 
     def test_rtn_return_type(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
         for return_int in [True, False]:
-            quant_config = RTNWeightQuantConfig(return_int=return_int)
+            quant_config = RTNConfig(return_int=return_int)
             qmodel = self._apply_rtn(quant_config)
 
     def test_rtn_mse_search(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
-        quant_config = RTNWeightQuantConfig(enable_mse_search=True)
+        quant_config = RTNConfig(enable_mse_search=True)
         qmodel = self._apply_rtn(quant_config)
 
     def test_rtn_recover(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
-        quant_config = RTNWeightQuantConfig(return_int=True)
+        quant_config = RTNConfig(return_int=True)
         qmodel = self._apply_rtn(quant_config)
         input = torch.randn(4, 8)
         # test forward
diff --git a/test/3x/torch/test_autotune.py b/test/3x/torch/test_autotune.py
index 7e67436e87c..326da168dc8 100644
--- a/test/3x/torch/test_autotune.py
+++ b/test/3x/torch/test_autotune.py
@@ -62,12 +62,12 @@ def setUp(self):
     def test_autotune_api(self):
         logger.info("test_autotune_api")
         from neural_compressor.common.base_tuning import evaluator
-        from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune
+        from neural_compressor.torch import RTNConfig, TuningConfig, autotune
 
         def eval_acc_fn(model) -> float:
             return 1.0
 
-        custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2)
+        custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2)
         best_model = autotune(
             model=build_simple_torch_model(), tune_config=custom_tune_config, eval_fns=[{"eval_fn": eval_acc_fn}]
         )
@@ -78,7 +78,7 @@ def eval_acc_fn(model) -> float:
     def test_autotune_api_2(self):
         logger.info("test_autotune_api")
         from neural_compressor.common.base_tuning import evaluator
-        from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune
+        from neural_compressor.torch import RTNConfig, TuningConfig, autotune
 
         def eval_acc_fn(model) -> float:
             return 1.0
@@ -94,7 +94,7 @@ def eval_perf_fn(model) -> float:
             },
         ]
 
-        custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2)
+        custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2)
         best_model = autotune(model=build_simple_torch_model(), tune_config=custom_tune_config, eval_fns=eval_fns)
         self.assertIsNotNone(best_model)
         self.assertEqual(len(evaluator.eval_fn_registry), 2)
@@ -102,9 +102,9 @@ def eval_perf_fn(model) -> float:
     @reset_tuning_target
     def test_autotune_not_eval_func(self):
         logger.info("test_autotune_api")
-        from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune
+        from neural_compressor.torch import RTNConfig, TuningConfig, autotune
 
-        custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2)
+        custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2)
 
         # Use assertRaises to check that an AssertionError is raised
         with self.assertRaises(AssertionError) as context:
diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py
index 0e0925685c0..642f6de9cf5 100644
--- a/test/3x/torch/test_config.py
+++ b/test/3x/torch/test_config.py
@@ -57,7 +57,7 @@ def test_quantize_rtn_from_dict_beginner(self):
         from neural_compressor.torch import quantize
 
         quant_config = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "weight_dtype": "nf4",
                 "weight_bits": 4,
                 "weight_group_size": 32,
@@ -68,20 +68,20 @@ def test_quantize_rtn_from_dict_beginner(self):
         self.assertIsNotNone(qmodel)
 
     def test_quantize_rtn_from_class_beginner(self):
-        from neural_compressor.torch import RTNWeightQuantConfig, quantize
+        from neural_compressor.torch import RTNConfig, quantize
 
-        quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32)
+        quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32)
         fp32_model = build_simple_torch_model()
         qmodel = quantize(fp32_model, quant_config)
         self.assertIsNotNone(qmodel)
 
     def test_quantize_rtndq_from_class_beginner(self):
-        from neural_compressor.torch import RTNWeightQuantConfig, quantize
+        from neural_compressor.torch import RTNConfig, quantize
 
-        fp32_config = RTNWeightQuantConfig(weight_dtype="fp32")
+        fp32_config = RTNConfig(weight_dtype="fp32")
 
         fp32_model = copy.deepcopy(self.gptj)
-        quant_config = RTNWeightQuantConfig(
+        quant_config = RTNConfig(
             weight_bits=4,
             weight_dtype="int",
             weight_sym=False,
@@ -96,7 +96,7 @@ def test_quantize_rtndq_from_class_beginner(self):
         from neural_compressor.torch.utils.utility import get_double_quant_config
 
         double_quant_config_dict = get_double_quant_config("GGML_TYPE_Q4_K", weight_sym=False)
-        quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict)
+        quant_config = RTNConfig.from_dict(double_quant_config_dict)
         quant_config.set_local("lm_head", fp32_config)
         qmodel = quantize(fp32_model, quant_config)
         out3 = qmodel(self.lm_input)
@@ -104,7 +104,7 @@ def test_quantize_rtndq_from_class_beginner(self):
 
         fp32_model = copy.deepcopy(self.gptj)
 
-        quant_config = RTNWeightQuantConfig(
+        quant_config = RTNConfig(
             weight_bits=4,
             weight_dtype="nf4",
             weight_group_size=32,
@@ -116,7 +116,7 @@ def test_quantize_rtndq_from_class_beginner(self):
         fp32_model = copy.deepcopy(self.gptj)
         # bitsandbytes double quant setting
         double_quant_config_dict = get_double_quant_config("BNB")
-        quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict)
+        quant_config = RTNConfig.from_dict(double_quant_config_dict)
         quant_config.set_local("lm_head", fp32_config)
         qmodel = quantize(fp32_model, quant_config)
         out5 = qmodel(self.lm_input)
@@ -127,7 +127,7 @@ def test_quantize_rtn_from_dict_advance(self):
         fp32_model = build_simple_torch_model()
         quant_config = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "global": {
                     "weight_dtype": "nf4",
                     "weight_bits": 4,
                     "weight_group_size": 32,
                 },
                 "local": {
                     "fc1": {
                         "weight_dtype": "int8",
                         "weight_bits": 4,
                     }
                 },
             }
         }
         qmodel = quantize(fp32_model, quant_config)
         self.assertIsNotNone(qmodel)
@@ -145,11 +145,11 @@ def test_quantize_rtn_from_dict_advance(self):
     def test_quantize_rtn_from_class_advance(self):
-        from neural_compressor.torch import RTNWeightQuantConfig, quantize
+        from neural_compressor.torch import RTNConfig, quantize
 
-        quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4")
+        quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
         # set operator instance
-        fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8")
+        fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8")
         quant_config.set_local("model.fc1", fc1_config)
         # get model and quantize
         fp32_model = build_simple_torch_model()
@@ -157,23 +157,23 @@ def test_quantize_rtn_from_class_advance(self):
         self.assertIsNotNone(qmodel)
 
     def test_config_white_lst(self):
-        from neural_compressor.torch import RTNWeightQuantConfig, quantize
+        from neural_compressor.torch import RTNConfig, quantize
 
-        global_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4")
+        global_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
         # set operator instance
-        fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8", white_list=["model.fc1"])
+        fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8", white_list=["model.fc1"])
         # get model and quantize
         fp32_model = build_simple_torch_model()
         qmodel = quantize(fp32_model, quant_config=global_config + fc1_config)
         self.assertIsNotNone(qmodel)
 
     def test_config_white_lst2(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
         from neural_compressor.torch.utils.utility import get_model_info
 
-        global_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4")
+        global_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
         # set operator instance
-        fc1_config = RTNWeightQuantConfig(weight_bits=6, weight_dtype="int8", white_list=["fc1"])
+        fc1_config = RTNConfig(weight_bits=6, weight_dtype="int8", white_list=["fc1"])
         quant_config = global_config + fc1_config
         # get model and quantize
         fp32_model = build_simple_torch_model()
@@ -185,10 +185,10 @@ def test_config_white_lst2(self):
         self.assertTrue(configs_mapping[("fc1", torch.nn.Linear)].weight_bits == 6)
         self.assertTrue(configs_mapping[("fc2", torch.nn.Linear)].weight_bits == 4)
 
     def test_config_from_dict(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
         quant_config = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "global": {
                     "weight_dtype": "nf4",
                     "weight_bits": 4,
                     "weight_group_size": 32,
                 },
                 "local": {
                     "model.fc1": {
                         "weight_dtype": "int8",
                         "weight_bits": 4,
                     }
                 },
             }
         }
-        config = RTNWeightQuantConfig.from_dict(quant_config["rtn_weight_only_quant"])
+        config = RTNConfig.from_dict(quant_config["rtn"])
         self.assertIsNotNone(config.local_config)
 
     def test_config_to_dict(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
-        quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4")
-        fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8")
+        quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
+        fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8")
         quant_config.set_local("model.fc1", fc1_config)
         config_dict = quant_config.to_dict()
         self.assertIn("global", config_dict)
         self.assertIn("local", config_dict)
 
     def test_same_type_configs_addition(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
         quant_config1 = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "weight_dtype": "nf4",
                 "weight_bits": 4,
                 "weight_group_size": 32,
             },
         }
-        q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"])
+        q_config = RTNConfig.from_dict(quant_config1["rtn"])
         quant_config2 = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "global": {
                     "weight_bits": 8,
                     "weight_group_size": 32,
                 },
                 "local": {
                     "model.fc1": {
                         "weight_bits": 4,
                         "weight_group_size": 32,
                     },
                 },
             }
         }
-        q_config2 = RTNWeightQuantConfig.from_dict(quant_config2["rtn_weight_only_quant"])
+        q_config2 = RTNConfig.from_dict(quant_config2["rtn"])
         q_config3 = q_config + q_config2
         q3_dict = q_config3.to_dict()
-        for op_name, op_config in quant_config2["rtn_weight_only_quant"]["local"].items():
+        for op_name, op_config in quant_config2["rtn"]["local"].items():
             for attr, val in op_config.items():
                 self.assertEqual(q3_dict["local"][op_name][attr], val)
-        self.assertNotEqual(
-            q3_dict["global"]["weight_bits"], quant_config2["rtn_weight_only_quant"]["global"]["weight_bits"]
-        )
+        self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["rtn"]["global"]["weight_bits"])
 
     def test_diff_types_configs_addition(self):
-        from neural_compressor.torch import GPTQConfig, RTNWeightQuantConfig
+        from neural_compressor.torch import GPTQConfig, RTNConfig
 
         quant_config1 = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "weight_dtype": "nf4",
                 "weight_bits": 4,
                 "weight_group_size": 32,
             },
         }
-        q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"])
+        q_config = RTNConfig.from_dict(quant_config1["rtn"])
         d_config = GPTQConfig(double_quant_bits=4)
         combined_config = q_config + d_config
         combined_config_d = combined_config.to_dict()
         logger.info(combined_config)
-        self.assertTrue("rtn_weight_only_quant" in combined_config_d)
+        self.assertTrue("rtn" in combined_config_d)
         self.assertIn("gptq", combined_config_d)
 
     def test_composable_config_addition(self):
-        from neural_compressor.torch import GPTQConfig, RTNWeightQuantConfig
+        from neural_compressor.torch import GPTQConfig, RTNConfig
 
         quant_config1 = {
-            "rtn_weight_only_quant": {
+            "rtn": {
                 "weight_dtype": "nf4",
                 "weight_bits": 4,
                 "weight_group_size": 32,
             },
         }
-        q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"])
+        q_config = RTNConfig.from_dict(quant_config1["rtn"])
         d_config = GPTQConfig(double_quant_bits=4)
         combined_config = q_config + d_config
         combined_config_d = combined_config.to_dict()
         logger.info(combined_config)
-        self.assertTrue("rtn_weight_only_quant" in combined_config_d)
+        self.assertTrue("rtn" in combined_config_d)
         self.assertIn("gptq", combined_config_d)
         combined_config2 = combined_config + d_config
         combined_config3 = combined_config + combined_config2
 
     def test_config_mapping(self):
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
         from neural_compressor.torch.utils.utility import get_model_info
 
-        quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4")
+        quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
         # set operator instance
-        fc1_config = RTNWeightQuantConfig(weight_bits=6, weight_dtype="int8")
+        fc1_config = RTNConfig(weight_bits=6, weight_dtype="int8")
         quant_config.set_local("fc1", fc1_config)
         # get model and quantize
         fp32_model = build_simple_torch_model()
@@ -305,7 +303,7 @@ def test_config_mapping(self):
         self.assertTrue(configs_mapping[("fc1", torch.nn.Linear)].weight_bits == 6)
         self.assertTrue(configs_mapping[("fc2", torch.nn.Linear)].weight_bits == 4)
         # test regular matching
-        fc_config = RTNWeightQuantConfig(weight_bits=5, weight_dtype="int8")
+        fc_config = RTNConfig(weight_bits=5, weight_dtype="int8")
         quant_config.set_local("fc", fc_config)
         configs_mapping = quant_config.to_config_mapping(model_info=model_info)
         logger.info(configs_mapping)
@@ -327,10 +325,10 @@ def test_gptq_config(self):
 class TestQuantConfigForAutotune(unittest.TestCase):
     def test_expand_config(self):
         # test the expand functionalities, the user is not aware it
-        from neural_compressor.torch import RTNWeightQuantConfig
+        from neural_compressor.torch import RTNConfig
 
-        tune_config = RTNWeightQuantConfig(weight_bits=[4, 6])
-        expand_config_list = RTNWeightQuantConfig.expand(tune_config)
+        tune_config = RTNConfig(weight_bits=[4, 6])
+        expand_config_list = RTNConfig.expand(tune_config)
         self.assertEqual(expand_config_list[0].weight_bits, 4)
         self.assertEqual(expand_config_list[1].weight_bits, 6)
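
Migration note (a minimal sketch, not part of the patch): the rename touches three spellings at once, the config class (`RTNWeightQuantConfig` -> `RTNConfig`), the registered algorithm name (`RTN_WEIGHT_ONLY_QUANT` -> `RTN`), and the dict-style config key (`"rtn_weight_only_quant"` -> `"rtn"`). Assuming `toy_model` stands in for any torch.nn.Module with Linear layers (a hypothetical name; the patch's own tests use `build_simple_torch_model()`), downstream code after this change looks like:

    from neural_compressor.torch import RTNConfig, quantize

    # class-style config; was: RTNWeightQuantConfig(weight_bits=4, ...)
    quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32)
    # per-op override, e.g. keep an output head in fp32 (mirrors run_clm_no_trainer.py)
    quant_config.set_local("lm_head", RTNConfig(weight_dtype="fp32"))
    qmodel = quantize(model=toy_model, quant_config=quant_config)  # toy_model is hypothetical

    # dict-style config; the top-level key is now "rtn" (was "rtn_weight_only_quant")
    quant_config = {"rtn": {"weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32}}
    qmodel = quantize(toy_model, quant_config)

`autotune` also moves from neural_compressor.torch.autotune to neural_compressor.torch.quantization.autotune, but the re-export in neural_compressor/torch/__init__.py keeps `from neural_compressor.torch import autotune` working, so only imports of the old module path need updating.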