From 73dc12cac5a7587ee5ca0085d9053a7b81ca8cfa Mon Sep 17 00:00:00 2001
From: xin3he
Date: Thu, 18 Jan 2024 16:15:28 +0800
Subject: [PATCH 1/3] Rename RTNWeightQuantConfig to RTNConfig

Signed-off-by: xin3he
---
 .../quantization/llm/run_clm_no_trainer.py | 8 +-
 neural_compressor/common/utility.py | 2 +-
 neural_compressor/tensorflow/utils.py | 2 +-
 neural_compressor/torch/__init__.py | 2 +-
 .../torch/algorithms/weight_only/rtn.py | 4 +-
 .../torch/algorithms/weight_only_algos.py | 8 +-
 neural_compressor/torch/autotune.py | 4 +-
 .../torch/quantization/__init__.py | 2 +-
 .../torch/quantization/autotune.py | 67 +++++++++++++
 .../torch/quantization/config.py | 24 ++---
 neural_compressor/torch/utils/utility.py | 2 +-
 .../quantization/weight_only/test_rtn.py | 18 ++--
 test/3x/torch/test_autotune.py | 12 +--
 test/3x/torch/test_config.py | 96 +++++++++----------
 14 files changed, 155 insertions(+), 96 deletions(-)
 create mode 100644 neural_compressor/torch/quantization/autotune.py

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
index 06818699f5a..8235594a8d2 100644
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/run_clm_no_trainer.py
@@ -230,7 +230,7 @@ def get_user_model():
 # 3.x api
 if args.approach == 'weight_only':
- from neural_compressor.torch import RTNWeightQuantConfig, GPTQConfig, quantize
+ from neural_compressor.torch import RTNConfig, GPTQConfig, quantize
 from neural_compressor.torch.utils.utility import get_double_quant_config
 weight_sym = True if args.woq_scheme == "sym" else False
 double_quant_config_dict = get_double_quant_config(args.double_quant_type, weight_sym=weight_sym)
@@ -243,9 +243,9 @@ def get_user_model():
 "enable_mse_search": args.woq_enable_mse_search,
 }
 )
- quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict)
+ quant_config = RTNConfig.from_dict(double_quant_config_dict)
 else:
- quant_config = RTNWeightQuantConfig(
+ quant_config = RTNConfig(
 weight_dtype=args.woq_dtype,
 weight_bits=args.woq_bits,
 weight_group_size=args.woq_group_size,
@@ -257,7 +257,7 @@ def get_user_model():
 double_quant_sym=args.double_quant_sym,
 double_quant_group_size=args.double_quant_group_size,
 )
- quant_config.set_local("lm_head", RTNWeightQuantConfig(weight_dtype="fp32"))
+ quant_config.set_local("lm_head", RTNConfig(weight_dtype="fp32"))
 user_model = quantize(
 model=user_model, quant_config=quant_config
 )
diff --git a/neural_compressor/common/utility.py b/neural_compressor/common/utility.py
index 7761a173d7d..42f6e445b9a 100644
--- a/neural_compressor/common/utility.py
+++ b/neural_compressor/common/utility.py
@@ -27,7 +27,7 @@
 # config name
 BASE_CONFIG = "base_config"
 COMPOSABLE_CONFIG = "composable_config"
-RTN_WEIGHT_ONLY_QUANT = "rtn_weight_only_quant"
+RTN = "rtn"
 STATIC_QUANT = "static_quant"
 GPTQ = "gptq"
 FP8_QUANT = "fp8_quant"
diff --git a/neural_compressor/tensorflow/utils.py b/neural_compressor/tensorflow/utils.py
index 6f65f79fbc1..4497c1e9a7a 100644
--- a/neural_compressor/tensorflow/utils.py
+++ b/neural_compressor/tensorflow/utils.py
@@ -35,7 +35,7 @@ def register_algo(name):
 Usage example:
 @register_algo(name=example_algo)
- def example_algo(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module:
+ def 
example_algo(model: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module: ... Args: name (str): The name under which the algorithm function will be registered. diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index c50e60103ea..46f4082de67 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -17,7 +17,7 @@ from neural_compressor.torch.quantization import ( quantize, - RTNWeightQuantConfig, + RTNConfig, get_default_rtn_config, GPTQConfig, get_default_gptq_config, diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py index 1f5949946c3..8d5e57c4f7f 100644 --- a/neural_compressor/torch/algorithms/weight_only/rtn.py +++ b/neural_compressor/torch/algorithms/weight_only/rtn.py @@ -580,10 +580,10 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1, dtype="int"): return int_weight -from neural_compressor.torch.quantization.config import RTNWeightQuantConfig +from neural_compressor.torch.quantization.config import RTNConfig -def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: +def apply_rtn_on_single_module(module: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module: # TODO (Yi) remove it enable_full_range = quant_config.enable_full_range enable_mse_search = quant_config.enable_mse_search diff --git a/neural_compressor/torch/algorithms/weight_only_algos.py b/neural_compressor/torch/algorithms/weight_only_algos.py index e3cef82d213..368f00c74b6 100644 --- a/neural_compressor/torch/algorithms/weight_only_algos.py +++ b/neural_compressor/torch/algorithms/weight_only_algos.py @@ -18,17 +18,17 @@ import torch from neural_compressor.common.logger import Logger -from neural_compressor.common.utility import GPTQ, RTN_WEIGHT_ONLY_QUANT -from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig +from neural_compressor.common.utility import GPTQ, RTN +from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig from neural_compressor.torch.utils.utility import fetch_module, register_algo, set_module logger = Logger().get_logger() ###################### RTN Algo Entry ################################## -@register_algo(name=RTN_WEIGHT_ONLY_QUANT) +@register_algo(name=RTN) def rtn_quantize_entry( - model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNWeightQuantConfig], *args, **kwargs + model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs ) -> torch.nn.Module: """The main entry to apply rtn quantization.""" from .weight_only.rtn import apply_rtn_on_single_module diff --git a/neural_compressor/torch/autotune.py b/neural_compressor/torch/autotune.py index cded26ebc48..73dda619d22 100644 --- a/neural_compressor/torch/autotune.py +++ b/neural_compressor/torch/autotune.py @@ -20,7 +20,7 @@ from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning from neural_compressor.common.logger import Logger from neural_compressor.torch import quantize -from neural_compressor.torch.quantization.config import GPTQConfig, RTNWeightQuantConfig +from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig logger = Logger().get_logger() @@ -33,7 +33,7 @@ def get_default_tune_config() -> TuningConfig: # TODO use the registered default tuning config in the next PR - return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), 
RTNWeightQuantConfig(weight_bits=[4, 8])]) + return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])]) def autotune( diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py index d54393a21fc..c78a7b0552e 100644 --- a/neural_compressor/torch/quantization/__init__.py +++ b/neural_compressor/torch/quantization/__init__.py @@ -14,7 +14,7 @@ from neural_compressor.torch.quantization.quantize import quantize, quantize_dynamic from neural_compressor.torch.quantization.config import ( - RTNWeightQuantConfig, + RTNConfig, get_default_rtn_config, GPTQConfig, get_default_gptq_config, diff --git a/neural_compressor/torch/quantization/autotune.py b/neural_compressor/torch/quantization/autotune.py new file mode 100644 index 00000000000..73dda619d22 --- /dev/null +++ b/neural_compressor/torch/quantization/autotune.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Union + +import torch + +from neural_compressor.common.base_config import BaseConfig +from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning +from neural_compressor.common.logger import Logger +from neural_compressor.torch import quantize +from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig + +logger = Logger().get_logger() + + +__all__ = [ + "get_default_tune_config", + "autotune", +] + + +def get_default_tune_config() -> TuningConfig: + # TODO use the registered default tuning config in the next PR + return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])]) + + +def autotune( + model: torch.nn.Module, + tune_config: TuningConfig, + eval_fns: Optional[Union[Dict, List[Dict]]] = None, + run_fn=None, + run_args=None, +) -> Optional[torch.nn.Module]: + """The main entry of auto-tune.""" + best_quant_model = None + evaluator.set_eval_fn_registry(eval_fns) + evaluator.self_check() + config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) + tuning_logger.tuning_start() + for trial_index, quant_config in enumerate(config_loader): + tuning_logger.trial_start(trial_index=trial_index) + tuning_logger.quantization_start() + q_model = quantize(model, quant_config=quant_config, run_fn=run_fn, run_args=run_args) + tuning_logger.quantization_end() + tuning_logger.evaluation_start() + eval_result: float = evaluator.evaluate(q_model) + tuning_logger.evaluation_end() + tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) + if tuning_monitor.need_stop(): + best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() + quantize(model, quant_config=best_quant_config, run_fn=run_fn, run_args=run_args, inplace=True) + best_quant_model = model # quantize model inplace + tuning_logger.trial_end(trial_index) + tuning_logger.tuning_end() + return best_quant_model diff --git a/neural_compressor/torch/quantization/config.py 
b/neural_compressor/torch/quantization/config.py index 11dc239d050..07dc85db5e1 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -24,13 +24,7 @@ import torch from neural_compressor.common.base_config import BaseConfig, config_registry, register_config -from neural_compressor.common.utility import ( - DEFAULT_WHITE_LIST, - FP8_QUANT, - GPTQ, - OP_NAME_OR_MODULE_TYPE, - RTN_WEIGHT_ONLY_QUANT, -) +from neural_compressor.common.utility import DEFAULT_WHITE_LIST, FP8_QUANT, GPTQ, OP_NAME_OR_MODULE_TYPE, RTN from neural_compressor.torch.utils.constants import PRIORITY_GPTQ, PRIORITY_RTN from neural_compressor.torch.utils.utility import is_hpex_avaliable, logger @@ -60,8 +54,8 @@ class OperatorConfig(NamedTuple): ######################## RNT Config ############################### -@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN_WEIGHT_ONLY_QUANT, priority=PRIORITY_RTN) -class RTNWeightQuantConfig(BaseConfig): +@register_config(framework_name=FRAMEWORK_NAME, algo_name=RTN, priority=PRIORITY_RTN) +class RTNConfig(BaseConfig): """Config class for round-to-nearest weight-only quantization.""" supported_configs: List[OperatorConfig] = [] @@ -80,7 +74,7 @@ class RTNWeightQuantConfig(BaseConfig): "double_quant_sym", "double_quant_group_size", ] - name = RTN_WEIGHT_ONLY_QUANT + name = RTN def __init__( self, @@ -137,12 +131,12 @@ def to_dict(self): @classmethod def from_dict(cls, config_dict): - return super(RTNWeightQuantConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator) + return super(RTNConfig, cls).from_dict(config_dict=config_dict, str2operator=str2operator) @classmethod def register_supported_configs(cls) -> List[OperatorConfig]: supported_configs = [] - linear_rtn_config = RTNWeightQuantConfig( + linear_rtn_config = RTNConfig( weight_dtype=["int", "int8", "int4", "nf4", "fp4", "fp4_e2m1_bnb", "fp4_e2m1"], weight_bits=[4, 1, 2, 3, 5, 6, 7, 8], weight_group_size=[32, -1, 1, 4, 8, 16, 64, 128, 256, 512, 1024], @@ -173,16 +167,16 @@ def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: # TODO(Yi) run `register_supported_configs` for all registered config. -RTNWeightQuantConfig.register_supported_configs() +RTNConfig.register_supported_configs() -def get_default_rtn_config() -> RTNWeightQuantConfig: +def get_default_rtn_config() -> RTNConfig: """Generate the default rtn config. Returns: the default rtn config. """ - return RTNWeightQuantConfig() + return RTNConfig() ######################## GPTQ Config ############################### diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py index b1748d059eb..6abde03318d 100644 --- a/neural_compressor/torch/utils/utility.py +++ b/neural_compressor/torch/utils/utility.py @@ -33,7 +33,7 @@ def register_algo(name): Usage example: @register_algo(name=example_algo) - def example_algo(model: torch.nn.Module, quant_config: RTNWeightQuantConfig) -> torch.nn.Module: + def example_algo(model: torch.nn.Module, quant_config: RTNConfig) -> torch.nn.Module: ... 
Args: diff --git a/test/3x/torch/quantization/weight_only/test_rtn.py b/test/3x/torch/quantization/weight_only/test_rtn.py index 00159b55828..a53671c8bb7 100644 --- a/test/3x/torch/quantization/weight_only/test_rtn.py +++ b/test/3x/torch/quantization/weight_only/test_rtn.py @@ -60,7 +60,7 @@ def _apply_rtn(self, quant_config): return qmodel def test_rtn(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig # some tests were skipped to accelerate the CI rnt_options = { @@ -76,7 +76,7 @@ def test_rtn(self): } from itertools import product - keys = RTNWeightQuantConfig.params_list + keys = RTNConfig.params_list for value in product(*rnt_options.values()): d = dict(zip(keys, value)) if (d["weight_dtype"] == "int" and d["weight_bits"] != 8) or ( @@ -85,26 +85,26 @@ def test_rtn(self): or (d["return_int"] and (d["group_dim"] != 1 or d["weight_bits"] != 8)) ): continue - quant_config = RTNWeightQuantConfig(**d) + quant_config = RTNConfig(**d) self._apply_rtn(quant_config) def test_rtn_return_type(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig for return_int in [True, False]: - quant_config = RTNWeightQuantConfig(return_int=return_int) + quant_config = RTNConfig(return_int=return_int) qmodel = self._apply_rtn(quant_config) def test_rtn_mse_search(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - quant_config = RTNWeightQuantConfig(enable_mse_search=True) + quant_config = RTNConfig(enable_mse_search=True) qmodel = self._apply_rtn(quant_config) def test_rtn_recover(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - quant_config = RTNWeightQuantConfig(return_int=True) + quant_config = RTNConfig(return_int=True) qmodel = self._apply_rtn(quant_config) input = torch.randn(4, 8) # test forward diff --git a/test/3x/torch/test_autotune.py b/test/3x/torch/test_autotune.py index 7e67436e87c..326da168dc8 100644 --- a/test/3x/torch/test_autotune.py +++ b/test/3x/torch/test_autotune.py @@ -62,12 +62,12 @@ def setUp(self): def test_autotune_api(self): logger.info("test_autotune_api") from neural_compressor.common.base_tuning import evaluator - from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune + from neural_compressor.torch import RTNConfig, TuningConfig, autotune def eval_acc_fn(model) -> float: return 1.0 - custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2) + custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2) best_model = autotune( model=build_simple_torch_model(), tune_config=custom_tune_config, eval_fns=[{"eval_fn": eval_acc_fn}] ) @@ -78,7 +78,7 @@ def eval_acc_fn(model) -> float: def test_autotune_api_2(self): logger.info("test_autotune_api") from neural_compressor.common.base_tuning import evaluator - from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune + from neural_compressor.torch import RTNConfig, TuningConfig, autotune def eval_acc_fn(model) -> float: return 1.0 @@ -94,7 +94,7 @@ def eval_perf_fn(model) -> float: }, ] - custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2) + custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2) best_model = autotune(model=build_simple_torch_model(), 
tune_config=custom_tune_config, eval_fns=eval_fns) self.assertIsNotNone(best_model) self.assertEqual(len(evaluator.eval_fn_registry), 2) @@ -102,9 +102,9 @@ def eval_perf_fn(model) -> float: @reset_tuning_target def test_autotune_not_eval_func(self): logger.info("test_autotune_api") - from neural_compressor.torch import RTNWeightQuantConfig, TuningConfig, autotune + from neural_compressor.torch import RTNConfig, TuningConfig, autotune - custom_tune_config = TuningConfig(quant_configs=[RTNWeightQuantConfig(weight_bits=[4, 6])], max_trials=2) + custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2) # Use assertRaises to check that an AssertionError is raised with self.assertRaises(AssertionError) as context: diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py index 0e0925685c0..3004bd97c48 100644 --- a/test/3x/torch/test_config.py +++ b/test/3x/torch/test_config.py @@ -57,7 +57,7 @@ def test_quantize_rtn_from_dict_beginner(self): from neural_compressor.torch import quantize quant_config = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, @@ -68,20 +68,20 @@ def test_quantize_rtn_from_dict_beginner(self): self.assertIsNotNone(qmodel) def test_quantize_rtn_from_class_beginner(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32) + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4", weight_group_size=32) fp32_model = build_simple_torch_model() qmodel = quantize(fp32_model, quant_config) self.assertIsNotNone(qmodel) def test_quantize_rtndq_from_class_beginner(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - fp32_config = RTNWeightQuantConfig(weight_dtype="fp32") + fp32_config = RTNConfig(weight_dtype="fp32") fp32_model = copy.deepcopy(self.gptj) - quant_config = RTNWeightQuantConfig( + quant_config = RTNConfig( weight_bits=4, weight_dtype="int", weight_sym=False, @@ -96,7 +96,7 @@ def test_quantize_rtndq_from_class_beginner(self): from neural_compressor.torch.utils.utility import get_double_quant_config double_quant_config_dict = get_double_quant_config("GGML_TYPE_Q4_K", weight_sym=False) - quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict) + quant_config = RTNConfig.from_dict(double_quant_config_dict) quant_config.set_local("lm_head", fp32_config) qmodel = quantize(fp32_model, quant_config) out3 = qmodel(self.lm_input) @@ -104,7 +104,7 @@ def test_quantize_rtndq_from_class_beginner(self): fp32_model = copy.deepcopy(self.gptj) - quant_config = RTNWeightQuantConfig( + quant_config = RTNConfig( weight_bits=4, weight_dtype="nf4", weight_group_size=32, @@ -116,7 +116,7 @@ def test_quantize_rtndq_from_class_beginner(self): fp32_model = copy.deepcopy(self.gptj) # bitsandbytes double quant setting double_quant_config_dict = get_double_quant_config("BNB") - quant_config = RTNWeightQuantConfig.from_dict(double_quant_config_dict) + quant_config = RTNConfig.from_dict(double_quant_config_dict) quant_config.set_local("lm_head", fp32_config) qmodel = quantize(fp32_model, quant_config) out5 = qmodel(self.lm_input) @@ -127,7 +127,7 @@ def test_quantize_rtn_from_dict_advance(self): fp32_model = build_simple_torch_model() quant_config = { - "rtn_weight_only_quant": { + "RTN": { "global": { "weight_dtype": 
"nf4", "weight_bits": 4, @@ -145,11 +145,11 @@ def test_quantize_rtn_from_dict_advance(self): self.assertIsNotNone(qmodel) def test_quantize_rtn_from_class_advance(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8") + fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8") quant_config.set_local("model.fc1", fc1_config) # get model and quantize fp32_model = build_simple_torch_model() @@ -157,23 +157,23 @@ def test_quantize_rtn_from_class_advance(self): self.assertIsNotNone(qmodel) def test_config_white_lst(self): - from neural_compressor.torch import RTNWeightQuantConfig, quantize + from neural_compressor.torch import RTNConfig, quantize - global_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + global_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8", white_list=["model.fc1"]) + fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8", white_list=["model.fc1"]) # get model and quantize fp32_model = build_simple_torch_model() qmodel = quantize(fp32_model, quant_config=global_config + fc1_config) self.assertIsNotNone(qmodel) def test_config_white_lst2(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig from neural_compressor.torch.utils.utility import get_model_info - global_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + global_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=6, weight_dtype="int8", white_list=["fc1"]) + fc1_config = RTNConfig(weight_bits=6, weight_dtype="int8", white_list=["fc1"]) quant_config = global_config + fc1_config # get model and quantize fp32_model = build_simple_torch_model() @@ -185,10 +185,10 @@ def test_config_white_lst2(self): self.assertTrue(configs_mapping[("fc2", torch.nn.Linear)].weight_bits == 4) def test_config_from_dict(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig quant_config = { - "rtn_weight_only_quant": { + "RTN": { "global": { "weight_dtype": "nf4", "weight_bits": 4, @@ -202,32 +202,32 @@ def test_config_from_dict(self): }, } } - config = RTNWeightQuantConfig.from_dict(quant_config["rtn_weight_only_quant"]) + config = RTNConfig.from_dict(quant_config["RTN"]) self.assertIsNotNone(config.local_config) def test_config_to_dict(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") - fc1_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="int8") + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4") + fc1_config = RTNConfig(weight_bits=4, weight_dtype="int8") quant_config.set_local("model.fc1", fc1_config) config_dict = quant_config.to_dict() self.assertIn("global", config_dict) self.assertIn("local", config_dict) def test_same_type_configs_addition(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig quant_config1 = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, 
"weight_group_size": 32, }, } - q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"]) + q_config = RTNConfig.from_dict(quant_config1["RTN"]) quant_config2 = { - "rtn_weight_only_quant": { + "RTN": { "global": { "weight_bits": 8, "weight_group_size": 32, @@ -240,61 +240,59 @@ def test_same_type_configs_addition(self): }, } } - q_config2 = RTNWeightQuantConfig.from_dict(quant_config2["rtn_weight_only_quant"]) + q_config2 = RTNConfig.from_dict(quant_config2["RTN"]) q_config3 = q_config + q_config2 q3_dict = q_config3.to_dict() - for op_name, op_config in quant_config2["rtn_weight_only_quant"]["local"].items(): + for op_name, op_config in quant_config2["RTN"]["local"].items(): for attr, val in op_config.items(): self.assertEqual(q3_dict["local"][op_name][attr], val) - self.assertNotEqual( - q3_dict["global"]["weight_bits"], quant_config2["rtn_weight_only_quant"]["global"]["weight_bits"] - ) + self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["RTN"]["global"]["weight_bits"]) def test_diff_types_configs_addition(self): - from neural_compressor.torch import GPTQConfig, RTNWeightQuantConfig + from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"]) + q_config = RTNConfig.from_dict(quant_config1["RTN"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("rtn_weight_only_quant" in combined_config_d) + self.assertTrue("RTN" in combined_config_d) self.assertIn("gptq", combined_config_d) def test_composable_config_addition(self): - from neural_compressor.torch import GPTQConfig, RTNWeightQuantConfig + from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "rtn_weight_only_quant": { + "RTN": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNWeightQuantConfig.from_dict(quant_config1["rtn_weight_only_quant"]) + q_config = RTNConfig.from_dict(quant_config1["RTN"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("rtn_weight_only_quant" in combined_config_d) + self.assertTrue("RTN" in combined_config_d) self.assertIn("gptq", combined_config_d) combined_config2 = combined_config + d_config combined_config3 = combined_config + combined_config2 def test_config_mapping(self): - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig from neural_compressor.torch.utils.utility import get_model_info - quant_config = RTNWeightQuantConfig(weight_bits=4, weight_dtype="nf4") + quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4") # set operator instance - fc1_config = RTNWeightQuantConfig(weight_bits=6, weight_dtype="int8") + fc1_config = RTNConfig(weight_bits=6, weight_dtype="int8") quant_config.set_local("fc1", fc1_config) # get model and quantize fp32_model = build_simple_torch_model() @@ -305,7 +303,7 @@ def test_config_mapping(self): self.assertTrue(configs_mapping[("fc1", torch.nn.Linear)].weight_bits == 6) self.assertTrue(configs_mapping[("fc2", torch.nn.Linear)].weight_bits == 4) # test regular matching - fc_config = RTNWeightQuantConfig(weight_bits=5, weight_dtype="int8") + fc_config = 
RTNConfig(weight_bits=5, weight_dtype="int8") quant_config.set_local("fc", fc_config) configs_mapping = quant_config.to_config_mapping(model_info=model_info) logger.info(configs_mapping) @@ -327,10 +325,10 @@ def test_gptq_config(self): class TestQuantConfigForAutotune(unittest.TestCase): def test_expand_config(self): # test the expand functionalities, the user is not aware it - from neural_compressor.torch import RTNWeightQuantConfig + from neural_compressor.torch import RTNConfig - tune_config = RTNWeightQuantConfig(weight_bits=[4, 6]) - expand_config_list = RTNWeightQuantConfig.expand(tune_config) + tune_config = RTNConfig(weight_bits=[4, 6]) + expand_config_list = RTNConfig.expand(tune_config) self.assertEqual(expand_config_list[0].weight_bits, 4) self.assertEqual(expand_config_list[1].weight_bits, 6) From 0cbc8e6bc955074d7facc97fba9a88167f1b8156 Mon Sep 17 00:00:00 2001 From: xin3he Date: Thu, 18 Jan 2024 16:58:10 +0800 Subject: [PATCH 2/3] fix name bug Signed-off-by: xin3he --- test/3x/torch/test_config.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/3x/torch/test_config.py b/test/3x/torch/test_config.py index 3004bd97c48..642f6de9cf5 100644 --- a/test/3x/torch/test_config.py +++ b/test/3x/torch/test_config.py @@ -57,7 +57,7 @@ def test_quantize_rtn_from_dict_beginner(self): from neural_compressor.torch import quantize quant_config = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, @@ -127,7 +127,7 @@ def test_quantize_rtn_from_dict_advance(self): fp32_model = build_simple_torch_model() quant_config = { - "RTN": { + "rtn": { "global": { "weight_dtype": "nf4", "weight_bits": 4, @@ -188,7 +188,7 @@ def test_config_from_dict(self): from neural_compressor.torch import RTNConfig quant_config = { - "RTN": { + "rtn": { "global": { "weight_dtype": "nf4", "weight_bits": 4, @@ -202,7 +202,7 @@ def test_config_from_dict(self): }, } } - config = RTNConfig.from_dict(quant_config["RTN"]) + config = RTNConfig.from_dict(quant_config["rtn"]) self.assertIsNotNone(config.local_config) def test_config_to_dict(self): @@ -219,15 +219,15 @@ def test_same_type_configs_addition(self): from neural_compressor.torch import RTNConfig quant_config1 = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNConfig.from_dict(quant_config1["RTN"]) + q_config = RTNConfig.from_dict(quant_config1["rtn"]) quant_config2 = { - "RTN": { + "rtn": { "global": { "weight_bits": 8, "weight_group_size": 32, @@ -240,48 +240,48 @@ def test_same_type_configs_addition(self): }, } } - q_config2 = RTNConfig.from_dict(quant_config2["RTN"]) + q_config2 = RTNConfig.from_dict(quant_config2["rtn"]) q_config3 = q_config + q_config2 q3_dict = q_config3.to_dict() - for op_name, op_config in quant_config2["RTN"]["local"].items(): + for op_name, op_config in quant_config2["rtn"]["local"].items(): for attr, val in op_config.items(): self.assertEqual(q3_dict["local"][op_name][attr], val) - self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["RTN"]["global"]["weight_bits"]) + self.assertNotEqual(q3_dict["global"]["weight_bits"], quant_config2["rtn"]["global"]["weight_bits"]) def test_diff_types_configs_addition(self): from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNConfig.from_dict(quant_config1["RTN"]) + q_config = 
RTNConfig.from_dict(quant_config1["rtn"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("RTN" in combined_config_d) + self.assertTrue("rtn" in combined_config_d) self.assertIn("gptq", combined_config_d) def test_composable_config_addition(self): from neural_compressor.torch import GPTQConfig, RTNConfig quant_config1 = { - "RTN": { + "rtn": { "weight_dtype": "nf4", "weight_bits": 4, "weight_group_size": 32, }, } - q_config = RTNConfig.from_dict(quant_config1["RTN"]) + q_config = RTNConfig.from_dict(quant_config1["rtn"]) d_config = GPTQConfig(double_quant_bits=4) combined_config = q_config + d_config combined_config_d = combined_config.to_dict() logger.info(combined_config) - self.assertTrue("RTN" in combined_config_d) + self.assertTrue("rtn" in combined_config_d) self.assertIn("gptq", combined_config_d) combined_config2 = combined_config + d_config combined_config3 = combined_config + combined_config2 From 876e8960e4113c2de45992eff3ae63c4f3fd4d17 Mon Sep 17 00:00:00 2001 From: xin3he Date: Thu, 18 Jan 2024 17:00:43 +0800 Subject: [PATCH 3/3] change autotune path Signed-off-by: xin3he --- neural_compressor/torch/__init__.py | 2 +- neural_compressor/torch/autotune.py | 67 ----------------------------- 2 files changed, 1 insertion(+), 68 deletions(-) delete mode 100644 neural_compressor/torch/autotune.py diff --git a/neural_compressor/torch/__init__.py b/neural_compressor/torch/__init__.py index 46f4082de67..81f131ca114 100644 --- a/neural_compressor/torch/__init__.py +++ b/neural_compressor/torch/__init__.py @@ -24,4 +24,4 @@ ) from neural_compressor.common.base_tuning import TuningConfig -from neural_compressor.torch.autotune import autotune, get_default_tune_config +from neural_compressor.torch.quantization.autotune import autotune, get_default_tune_config diff --git a/neural_compressor/torch/autotune.py b/neural_compressor/torch/autotune.py deleted file mode 100644 index 73dda619d22..00000000000 --- a/neural_compressor/torch/autotune.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright (c) 2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, List, Optional, Union - -import torch - -from neural_compressor.common.base_config import BaseConfig -from neural_compressor.common.base_tuning import TuningConfig, evaluator, init_tuning -from neural_compressor.common.logger import Logger -from neural_compressor.torch import quantize -from neural_compressor.torch.quantization.config import GPTQConfig, RTNConfig - -logger = Logger().get_logger() - - -__all__ = [ - "get_default_tune_config", - "autotune", -] - - -def get_default_tune_config() -> TuningConfig: - # TODO use the registered default tuning config in the next PR - return TuningConfig(quant_configs=[GPTQConfig(weight_bits=[4, 8]), RTNConfig(weight_bits=[4, 8])]) - - -def autotune( - model: torch.nn.Module, - tune_config: TuningConfig, - eval_fns: Optional[Union[Dict, List[Dict]]] = None, - run_fn=None, - run_args=None, -) -> Optional[torch.nn.Module]: - """The main entry of auto-tune.""" - best_quant_model = None - evaluator.set_eval_fn_registry(eval_fns) - evaluator.self_check() - config_loader, tuning_logger, tuning_monitor = init_tuning(tuning_config=tune_config) - tuning_logger.tuning_start() - for trial_index, quant_config in enumerate(config_loader): - tuning_logger.trial_start(trial_index=trial_index) - tuning_logger.quantization_start() - q_model = quantize(model, quant_config=quant_config, run_fn=run_fn, run_args=run_args) - tuning_logger.quantization_end() - tuning_logger.evaluation_start() - eval_result: float = evaluator.evaluate(q_model) - tuning_logger.evaluation_end() - tuning_monitor.add_trial_result(trial_index, eval_result, quant_config) - if tuning_monitor.need_stop(): - best_quant_config: BaseConfig = tuning_monitor.get_best_quant_config() - quantize(model, quant_config=best_quant_config, run_fn=run_fn, run_args=run_args, inplace=True) - best_quant_model = model # quantize model inplace - tuning_logger.trial_end(trial_index) - tuning_logger.tuning_end() - return best_quant_model
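
A minimal usage sketch of the renamed API after this series. The toy model and
its layer sizes are hypothetical stand-ins for build_simple_torch_model() from
the tests; the RTNConfig arguments, the set_local() per-operator override, and
the quantize() call mirror test/3x/torch/test_config.py:

    import torch
    from neural_compressor.torch import RTNConfig, quantize

    # Hypothetical two-layer model standing in for build_simple_torch_model().
    class ToyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc1 = torch.nn.Linear(32, 16)
            self.fc2 = torch.nn.Linear(16, 8)

        def forward(self, x):
            return self.fc2(self.fc1(x))

    # Global 4-bit NF4 round-to-nearest config, plus a per-operator override
    # for fc1, as exercised by test_config_mapping.
    quant_config = RTNConfig(weight_bits=4, weight_dtype="nf4")
    quant_config.set_local("fc1", RTNConfig(weight_bits=6, weight_dtype="int8"))
    q_model = quantize(ToyModel(), quant_config)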
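
A similar sketch for the relocated autotune entry point (after PATCH 3/3 it
lives in neural_compressor/torch/quantization/autotune.py and is re-exported
from neural_compressor.torch). The constant eval function is a placeholder;
the eval_fns format and TuningConfig arguments follow
test/3x/torch/test_autotune.py:

    from neural_compressor.torch import RTNConfig, TuningConfig, autotune

    def eval_acc_fn(model) -> float:
        # Placeholder metric; a real harness would score a validation set.
        return 1.0

    # Search 4- and 6-bit RTN weight quantization, stopping after two trials.
    custom_tune_config = TuningConfig(quant_configs=[RTNConfig(weight_bits=[4, 6])], max_trials=2)
    best_model = autotune(
        model=ToyModel(),  # the hypothetical toy model from the sketch above
        tune_config=custom_tune_config,
        eval_fns=[{"eval_fn": eval_acc_fn}],
    )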