Add Google style docstrings to HQQ files #2

Closed · wants to merge 1 commit into from
1 change: 1 addition & 0 deletions .azure-pipelines/scripts/codeScan/pydocstyle/scan_path.txt
@@ -15,3 +15,4 @@
/neural-compressor/neural_compressor/strategy
/neural-compressor/neural_compressor/training.py
/neural-compressor/neural_compressor/utils
/neural-compressor/neural_compressor/torch/algorithms/weight_only/hqq
47 changes: 47 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/hqq/__init__.py
@@ -14,3 +14,50 @@

from .quantizer import HQQuantizer
from .config import HQQModuleConfig, QTensorConfig

class HQQuantizer:

Codacy Static Code Analysis, notice on line 18: Too few public methods (0/2)
Codacy Static Code Analysis, failure on line 18: class already defined line 15
"""
A class for quantizing models using the HQQ algorithm.

Attributes:
quant_config (ConfigMappingType): Configuration for quantization.

Methods:
prepare(model: torch.nn.Module, *args, **kwargs) -> Optional[torch.nn.Module]:
Prepares a given model for quantization.
convert(model: torch.nn.Module, *args, **kwargs) -> Optional[torch.nn.Module]:
Converts a prepared model to a quantized model.
save(model, path):
Saves the quantized model to the specified path.
"""

class HQQModuleConfig:

Codacy Static Code Analysis, notice on line 34: Too few public methods (0/2)
Codacy Static Code Analysis, failure on line 34: class already defined line 16
"""
Configuration for HQQ modules.

Attributes:
weight (QTensorConfig): Configuration for weight quantization.
scale (QTensorConfig): Configuration for scale quantization.
zero (QTensorConfig): Configuration for zero quantization.

Methods:
__repr__() -> str:
Returns a string representation of the HQQModuleConfig object.
"""

class QTensorConfig:

Codacy Static Code Analysis, notice on line 48: Too few public methods (0/2)
Codacy Static Code Analysis, failure on line 48: class already defined line 16
"""
Configuration for quantized tensors.

Attributes:
nbits (int): Number of bits for quantization.
channel_wise (bool): Whether to use channel-wise quantization.
group_size (int): Size of the quantization group.
optimize (bool): Whether to optimize the quantization.
round_zero (Optional[bool]): Whether to round zero.
pack (bool): Whether to pack the quantized tensor.

Methods:
__repr__() -> str:
Returns a string representation of the QTensorConfig object.
"""
126 changes: 123 additions & 3 deletions neural_compressor/torch/algorithms/weight_only/hqq/bitpack.py
@@ -30,27 +30,79 @@

# Bit packing logic. format: pack/unpack_nBits_target-<uint8 or int32>
class BitPack:
"""
A class for bit packing logic.

This class provides static methods for packing and unpacking tensors
with different bit-widths.

Methods:
pack_8bit_u8(W_q): Packs an 8-bit tensor to uint8.
unpack_8bit_u8(W_q): Unpacks an 8-bit tensor from uint8.
pack_4bit_u8(W_q): Packs a 4-bit tensor to uint8.
unpack_4bit_u8(W_q): Unpacks a 4-bit tensor from uint8.
pack_2bit_u8(W_q): Packs a 2-bit tensor to uint8.
unpack_2bit_u8(W_q): Unpacks a 2-bit tensor from uint8.
pack_3bit_32(W_q_in): Packs a 3-bit tensor to int32.
unpack_3bit_32(W_q): Unpacks a 3-bit tensor from int32.
"""

# 8-bit
################################################
@staticmethod
def pack_8bit_u8(W_q):
"""
Packs an 8-bit tensor to uint8.

Args:
W_q (torch.Tensor): The tensor to be packed.

Returns:
torch.Tensor: The packed tensor.
"""
return W_q.to(torch.uint8)

@staticmethod
def unpack_8bit_u8(W_q):
"""
Unpacks an 8-bit tensor from uint8.

Args:
W_q (torch.Tensor): The tensor to be unpacked.

Returns:
torch.Tensor: The unpacked tensor.
"""
return W_q

# 4-bit
################################################
@staticmethod
def pack_4bit_u8(W_q): # uint8 > uint8/2
"""
Packs a 4-bit tensor to uint8.
Review comment (Owner Author): Start after """

Args:
W_q (torch.Tensor): The tensor to be packed.

Returns:
torch.Tensor: The packed tensor.
"""
W_q = W_q.to(torch.uint8)
_step = int(len(W_q) / 2)
return (W_q[:_step] << 4) | W_q[_step:]

# A bit faster than the _cat version
@staticmethod
def unpack_4bit_u8(W_q): # uint8/2 > uint8
"""
Unpacks a 4-bit tensor from uint8.

Args:
W_q (torch.Tensor): The tensor to be unpacked.

Returns:
torch.Tensor: The unpacked tensor.
"""
_step = W_q.shape[0]
tmp = torch.empty([2 * _step, W_q.shape[1]], dtype=torch.uint8, device=W_q.device)
tmp[:_step] = (W_q & 0b11110000) >> 4
@@ -61,13 +113,30 @@ def unpack_4bit_u8(W_q): # uint8/2 > uint8
################################################
@staticmethod
def pack_2bit_u8(W_q): # uint8 > uint8/4
"""
Packs a 2-bit tensor to uint8.

Args:
W_q (torch.Tensor): The tensor to be packed.

Returns:
torch.Tensor: The packed tensor.
"""
W_q = W_q.to(torch.uint8)
_step = int(len(W_q) / 4)
return W_q[:_step] << 6 | W_q[_step : 2 * _step] << 4 | W_q[2 * _step : 3 * _step] << 2 | W_q[3 * _step :]

# A bit faster than the _cat version
@staticmethod
def unpack_2bit_u8(W_q):
"""
Unpacks a 2-bit tensor from uint8.

Args:
W_q (torch.Tensor): The tensor to be unpacked.

Returns:
torch.Tensor: The unpacked tensor.
"""
_step = W_q.shape[0]
tmp = torch.empty([4 * _step, W_q.shape[1]], dtype=torch.uint8, device=W_q.device)
tmp[:_step] = (W_q & 0b11000000) >> 6
@@ -80,6 +149,15 @@ def unpack_2bit_u8(W_q)
################################################
@staticmethod
def pack_3bit_32(W_q_in):
"""
Packs a 3-bit tensor to int32.

Args:
W_q_in (torch.Tensor): The tensor to be packed.

Returns:
torch.Tensor: The packed tensor.
"""
W_q = torch.zeros(
[int(10 * np.ceil(W_q_in.shape[0] / 10.0)), W_q_in.shape[1]], device=W_q_in.device, dtype=torch.int32
)
@@ -99,9 +177,17 @@ def pack_3bit_32(W_q_in):
)
return W_q

# A bit faster than _cat version
@staticmethod
def unpack_3bit_32(W_q):
"""
Unpacks a 3-bit tensor from int32.

Args:
W_q (torch.Tensor): The tensor to be unpacked.

Returns:
torch.Tensor: The unpacked tensor.
"""
_step = W_q.shape[0]
tmp = torch.empty([10 * _step, W_q.shape[1]], dtype=torch.uint8, device=W_q.device)
tmp[:_step] = (W_q & 0b00111000000000000000000000000000) >> 27
@@ -118,6 +204,22 @@ def unpack_3bit_32(W_q):


class Packer:
"""
A class for managing bit packing functions.

This class provides methods to get the appropriate packing and unpacking
functions based on the number of bits.

Attributes:
bit_to_packing (dict): A mapping from bit-width to packing format.
pack_fn_mapping (dict): A mapping from packing format to packing function.
unpack_fn_mapping (dict): A mapping from packing format to unpacking function.

Methods:
get_pack_fn(nbits): Returns the packing function for the given bit-width.
get_unpack_fn(nbits): Returns the unpacking function for the given bit-width.
"""

# TODO: Refine the packer
bit_to_packing = {8: "8bit_u8", 4: "4bit_u8", 3: "3bit_32", 2: "2bit_u8"}

@@ -137,8 +239,26 @@ class Packer:

@staticmethod
def get_pack_fn(nbits: int):
"""
Returns the packing function for the given bit-width.

Args:
nbits (int): The bit-width.

Returns:
function: The packing function.
"""
return Packer.pack_fn_mapping[Packer.bit_to_packing[nbits]]

@staticmethod
def get_unpack_fn(nbits: int):
"""
Returns the unpacking function for the given bit-width.

Args:
nbits (int): The bit-width.

Returns:
function: The unpacking function.
"""
return Packer.unpack_fn_mapping[Packer.bit_to_packing[nbits]]
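
As a quick illustration of the documented packing helpers, here is a round-trip sketch (not part of this diff) that uses Packer to resolve the 4-bit pack/unpack functions. The import path mirrors the file location above; the even row count is required because pack_4bit_u8 splits the tensor into two halves.

import torch
from neural_compressor.torch.algorithms.weight_only.hqq.bitpack import Packer

# Toy 4-bit weight tensor: values fit in 4 bits and the row count is even.
W_q = torch.randint(0, 16, (4, 8), dtype=torch.uint8)

pack_fn = Packer.get_pack_fn(4)      # resolves to BitPack.pack_4bit_u8
unpack_fn = Packer.get_unpack_fn(4)  # resolves to BitPack.unpack_4bit_u8

packed = pack_fn(W_q)                # shape (2, 8): two 4-bit values per uint8
restored = unpack_fn(packed)         # shape (4, 8) again

assert torch.equal(W_q, restored)    # lossless for values in [0, 15]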
25 changes: 25 additions & 0 deletions neural_compressor/torch/algorithms/weight_only/hqq/config.py
@@ -33,6 +33,12 @@


class HQQGlobalOptions:
"""
Global options for HQQ.

Attributes:
use_half (bool): Whether to use half precision.
"""
use_half = os.getenv("HQQ_NOT_USE_HALF", "0") == "0"


@@ -41,6 +47,17 @@ class HQQGlobalOptions:

@dataclass
class QTensorConfig:
"""
Configuration for quantized tensors.

Attributes:
nbits (int): Number of bits for quantization.
channel_wise (bool): Whether to use channel-wise quantization.
group_size (int): Size of the quantization group.
optimize (bool): Whether to optimize the quantization.
round_zero (Optional[bool]): Whether to round zero.
pack (bool): Whether to pack the quantized tensor.
"""
nbits: int
channel_wise: bool = True
group_size: int = 128
@@ -67,6 +84,14 @@ class HQQModuleConfig(
["weight", "scale", "zero"],
)
):
"""
Configuration for HQQ modules.

Attributes:
weight (QTensorConfig): Configuration for weight quantization.
scale (QTensorConfig): Configuration for scale quantization.
zero (QTensorConfig): Configuration for zero quantization.
"""
def __new__(
cls,
weight=default_weight_quant_config,
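
To round out the config.py changes, a small sketch (not part of this diff) of how the documented dataclass and named tuple compose. The group_size value here is arbitrary, and fields left unspecified fall back to the library defaults shown above.

from neural_compressor.torch.algorithms.weight_only.hqq.config import HQQModuleConfig, QTensorConfig

# 4-bit, group-wise weight quantization; scale and zero keep their default configs.
weight_cfg = QTensorConfig(nbits=4, channel_wise=True, group_size=64)
module_cfg = HQQModuleConfig(weight=weight_cfg)

print(module_cfg)  # __repr__ summarizes the weight/scale/zero configurations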