Commit 0326109

add QUANT_TYPE in qlinear (#736)
* add QUANT_TYPE in qlinear

Signed-off-by: jiqing-feng <[email protected]>

* add comments

Signed-off-by: jiqing-feng <[email protected]>

---------

Signed-off-by: jiqing-feng <[email protected]>

jiqing-feng authored Dec 4, 2024
1 parent 1995602 commit 0326109

Showing 8 changed files with 16 additions and 0 deletions.
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_bitblas.py
@@ -92,6 +92,8 @@ class BitBLASQuantLinear(BaseQuantLinear):
         torch.half: "float16",
         torch.int8: "int8",
     }
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "bitblas"
 
     def __init__(
         self,
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_cuda.py
@@ -23,6 +23,8 @@
 class CudaQuantLinear(TorchQuantLinear):
     SUPPORTS_BITS = [2, 3, 4, 8]
     SUPPORTS_DEVICES = [DEVICE.CUDA]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "cuda"
 
     def __init__(
         self,
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_exllama.py
@@ -45,6 +45,8 @@ class ExllamaQuantLinear(BaseQuantLinear):
     SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32]
     SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32]
     SUPPORTS_DEVICES = [DEVICE.CUDA]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "exllama"
 
     """Linear layer implementation with per-group 4-bit quantization of the weights"""
 
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_exllamav2.py
@@ -107,6 +107,8 @@ class ExllamaV2QuantLinear(BaseQuantLinear):
     SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32]
     SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32]
     SUPPORTS_DEVICES = [DEVICE.CUDA]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "exllamav2"
 
     """Linear layer implementation with per-group 4-bit quantization of the weights"""
 
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_ipex.py
@@ -51,6 +51,8 @@ def convert_dtype_torch2str(dtype):
 class IPEXQuantLinear(BaseQuantLinear):
     SUPPORTS_BITS = [4]
     SUPPORTS_DEVICES = [DEVICE.CPU, DEVICE.XPU]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "ipex"
 
     def __init__(
         self,
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_marlin.py
@@ -145,6 +145,8 @@ class MarlinQuantLinear(BaseQuantLinear):
     SUPPORTS_SYM = [True]
     SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [64]
     SUPPORTS_DEVICES = [DEVICE.CUDA]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "marlin"
 
     def __init__(self, bits: int, group_size: int, desc_act: bool, sym: bool, infeatures: int, outfeatures: int,
                  bias: bool, **kwargs):
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_torch.py
@@ -15,6 +15,8 @@
 class TorchQuantLinear(BaseQuantLinear):
     SUPPORTS_BITS = [2, 3, 4, 8]
     SUPPORTS_DEVICES = [DEVICE.CPU, DEVICE.XPU, DEVICE.CUDA]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "torch"
 
     def __init__(
         self,
2 changes: 2 additions & 0 deletions gptqmodel/nn_modules/qlinear/qlinear_tritonv2.py
@@ -31,6 +31,8 @@ class TritonV2QuantLinear(BaseQuantLinear, TritonModuleMixin):
     SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32]
     SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32]
     SUPPORTS_DEVICES = [DEVICE.CUDA]
+    # for transformers/optimum tests compat
+    QUANT_TYPE = "tritonv2"
 
     """
     Triton v2 quantized linear layer.
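
For context, QUANT_TYPE is a plain class-level string, so external code can identify which kernel backend a QuantLinear class represents without constructing a layer. The snippet below is a minimal illustrative sketch of that kind of check, not code taken from the transformers/optimum test suites; it assumes gptqmodel is installed and uses only the import path and value shown in the qlinear_torch.py diff above.

# Illustrative sketch only (not the actual transformers/optimum tests):
# QUANT_TYPE is a class attribute, so a backend can be identified without
# constructing a layer. TorchQuantLinear is used here as the plain PyTorch
# backend; the other classes touched by this commit expose the same
# attribute ("bitblas", "cuda", "exllama", "exllamav2", "ipex", "marlin",
# "tritonv2").
from gptqmodel.nn_modules.qlinear.qlinear_torch import TorchQuantLinear

assert TorchQuantLinear.QUANT_TYPE == "torch"
print(f"backend identifier: {TorchQuantLinear.QUANT_TYPE}")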
