Skip to content

Commit

Permalink
Add a warning when the exllamav2 kernel is not installed
Browse files Browse the repository at this point in the history
  • Loading branch information
wenhuach21 committed Nov 14, 2024
1 parent 8918df2 commit 5724c90
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
14 changes: 10 additions & 4 deletions auto_round/auto_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ def is_auto_round_available():
)



#
def get_device(obj: Union[torch.Tensor, nn.Module]):
if isinstance(obj, torch.Tensor):
Expand Down Expand Up @@ -535,6 +534,13 @@ def remove_str(input_string: str, sub_str) -> str:
layer_backend = get_layer_backend(
target_device, target_backend, orig_backend, bits, group_size, sym, in_features, out_features
)
if "gptq" in layer_backend and "exllamav2" in layer_backend:
try:
from exllamav2_kernels import gemm_half_q_half, make_q_matrix
except:
logger.warning_once(
"For better inference performance, please install exllamav2 kernel "
"via `pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")

QuantLinear = dynamic_import_inference_linear(layer_backend, bits, group_size, sym)

Expand Down Expand Up @@ -575,9 +581,9 @@ def remove_str(input_string: str, sub_str) -> str:
def cpu_post_init(self, model):
dep_check = True
message = "Repacking to CPU format"
layers = [] ## ipex post_init will add one more layer
for n,m in model.named_modules():
layers.append((n,m))
layers = [] ## ipex post_init will add one more layer
for n, m in model.named_modules():
layers.append((n, m))

for n, layer in tqdm(layers, desc=message, total=len(layers),
leave=True):
Expand Down
2 changes: 1 addition & 1 deletion auto_round/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ def get_autogptq_packing_qlinear(backend, bits=4, group_size=128, sym=False):
from auto_gptq.utils.import_utils import dynamically_import_QuantLinear # pylint: disable=E0401
version = get_library_version("auto_gptq")
from packaging.version import Version
if Version(version) <= Version("0.7.1"):
if Version(version) < Version("0.7.2"):
QuantLinear = dynamically_import_QuantLinear(
use_triton=use_triton,
desc_act=False,
Expand Down

0 comments on commit 5724c90

Please sign in to comment.