diff --git a/src/compressed_tensors/quantization/lifecycle/apply.py b/src/compressed_tensors/quantization/lifecycle/apply.py
index 79aac4e9b..1c3faf07b 100644
--- a/src/compressed_tensors/quantization/lifecycle/apply.py
+++ b/src/compressed_tensors/quantization/lifecycle/apply.py
@@ -14,7 +14,7 @@
 
 import logging
 import re
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from copy import deepcopy
 from typing import Dict, Iterable, List, Optional
 from typing import OrderedDict as OrderedDictType
@@ -125,13 +125,14 @@ def apply_quantization_config(model: Module, config: QuantizationConfig) -> Dict
         target_to_scheme[target] = scheme
 
     # list of submodules to ignore
-    ignored_submodules = []
+    ignored_submodules = defaultdict(list)
     # mark appropriate layers for quantization by setting their quantization schemes
     for name, submodule in iter_named_leaf_modules(model):
         # potentially fix module name to remove FSDP wrapper prefix
         name = fix_fsdp_module_name(name)
-        if find_name_or_class_matches(name, submodule, config.ignore):
-            ignored_submodules.append(name)
+        if matches := find_name_or_class_matches(name, submodule, config.ignore):
+            for match in matches:
+                ignored_submodules[match].append(name)
             continue  # layer matches ignore list, continue
         targets = find_name_or_class_matches(name, submodule, target_to_scheme)
         if targets:
diff --git a/src/compressed_tensors/quantization/lifecycle/calibration.py b/src/compressed_tensors/quantization/lifecycle/calibration.py
index b1fe2126a..cef81adbe 100644
--- a/src/compressed_tensors/quantization/lifecycle/calibration.py
+++ b/src/compressed_tensors/quantization/lifecycle/calibration.py
@@ -36,8 +36,8 @@ def set_module_for_calibration(module: Module, quantize_weights_upfront: bool =
     apply to full model with `model.apply(set_module_for_calibration)`
 
     :param module: module to set for calibration
-    :param quantize_weights_upfront: whether to automatically run weight quantization at the
-        start of calibration
+    :param quantize_weights_upfront: whether to automatically
+        run weight quantization at the start of calibration
     """
     if not getattr(module, "quantization_scheme", None):
         # no quantization scheme nothing to do
diff --git a/tests/test_quantization/lifecycle/test_apply.py b/tests/test_quantization/lifecycle/test_apply.py
index 558b9ee1f..42efe4420 100644
--- a/tests/test_quantization/lifecycle/test_apply.py
+++ b/tests/test_quantization/lifecycle/test_apply.py
@@ -15,6 +15,7 @@
 import re
 from typing import Optional
 
+import pytest
 import torch
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import (
@@ -223,3 +224,53 @@ def get_sample_tinyllama_quant_config(status: str = "frozen"):
         "ignore": ["LlamaRotaryEmbedding", "model.layers.1.mlp.down_proj"],
     }
     return QuantizationConfig.parse_obj(config_dict)
+
+
+@pytest.mark.parametrize(
+    "ignore,should_raise_warning",
+    [
+        [("lm_head", "re:.*gate"), False],
+        [("lm_head", "re:.*foobarbaz"), True],
+    ],
+)
+def test_apply_quantization_status(caplog, ignore, should_raise_warning):
+    import logging
+
+    from transformers import AutoModelForCausalLM
+
+    # load a dense, unquantized tiny llama model
+    device = "cuda:0"
+    model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name, device_map=device, torch_dtype="auto"
+    )
+
+    quantization_config_dict = {
+        "quant_method": "sparseml",
+        "format": "pack-quantized",
+        "global_compression_ratio": None,
+        "config_groups": {
+            "group_1": {
+                "weights": {
+                    "num_bits": 4,
+                    "type": "int",
+                    "symmetric": False,
+                    "strategy": "tensor",
+                },
+                "targets": ["Linear"],
+            }
+        },
+    }
+    quantization_config_dict["ignore"] = ignore
+
+    config = QuantizationConfig(**quantization_config_dict)
+    config.quantization_status = QuantizationStatus.CALIBRATION
+
+    if should_raise_warning:
+        # an `ignore` entry that matches no module should emit a warning
+        with caplog.at_level(logging.WARNING):
+            apply_quantization_config(model, config)
+            assert len(caplog.text) > 0
+    else:
+        apply_quantization_config(model, config)
+        assert len(caplog.text) == 0