Fix weight_only algorithms import (#1742)
Signed-off-by: Kaihui-intel <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Kaihui-intel and pre-commit-ci[bot] authored Apr 23, 2024
1 parent 0ba5732 commit e87c95f
Showing 7 changed files with 31 additions and 34 deletions.
@@ -9,5 +9,5 @@ wandb
 einops
 neural-compressor
 intel-extension-for-transformers
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@cc9778fbe4fa1a709be2abed9deb6180fd40e7e2
+lm-eval
 peft
@@ -50,8 +50,7 @@
                     help="Pad input ids to max length.")
 parser.add_argument("--calib_iters", default=512, type=int,
                     help="calibration iters.")
-parser.add_argument("--tasks", nargs='+', default=["lambada_openai",
-                    "hellaswag", "winogrande", "piqa", "wikitext"],
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                     type=str, help="tasks list for accuracy validation")
 parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
 # ============SmoothQuant configs==============
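
Note: --tasks is now a single comma-separated string rather than an nargs='+' list, so task names on the command line change from space-separated to comma-separated. A minimal sketch of the new parsing behavior (the parse_args input is illustrative, not from this repo):

    import argparse

    # old CLI: --tasks lambada_openai hellaswag
    # new CLI: --tasks lambada_openai,hellaswag
    parser = argparse.ArgumentParser()
    parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
                        type=str, help="tasks list for accuracy validation")
    args = parser.parse_args(["--tasks", "lambada_openai,piqa"])
    assert args.tasks.split(",") == ["lambada_openai", "piqa"]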
@@ -390,24 +389,27 @@ def run_fn(model):
 
 if args.accuracy:
     user_model.eval()
-    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate
-
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
+        device="cpu",
     )
+    results = evaluate(eval_args)
 
     dumped = json.dumps(results, indent=2)
     if args.save_accuracy_path:
         with open(args.save_accuracy_path, "w") as f:
             f.write(dumped)
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
     print("Accuracy: %.5f" % acc)
     print('Batch size = %d' % args.batch_size)
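
The new "<metric>,none" keys follow lm-eval 0.4.x, which reports each metric under a "<metric>,<filter>" name, with "none" as the default filter. A minimal sketch of reading the results dict under that assumption (the numbers are made up for illustration):

    results = {"results": {"piqa": {"acc,none": 0.79},
                           "wikitext": {"word_perplexity,none": 12.3}}}
    for task_name in "piqa,wikitext".split(","):
        metric = "word_perplexity,none" if task_name == "wikitext" else "acc,none"
        print("%s: %.5f" % (task_name, results["results"][task_name][metric]))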

@@ -417,21 +419,25 @@ def run_fn(model):
     import time
 
     samples = args.iters * args.batch_size
-    start = time.time()
-    results = evaluate(
-        model="hf-causal",
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
         model_args='pretrained=' + args.model + ',tokenizer=' + args.model + ',dtype=float32',
         user_model=user_model,
+        tokenizer=tokenizer,
         batch_size=args.batch_size,
         tasks=args.tasks,
         limit=samples,
+        device="cpu",
     )
+    start = time.time()
+    results = evaluate(eval_args)
     end = time.time()
-    for task_name in args.tasks:
+    for task_name in args.tasks.split(","):
         if task_name == "wikitext":
-            acc = results["results"][task_name]["word_perplexity"]
+            acc = results["results"][task_name]["word_perplexity,none"]
         else:
-            acc = results["results"][task_name]["acc"]
+            acc = results["results"][task_name]["acc,none"]
     print("Accuracy: %.5f" % acc)
     print('Throughput: %.3f samples/sec' % (samples / (end - start)))
     print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
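
For reference, the throughput and latency prints above are just inverse rates; a tiny sketch with illustrative numbers (not measurements):

    samples = 512 * 8        # args.iters * args.batch_size (illustrative)
    elapsed = 2048.0         # end - start, in seconds (illustrative)
    print('Throughput: %.3f samples/sec' % (samples / elapsed))  # 2.000
    print('Latency: %.3f ms' % (elapsed * 1000 / samples))       # 500.000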
neural_compressor/torch/algorithms/weight_only/__init__.py (0 additions & 9 deletions)

@@ -11,12 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .rtn import rtn_quantize
-from .gptq import gptq_quantize
-from .awq import awq_quantize
-from .teq import teq_quantize
-from .autoround import autoround_quantize
-from .hqq import hqq_quantize
-from .modules import WeightOnlyLinear
-from .utility import *
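
With these re-exports removed, callers must import each symbol from its defining submodule, which is exactly the migration the remaining files in this commit perform. A minimal sketch of the before/after import style (paths taken from this diff):

    # old style, broken once the package-level re-exports are gone:
    #   from neural_compressor.torch.algorithms.weight_only import rtn_quantize, WeightOnlyLinear
    # new style, importing from the defining submodules:
    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize
    from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
    from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING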
neural_compressor/torch/algorithms/weight_only/modules.py (1 addition & 1 deletion)

@@ -69,7 +69,7 @@ def __init__(
             bits = self.dtype.lstrip("int")
             self.dtype = "int"
         if "int" not in self.dtype:  # for nf4, fp4
-            from neural_compressor.torch.algorithms.weight_only import FLOAT_MAPPING, INT_MAPPING
+            from neural_compressor.torch.algorithms.weight_only.utility import FLOAT_MAPPING, INT_MAPPING
 
             self.use_optimum_format = False  # optimum_format doesn't suit for symmetric nf4 fp4.
             float_list = FLOAT_MAPPING[self.dtype]
neural_compressor/torch/quantization/algorithm_entry.py (6 additions & 6 deletions)

@@ -40,7 +40,7 @@ def rtn_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], RTNConfig], *args, **kwargs
 ) -> torch.nn.Module:
     """The main entry to apply rtn quantization."""
-    from neural_compressor.torch.algorithms.weight_only import rtn_quantize
+    from neural_compressor.torch.algorithms.weight_only.rtn import rtn_quantize
 
     # rebuild weight_config for rtn_quantize function
     weight_config = {}
@@ -75,7 +75,7 @@ def gptq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], GPTQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the GPTQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import gptq_quantize
+    from neural_compressor.torch.algorithms.weight_only.gptq import gptq_quantize
 
     # rebuild weight_config for gptq_quantize function
     weight_config = {}
@@ -228,7 +228,7 @@ def awq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AWQConfig], *args, **kwargs
 ) -> torch.nn.Module:
     logger.info("Quantize model with the AWQ algorithm.")
-    from neural_compressor.torch.algorithms.weight_only import awq_quantize
+    from neural_compressor.torch.algorithms.weight_only.awq import awq_quantize
 
     weight_config = {}
     for (op_name, op_type), op_config in configs_mapping.items():
@@ -288,7 +288,7 @@ def awq_quantize_entry(
 def teq_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], TEQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import teq_quantize
+    from neural_compressor.torch.algorithms.weight_only.teq import teq_quantize
 
     logger.info("Quantize model with the TEQ algorithm.")
     weight_config = {}
@@ -338,7 +338,7 @@ def teq_quantize_entry(
 def autoround_quantize_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import autoround_quantize
+    from neural_compressor.torch.algorithms.weight_only.autoround import autoround_quantize
 
     logger.info("Quantize model with the AutoRound algorithm.")
     calib_func = kwargs.get("run_fn", None)
@@ -407,7 +407,7 @@ def autoround_quantize_entry(
 def hqq_entry(
     model: torch.nn.Module, configs_mapping: Dict[Tuple[str, Callable], HQQConfig], *args, **kwargs
 ) -> torch.nn.Module:
-    from neural_compressor.torch.algorithms.weight_only import hqq_quantize
+    from neural_compressor.torch.algorithms.weight_only.hqq import hqq_quantize
 
     logger.info("Quantize model with the HQQ algorithm.")
     q_model = hqq_quantize(model, configs_mapping)
test/3x/torch/quantization/weight_only/test_gptq.py (1 addition & 1 deletion)

@@ -4,7 +4,7 @@
 import torch
 import transformers
 
-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import GPTQConfig, get_default_gptq_config, get_default_rtn_config, quantize
test/3x/torch/quantization/weight_only/test_rtn.py (1 addition & 1 deletion)

@@ -4,7 +4,7 @@
 import torch
 import transformers
 
-from neural_compressor.torch.algorithms.weight_only import WeightOnlyLinear
+from neural_compressor.torch.algorithms.weight_only.modules import WeightOnlyLinear
 from neural_compressor.torch.quantization import (
     RTNConfig,
     get_default_double_quant_config,
