From a2247765aea5c52bb5c64eeea36724d4830f6f09 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 6 Jun 2024 14:13:16 +0800
Subject: [PATCH 01/15] add pt2e llm example

Signed-off-by: Kaihui-intel
---
 .../quantization/static_quant/pt2e/README.md  |  27 ++++
 .../static_quant/pt2e/requirements.txt        |   7 +
 .../static_quant/pt2e/run_clm_no_trainer.py   | 153 ++++++++++++++++++
 3 files changed, 187 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md
new file mode 100644
index 00000000000..bc8cb057ee5
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md
@@ -0,0 +1,27 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running large language models (LLMs) on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch 2 Export Quantization.
+
+The script `run_clm_no_trainer.py` currently supports `OPT` quantization and validates last-word-prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); support for more models is being added.
+
+# Prerequisite
+## 1. Create Environment
+```
+# Installation
+pip install -r requirements.txt
+```
+
+# Run
+
+Here is how to run the script:
+
+**Causal Language Modeling (CLM)**
+
+`run_clm_no_trainer.py` quantizes a large language model using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other tasks provided by lm_eval. The quantization flow is sketched below, followed by an example command.
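+
+Under the hood, the script follows the PyTorch 2 Export (PT2E) post-training static quantization flow: export the eager model to an FX graph, insert observers with `prepare`, run a few calibration batches, then `convert` and compile the result. The snippet below is a minimal sketch of that flow, assuming the same `neural_compressor.torch` APIs that `run_clm_no_trainer.py` imports; the toy linear model and random input are placeholders for a real model and representative calibration data.
+
+```python
+import torch
+from neural_compressor.torch.export import export
+from neural_compressor.torch.quantization import convert, get_default_static_config, prepare
+
+# toy stand-in; use your own model and representative calibration batches
+model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
+example_inputs = (torch.randn(2, 8),)
+
+exported_model = export(model, example_inputs=example_inputs)           # eager -> FX graph
+prepared_model = prepare(exported_model, get_default_static_config())   # insert observers
+prepared_model(*example_inputs)                                         # calibration pass
+quantized_model = convert(prepared_model)                               # quantize observed graph
+opt_model = torch.compile(quantized_model)                              # optional: fuse the Q/DQ pattern
+```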
+### OPT-125m
+
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py --model facebook/opt-125m --quantize --accuracy
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
new file mode 100644
index 00000000000..b6d9b6c55de
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
@@ -0,0 +1,7 @@
+transformers
+torch
+sentencepiece
+neural-compressor
+intel-extension-for-transformers >= 1.4.1
+lm-eval==0.4.2
+peft
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
new file mode 100644
index 00000000000..2a3ce8ebbcf
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -0,0 +1,153 @@
+import argparse
+import sys
+
+sys.path.append('./')
+import time
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--model", nargs="?", default="facebook/opt-125m"
+)
+parser.add_argument(
+    "--trust_remote_code", default=True,
+    help="Transformers parameter: use the external repo")
+parser.add_argument(
+    "--revision", default=None,
+    help="Transformers parameter: set the model hub commit number")
+parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
+parser.add_argument("--output_dir", nargs="?", default="./saved_results")
+parser.add_argument("--quantize", action="store_true")
+parser.add_argument("--approach", type=str, default='static',
+                    help="Select from ['dynamic', 'static', 'weight-only']")
+parser.add_argument("--int8", action="store_true")
+parser.add_argument("--accuracy", action="store_true")
+parser.add_argument("--performance", action="store_true")
+parser.add_argument("--iters", default=100, type=int,
+                    help="For performance measurement only.")
+parser.add_argument("--batch_size", default=1, type=int,
+                    help="For accuracy and performance measurement.")
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
+                    type=str, help="tasks for accuracy validation")
+parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
+# =======================================
+
+args = parser.parse_args()
+
+
+def get_user_model():
+    torchscript = False
+    user_model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torchscript=torchscript,  # torchscript will force `return_dict=False` to avoid jit errors
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+    if args.peft_model_id is not None:
+        from peft import PeftModel
+        user_model = PeftModel.from_pretrained(user_model, args.peft_model_id)
+
+    # to channels last
+    user_model = user_model.to(memory_format=torch.channels_last)
+    user_model.eval()
+    return user_model, tokenizer
+
+user_model, tokenizer = get_user_model()
+if args.quantize:
+    from neural_compressor.torch.quantization import (
+        convert,
+        get_default_static_config,
+        prepare,
+    )
+    from neural_compressor.torch.export import export
+    from torch.export import Dim
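+
+    # PT2E static quantization flow used below: torch.export captures the eager
+    # model as an FX graph (with symbolic batch/sequence dims declared via Dim),
+    # prepare() inserts observers, a couple of forward passes collect calibration
+    # statistics, and convert() rewrites the observed graph into a quantized one
+    # that torch.compile can then fuse.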
+
+    def get_example_inputs(tokenizer):
+        text = "Hello, welcome to LLM world."
+        encoded_input = tokenizer(text, return_tensors="pt")
+
+        example_inputs = encoded_input
+        # print(f"example_inputs: {example_inputs}")
+        input_ids = example_inputs["input_ids"]
+        input_ids_batch = torch.cat((input_ids, input_ids), dim=0)
+        print(f"input_ids_batch shape: {input_ids_batch.shape}")
+        tuple_inputs = (input_ids_batch,)
+        return tuple_inputs
+
+    # os.environ["TOKENIZERS_PARALLELISM"] = "false"  # set TOKENIZERS_PARALLELISM to false if needed
+    # torch._dynamo.config.cache_size_limit = 4  # set a limit if out of memory
+    batch = Dim(name="batch_size")
+    seq_len = Dim(name="seq_len")
+    dynamic_shapes = {"input_ids": (batch, seq_len)}
+    example_inputs = get_example_inputs(tokenizer)
+    exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)
+
+    quant_config = get_default_static_config()
+    # prepare
+    prepare_model = prepare(exported_model, quant_config)
+
+    # calibrate
+    for i in range(2):
+        prepare_model(*example_inputs)
+    # convert
+    converted_model = convert(prepare_model)
+    # inference
+    from torch._inductor import config
+
+    config.freezing = True
+    opt_model = torch.compile(converted_model)
+
+    opt_model.config = user_model.config  # for lm eval
+    user_model = opt_model
+
+
+if args.accuracy:
+    # user_model.eval()
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
+        user_model=user_model,
+        tokenizer=tokenizer,
+        batch_size=args.batch_size,
+        tasks=args.tasks,
+        device="cpu",
+    )
+    results = evaluate(eval_args)
+    for task_name in args.tasks.split(","):
+        if task_name == "wikitext":
+            acc = results["results"][task_name]["word_perplexity,none"]
+        else:
+            acc = results["results"][task_name]["acc,none"]
+        print("Accuracy: %.5f" % acc)
+    print('Batch size = %d' % args.batch_size)

+if args.performance:
+    # user_model.eval()
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+
+    samples = args.iters * args.batch_size
+    eval_args = LMEvalParser(
+        model="hf",
+        user_model=user_model,
+        tokenizer=tokenizer,
+        batch_size=args.batch_size,
+        tasks=args.tasks,
+        limit=samples,
+        device="cpu",
+    )
+    start = time.time()
+    results = evaluate(eval_args)
+    end = time.time()
+    for task_name in args.tasks.split(","):
+        if task_name == "wikitext":
+            acc = results["results"][task_name]["word_perplexity,none"]
+        else:
+            acc = results["results"][task_name]["acc,none"]
+        print("Accuracy: %.5f" % acc)
+    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
+    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
+    print('Batch size = %d' % args.batch_size)

From 545228906d3bcb40a9dcfa745121644e2f4d51da Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 6 Jun 2024 15:45:00 +0800
Subject: [PATCH 02/15] add cv example

Signed-off-by: Kaihui-intel
---
 .../3.x_api/pytorch/cv/static_quant/README.md |  27 +
 .../pytorch/cv/static_quant/extract_ILSVRC.sh |  80 +++
 .../3.x_api/pytorch/cv/static_quant/main.py   | 514 ++++++++++++++++++
 .../pytorch/cv/static_quant/requirements.txt  |   3 +
 4 files changed, 624 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/README.md
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/main.py
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/requirements.txt

diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md
new file mode 100644
index 00000000000..2ba08952e1b
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/static_quant/README.md
@@ -0,0 +1,27 @@
+# ImageNet training in PyTorch
+
+This implements quantization of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.
+
+## Requirements
+
+- Install the requirements
+  - `pip install -r requirements.txt`
+- Download the ImageNet dataset from http://www.image-net.org/
+  - Then, move and extract the training and validation images into labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
+
+## Quantization
+
+To quantize a model and validate its accuracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset:
+
+```bash
+python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e
+```
+
+
+## Use Dummy Data
+
+The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". Note that the reported loss and accuracy are meaningless in this case.
+
+```bash
+python main.py -a resnet18 --dummy -q -e
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh
new file mode 100644
index 00000000000..3ec05e8f328
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# script to extract the ImageNet dataset:
+# ILSVRC2012_img_train.tar (about 138 GB)
+# ILSVRC2012_img_val.tar (about 6.3 GB)
+# make sure ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are in your current directory
+#
+# Adapted from:
+# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
+# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+# imagenet/val/
+# ├── n01440764
+# │   ├── ILSVRC2012_val_00000293.JPEG
+# │   ├── ILSVRC2012_val_00002138.JPEG
+# │   ├── ......
+# ├── ......
+#
+#
+# Make imagenet directory
+#
+mkdir imagenet
+#
+# Extract the training data:
+#
+# Create train directory; move .tar file; change directory
+mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train
+# Extract training set; remove compressed file
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+#
+# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
+#
+# For each .tar file:
+#   1. create directory with same name as .tar file
+#   2. extract and copy contents of .tar file into directory
+#   3. remove .tar file
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+#
+# This results in a training directory like so:
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+#
+# Change back to original directory
+cd ../..
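+#
+# (Optional) sanity-check the training extraction at this point; the expected
+# count assumes the standard ILSVRC2012 release with 1000 classes:
+#
+# $ find imagenet/train -mindepth 1 -maxdepth 1 -type d | wc -l
+# 1000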
+# +# Extract the validation data and move images to subfolders: +# +# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file +mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar +# get script from soumith and run; this script creates all class directories and moves images into corresponding directories +wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash +# +# This results in a validation directory like so: +# +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Check total files after extract +# +# $ find train/ -name "*.JPEG" | wc -l +# 1281167 +# $ find val/ -name "*.JPEG" | wc -l +# 50000 +# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py new file mode 100644 index 00000000000..6617dadba5e --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -0,0 +1,514 @@ +import argparse +import os +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Subset + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', 
action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") +parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', + help='quantize model') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + cudnn.benchmark = False + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if torch.cuda.is_available(): + ngpus_per_node = torch.cuda.device_count() + if ngpus_per_node == 1 and args.dist_backend == "nccl": + warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") + else: + ngpus_per_node = 1 + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + model = model.to(device) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + if torch.cuda.is_available(): + if args.gpu: + device = torch.device('cuda:{}'.format(args.gpu)) + else: + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + if args.quantize: + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import prepare, convert, get_default_static_config + + # Prepare the float model and example inputs for export model + model = model + x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Export eager model into FX graph model + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = get_default_static_config() + + prepared_model = prepare(exported_model, quant_config=quant_config) + # Calibrate + prepared_model(*example_inputs) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with Q-operator + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) + model = opt_model + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + +def train(train_loader, model, criterion, optimizer, epoch, device, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in 
enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + # move data to the same device as model + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + if args.gpu is not None and torch.cuda.is_available(): + images = images.cuda(args.gpu, non_blocking=True) + if torch.backends.mps.is_available(): + images = images.to('mps') + target = target.to('mps') + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode, pt2e no eval() or train() + # model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = 
torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/requirements.txt b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt new file mode 100644 index 00000000000..ebd3df6ae7a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +neural-compressor \ No newline at end of file From ded7257751dc8314f65f11ccb51ddc710b7797ef Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 6 Jun 2024 16:03:10 +0800 Subject: [PATCH 03/15] add dynamic example Signed-off-by: Kaihui-intel --- .../pytorch/cv/dynamic_quant/README.md | 27 + .../cv/dynamic_quant/extract_ILSVRC.sh | 80 +++ .../3.x_api/pytorch/cv/dynamic_quant/main.py | 512 ++++++++++++++++++ .../pytorch/cv/dynamic_quant/requirements.txt | 3 + 4 files changed, 622 insertions(+) create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/README.md create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/main.py create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/README.md b/examples/3.x_api/pytorch/cv/dynamic_quant/README.md new file mode 100644 index 00000000000..2ba08952e1b --- /dev/null +++ b/examples/3.x_api/pytorch/cv/dynamic_quant/README.md @@ -0,0 +1,27 @@ +# ImageNet training in PyTorch + +This implements 
quantization of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.
+
+## Requirements
+
+- Install the requirements
+  - `pip install -r requirements.txt`
+- Download the ImageNet dataset from http://www.image-net.org/
+  - Then, move and extract the training and validation images into labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
+
+## Quantization
+
+To quantize a model and validate its accuracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset:
+
+```bash
+python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e
+```
+
+
+## Use Dummy Data
+
+The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". Note that the reported loss and accuracy are meaningless in this case.
+
+```bash
+python main.py -a resnet18 --dummy -q -e
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
new file mode 100644
index 00000000000..3ec05e8f328
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# script to extract the ImageNet dataset:
+# ILSVRC2012_img_train.tar (about 138 GB)
+# ILSVRC2012_img_val.tar (about 6.3 GB)
+# make sure ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are in your current directory
+#
+# Adapted from:
+# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
+# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+# imagenet/val/
+# ├── n01440764
+# │   ├── ILSVRC2012_val_00000293.JPEG
+# │   ├── ILSVRC2012_val_00002138.JPEG
+# │   ├── ......
+# ├── ......
+#
+#
+# Make imagenet directory
+#
+mkdir imagenet
+#
+# Extract the training data:
+#
+# Create train directory; move .tar file; change directory
+mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train
+# Extract training set; remove compressed file
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+#
+# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
+#
+# For each .tar file:
+#   1. create directory with same name as .tar file
+#   2. extract and copy contents of .tar file into directory
+#   3. remove .tar file
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+#
+# This results in a training directory like so:
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+#
+# Change back to original directory
+cd ../..
+# +# Extract the validation data and move images to subfolders: +# +# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file +mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar +# get script from soumith and run; this script creates all class directories and moves images into corresponding directories +wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash +# +# This results in a validation directory like so: +# +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Check total files after extract +# +# $ find train/ -name "*.JPEG" | wc -l +# 1281167 +# $ find val/ -name "*.JPEG" | wc -l +# 50000 +# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/main.py b/examples/3.x_api/pytorch/cv/dynamic_quant/main.py new file mode 100644 index 00000000000..6b420ec342a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/dynamic_quant/main.py @@ -0,0 +1,512 @@ +import argparse +import os +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Subset + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', 
action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") +parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', + help='quantize model') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + cudnn.benchmark = False + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if torch.cuda.is_available(): + ngpus_per_node = torch.cuda.device_count() + if ngpus_per_node == 1 and args.dist_backend == "nccl": + warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") + else: + ngpus_per_node = 1 + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + model = model.to(device) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + if torch.cuda.is_available(): + if args.gpu: + device = torch.device('cuda:{}'.format(args.gpu)) + else: + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + if args.quantize: + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import prepare, convert, get_default_dynamic_config + + # Prepare the float model and example inputs for export model + model = model + x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Export eager model into FX graph model + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = get_default_dynamic_config() + + prepared_model = prepare(exported_model, quant_config=quant_config) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with Q-operator + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) + model = opt_model + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + +def train(train_loader, model, criterion, optimizer, epoch, device, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading 
time + data_time.update(time.time() - end) + + # move data to the same device as model + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + if args.gpu is not None and torch.cuda.is_available(): + images = images.cuda(args.gpu, non_blocking=True) + if torch.backends.mps.is_available(): + images = images.to('mps') + target = target.to('mps') + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode, pt2e no eval() or train() + # model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif 
torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt b/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt new file mode 100644 index 00000000000..ebd3df6ae7a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +neural-compressor \ No newline at end of file From 5ef555b37754030a1e5606b99969dd6fa19ea7e1 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 6 Jun 2024 16:15:48 +0800 Subject: [PATCH 04/15] remove dynamic Signed-off-by: Kaihui-intel --- .../pytorch/cv/dynamic_quant/README.md | 27 - .../cv/dynamic_quant/extract_ILSVRC.sh | 80 --- .../3.x_api/pytorch/cv/dynamic_quant/main.py | 512 ------------------ .../pytorch/cv/dynamic_quant/requirements.txt | 3 - 4 files changed, 622 deletions(-) delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/README.md delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/main.py delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/README.md b/examples/3.x_api/pytorch/cv/dynamic_quant/README.md deleted file mode 100644 index 2ba08952e1b..00000000000 --- a/examples/3.x_api/pytorch/cv/dynamic_quant/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# ImageNet training in PyTorch - -This implements quantization of popular model 
architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.
-
-## Requirements
-
-- Install the requirements
-  - `pip install -r requirements.txt`
-- Download the ImageNet dataset from http://www.image-net.org/
-  - Then, move and extract the training and validation images into labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
-
-## Quantization
-
-To quantize a model and validate its accuracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset:
-
-```bash
-python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e
-```
-
-
-## Use Dummy Data
-
-The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". Note that the reported loss and accuracy are meaningless in this case.
-
-```bash
-python main.py -a resnet18 --dummy -q -e
-```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
deleted file mode 100644
index 3ec05e8f328..00000000000
--- a/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-#
-# script to extract the ImageNet dataset:
-# ILSVRC2012_img_train.tar (about 138 GB)
-# ILSVRC2012_img_val.tar (about 6.3 GB)
-# make sure ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are in your current directory
-#
-# Adapted from:
-# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
-# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
-#
-# imagenet/train/
-# ├── n01440764
-# │   ├── n01440764_10026.JPEG
-# │   ├── n01440764_10027.JPEG
-# │   ├── ......
-# ├── ......
-# imagenet/val/
-# ├── n01440764
-# │   ├── ILSVRC2012_val_00000293.JPEG
-# │   ├── ILSVRC2012_val_00002138.JPEG
-# │   ├── ......
-# ├── ......
-#
-#
-# Make imagenet directory
-#
-mkdir imagenet
-#
-# Extract the training data:
-#
-# Create train directory; move .tar file; change directory
-mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train
-# Extract training set; remove compressed file
-tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
-#
-# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
-#
-# For each .tar file:
-#   1. create directory with same name as .tar file
-#   2. extract and copy contents of .tar file into directory
-#   3. remove .tar file
-find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
-#
-# This results in a training directory like so:
-#
-# imagenet/train/
-# ├── n01440764
-# │   ├── n01440764_10026.JPEG
-# │   ├── n01440764_10027.JPEG
-# │   ├── ......
-# ├── ......
-#
-# Change back to original directory
-cd ../..
-# -# Extract the validation data and move images to subfolders: -# -# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file -mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar -# get script from soumith and run; this script creates all class directories and moves images into corresponding directories -wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash -# -# This results in a validation directory like so: -# -# imagenet/val/ -# ├── n01440764 -# │ ├── ILSVRC2012_val_00000293.JPEG -# │ ├── ILSVRC2012_val_00002138.JPEG -# │ ├── ...... -# ├── ...... -# -# -# Check total files after extract -# -# $ find train/ -name "*.JPEG" | wc -l -# 1281167 -# $ find val/ -name "*.JPEG" | wc -l -# 50000 -# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/main.py b/examples/3.x_api/pytorch/cv/dynamic_quant/main.py deleted file mode 100644 index 6b420ec342a..00000000000 --- a/examples/3.x_api/pytorch/cv/dynamic_quant/main.py +++ /dev/null @@ -1,512 +0,0 @@ -import argparse -import os -import random -import shutil -import time -import warnings -from enum import Enum - -import torch -import torch.backends.cudnn as cudnn -import torch.distributed as dist -import torch.multiprocessing as mp -import torch.nn as nn -import torch.nn.parallel -import torch.optim -import torch.utils.data -import torch.utils.data.distributed -import torchvision.datasets as datasets -import torchvision.models as models -import torchvision.transforms as transforms -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import Subset - -model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) - -parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') -parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', - help='path to dataset (default: imagenet)') -parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet18)') -parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', - help='number of data loading workers (default: 4)') -parser.add_argument('--epochs', default=90, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--start-epoch', default=0, type=int, metavar='N', - help='manual epoch number (useful on restarts)') -parser.add_argument('-b', '--batch-size', default=256, type=int, - metavar='N', - help='mini-batch size (default: 256), this is the total ' - 'batch size of all GPUs on the current node when ' - 'using Data Parallel or Distributed Data Parallel') -parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, - metavar='LR', help='initial learning rate', dest='lr') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum') -parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, - metavar='W', help='weight decay (default: 1e-4)', - dest='weight_decay') -parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') -parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') -parser.add_argument('-e', '--evaluate', 
dest='evaluate', action='store_true', - help='evaluate model on validation set') -parser.add_argument('--pretrained', dest='pretrained', action='store_true', - help='use pre-trained model') -parser.add_argument('--world-size', default=-1, type=int, - help='number of nodes for distributed training') -parser.add_argument('--rank', default=-1, type=int, - help='node rank for distributed training') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, - help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', type=str, - help='distributed backend') -parser.add_argument('--seed', default=None, type=int, - help='seed for initializing training. ') -parser.add_argument('--gpu', default=None, type=int, - help='GPU id to use.') -parser.add_argument('--multiprocessing-distributed', action='store_true', - help='Use multi-processing distributed training to launch ' - 'N processes per node, which has N GPUs. This is the ' - 'fastest way to use PyTorch for either single node or ' - 'multi node data parallel training') -parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") -parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', - help='quantize model') - -best_acc1 = 0 - - -def main(): - args = parser.parse_args() - - if args.seed is not None: - random.seed(args.seed) - torch.manual_seed(args.seed) - cudnn.deterministic = True - cudnn.benchmark = False - warnings.warn('You have chosen to seed training. ' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') - - if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. 
This will completely ' - 'disable data parallelism.') - - if args.dist_url == "env://" and args.world_size == -1: - args.world_size = int(os.environ["WORLD_SIZE"]) - - args.distributed = args.world_size > 1 or args.multiprocessing_distributed - - if torch.cuda.is_available(): - ngpus_per_node = torch.cuda.device_count() - if ngpus_per_node == 1 and args.dist_backend == "nccl": - warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") - else: - ngpus_per_node = 1 - - if args.multiprocessing_distributed: - # Since we have ngpus_per_node processes per node, the total world_size - # needs to be adjusted accordingly - args.world_size = ngpus_per_node * args.world_size - # Use torch.multiprocessing.spawn to launch distributed processes: the - # main_worker process function - mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) - else: - # Simply call main_worker function - main_worker(args.gpu, ngpus_per_node, args) - - -def main_worker(gpu, ngpus_per_node, args): - global best_acc1 - args.gpu = gpu - - if args.gpu is not None: - print("Use GPU: {} for training".format(args.gpu)) - - if args.distributed: - if args.dist_url == "env://" and args.rank == -1: - args.rank = int(os.environ["RANK"]) - if args.multiprocessing_distributed: - # For multiprocessing distributed training, rank needs to be the - # global rank among all the processes - args.rank = args.rank * ngpus_per_node + gpu - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - # create model - if args.pretrained: - print("=> using pre-trained model '{}'".format(args.arch)) - model = models.__dict__[args.arch](pretrained=True) - else: - print("=> creating model '{}'".format(args.arch)) - model = models.__dict__[args.arch]() - - if not torch.cuda.is_available() and not torch.backends.mps.is_available(): - print('using CPU, this will be slow') - elif args.distributed: - # For multiprocessing distributed, DistributedDataParallel constructor - # should always set the single device scope, otherwise, - # DistributedDataParallel will use all available devices. - if torch.cuda.is_available(): - if args.gpu is not None: - torch.cuda.set_device(args.gpu) - model.cuda(args.gpu) - # When using a single GPU per process and per - # DistributedDataParallel, we need to divide the batch size - # ourselves based on the total number of GPUs of the current node. 
- args.batch_size = int(args.batch_size / ngpus_per_node) - args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) - else: - model.cuda() - # DistributedDataParallel will divide and allocate batch_size to all - # available GPUs if device_ids are not set - model = torch.nn.parallel.DistributedDataParallel(model) - elif args.gpu is not None and torch.cuda.is_available(): - torch.cuda.set_device(args.gpu) - model = model.cuda(args.gpu) - elif torch.backends.mps.is_available(): - device = torch.device("mps") - model = model.to(device) - else: - # DataParallel will divide and allocate batch_size to all available GPUs - if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): - model.features = torch.nn.DataParallel(model.features) - model.cuda() - else: - model = torch.nn.DataParallel(model).cuda() - - if torch.cuda.is_available(): - if args.gpu: - device = torch.device('cuda:{}'.format(args.gpu)) - else: - device = torch.device("cuda") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - else: - device = torch.device("cpu") - # define loss function (criterion), optimizer, and learning rate scheduler - criterion = nn.CrossEntropyLoss().to(device) - - optimizer = torch.optim.SGD(model.parameters(), args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) - - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - scheduler = StepLR(optimizer, step_size=30, gamma=0.1) - - # optionally resume from a checkpoint - if args.resume: - if os.path.isfile(args.resume): - print("=> loading checkpoint '{}'".format(args.resume)) - if args.gpu is None: - checkpoint = torch.load(args.resume) - elif torch.cuda.is_available(): - # Map model to be loaded to specified single gpu. 
- loc = 'cuda:{}'.format(args.gpu) - checkpoint = torch.load(args.resume, map_location=loc) - args.start_epoch = checkpoint['epoch'] - best_acc1 = checkpoint['best_acc1'] - if args.gpu is not None: - # best_acc1 may be from a checkpoint from a different GPU - best_acc1 = best_acc1.to(args.gpu) - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - scheduler.load_state_dict(checkpoint['scheduler']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch'])) - else: - print("=> no checkpoint found at '{}'".format(args.resume)) - - - # Data loading code - if args.dummy: - print("=> Dummy data is used!") - train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) - val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) - else: - traindir = os.path.join(args.data, 'train') - valdir = os.path.join(args.data, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - val_dataset = datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - if args.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) - val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) - else: - train_sampler = None - val_sampler = None - - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) - - val_loader = torch.utils.data.DataLoader( - val_dataset, batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True, sampler=val_sampler) - - if args.quantize: - from neural_compressor.torch.export import export - from neural_compressor.torch.quantization import prepare, convert, get_default_dynamic_config - - # Prepare the float model and example inputs for export model - model = model - x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) - example_inputs = (x,) - - # Export eager model into FX graph model - exported_model = export(model=model, example_inputs=example_inputs) - # Quantize the model - quant_config = get_default_dynamic_config() - - prepared_model = prepare(exported_model, quant_config=quant_config) - q_model = convert(prepared_model) - # Compile the quantized model and replace the Q/DQ pattern with Q-operator - from torch._inductor import config - - config.freezing = True - opt_model = torch.compile(q_model) - model = opt_model - - - if args.evaluate: - validate(val_loader, model, criterion, args) - return - -def train(train_loader, model, criterion, optimizer, epoch, device, args): - batch_time = AverageMeter('Time', ':6.3f') - data_time = AverageMeter('Data', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - progress = ProgressMeter( - len(train_loader), - [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch)) - - # switch to train mode - model.train() - - end = time.time() - for i, (images, target) in enumerate(train_loader): - # measure data loading 
time - data_time.update(time.time() - end) - - # move data to the same device as model - images = images.to(device, non_blocking=True) - target = target.to(device, non_blocking=True) - - # compute output - output = model(images) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # compute gradient and do SGD step - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i + 1) - - -def validate(val_loader, model, criterion, args): - - def run_validate(loader, base_progress=0): - with torch.no_grad(): - end = time.time() - for i, (images, target) in enumerate(loader): - i = base_progress + i - if args.gpu is not None and torch.cuda.is_available(): - images = images.cuda(args.gpu, non_blocking=True) - if torch.backends.mps.is_available(): - images = images.to('mps') - target = target.to('mps') - if torch.cuda.is_available(): - target = target.cuda(args.gpu, non_blocking=True) - - # compute output - output = model(images) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i + 1) - - batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) - losses = AverageMeter('Loss', ':.4e', Summary.NONE) - top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) - top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) - progress = ProgressMeter( - len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), - [batch_time, losses, top1, top5], - prefix='Test: ') - - # switch to evaluate mode, pt2e no eval() or train() - # model.eval() - - run_validate(val_loader) - if args.distributed: - top1.all_reduce() - top5.all_reduce() - - if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): - aux_val_dataset = Subset(val_loader.dataset, - range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) - aux_val_loader = torch.utils.data.DataLoader( - aux_val_dataset, batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True) - run_validate(aux_val_loader, len(val_loader)) - - progress.display_summary() - - return top1.avg - - -def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): - torch.save(state, filename) - if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') - -class Summary(Enum): - NONE = 0 - AVERAGE = 1 - SUM = 2 - COUNT = 3 - -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): - self.name = name - self.fmt = fmt - self.summary_type = summary_type - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def all_reduce(self): - if torch.cuda.is_available(): - device = torch.device("cuda") - elif 
torch.backends.mps.is_available(): - device = torch.device("mps") - else: - device = torch.device("cpu") - total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) - dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) - self.sum, self.count = total.tolist() - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - def summary(self): - fmtstr = '' - if self.summary_type is Summary.NONE: - fmtstr = '' - elif self.summary_type is Summary.AVERAGE: - fmtstr = '{name} {avg:.3f}' - elif self.summary_type is Summary.SUM: - fmtstr = '{name} {sum:.3f}' - elif self.summary_type is Summary.COUNT: - fmtstr = '{name} {count:.3f}' - else: - raise ValueError('invalid summary type %r' % self.summary_type) - - return fmtstr.format(**self.__dict__) - - -class ProgressMeter(object): - def __init__(self, num_batches, meters, prefix=""): - self.batch_fmtstr = self._get_batch_fmtstr(num_batches) - self.meters = meters - self.prefix = prefix - - def display(self, batch): - entries = [self.prefix + self.batch_fmtstr.format(batch)] - entries += [str(meter) for meter in self.meters] - print('\t'.join(entries)) - - def display_summary(self): - entries = [" *"] - entries += [meter.summary() for meter in self.meters] - print(' '.join(entries)) - - def _get_batch_fmtstr(self, num_batches): - num_digits = len(str(num_batches // 1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt b/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt deleted file mode 100644 index ebd3df6ae7a..00000000000 --- a/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -torchvision -neural-compressor \ No newline at end of file From d131d74b91fe98f7deef2e63f1caafc960dce598 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 6 Jun 2024 16:21:52 +0800 Subject: [PATCH 05/15] fix title Signed-off-by: Kaihui-intel --- examples/3.x_api/pytorch/cv/static_quant/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md index 2ba08952e1b..922fd12ae8d 100644 --- a/examples/3.x_api/pytorch/cv/static_quant/README.md +++ b/examples/3.x_api/pytorch/cv/static_quant/README.md @@ -1,4 +1,4 @@ -# ImageNet training in PyTorch +# ImageNet Quantization This implements quantization of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset. 
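Before the next patch trims it further, it is worth seeing the whole PT2E static flow that these CV patches leave inside `main_worker`. Pulled out of the training script, it reduces to the sketch below, which reuses only the calls visible in the diffs (`export`, `prepare`, `convert`, and inductor freezing); the torchvision ResNet-18 and the random calibration tensor are stand-in assumptions for the real ImageNet pipeline.

```python
# Condensed sketch of the PT2E static quantization flow from main.py.
# Assumptions: torchvision is installed, and random tensors stand in for
# real ImageNet calibration batches.
import torch
import torchvision.models as models

from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import (
    convert,
    get_default_static_config,
    prepare,
)

# Mirrors the script's --pretrained path (downloads weights on first use).
model = models.resnet18(pretrained=True).eval()
x = torch.randn(1, 3, 224, 224).contiguous(memory_format=torch.channels_last)
example_inputs = (x,)

# Export the eager model into an FX graph, then insert observers.
exported_model = export(model=model, example_inputs=example_inputs)
prepared_model = prepare(exported_model, quant_config=get_default_static_config())

# Calibrate with a few forward passes (later exposed as --calib_iters).
for _ in range(2):
    prepared_model(*example_inputs)

q_model = convert(prepared_model)

# Compile the quantized model; freezing lets the inductor backend replace
# the Q/DQ pattern with quantized operators, per the comment in the diffs.
from torch._inductor import config
config.freezing = True
opt_model = torch.compile(q_model)
opt_model(*example_inputs)
```

Per the comment carried through the diffs, setting `config.freezing = True` before `torch.compile` is what allows the Q/DQ pairs to be folded into fused quantized kernels.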
From e8d6888dbd82709248ab0c759a2c49109b968898 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 14 Jun 2024 11:03:32 +0800
Subject: [PATCH 06/15] update cv example

Signed-off-by: Kaihui-intel
---
 .../3.x_api/pytorch/cv/static_quant/README.md |  2 +-
 .../3.x_api/pytorch/cv/static_quant/main.py   | 48 +------------------
 2 files changed, 2 insertions(+), 48 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md
index 922fd12ae8d..172f8b0e12f 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/README.md
+++ b/examples/3.x_api/pytorch/cv/static_quant/README.md
@@ -1,6 +1,6 @@
 # ImageNet Quantization
 
-This implements quantization of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset.
+This implements quantization of popular model architectures, such as ResNet, on the ImageNet dataset.
 
 ## Requirements
 
diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 6617dadba5e..a9174e72837 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -275,8 +275,7 @@ def main_worker(gpu, ngpus_per_node, args):
         from neural_compressor.torch.export import export
         from neural_compressor.torch.quantization import prepare, convert, get_default_static_config
 
-        # Prepare the float model and example inputs for export model
-        model = model
+        # Prepare the float model and example inputs for exporting the model
         x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last)
         example_inputs = (x,)
 
@@ -301,51 +300,6 @@ def main_worker(gpu, ngpus_per_node, args):
         validate(val_loader, model, criterion, args)
         return
 
-def train(train_loader, model, criterion, optimizer, epoch, device, args):
-    batch_time = AverageMeter('Time', ':6.3f')
-    data_time = AverageMeter('Data', ':6.3f')
-    losses = AverageMeter('Loss', ':.4e')
-    top1 = AverageMeter('Acc@1', ':6.2f')
-    top5 = AverageMeter('Acc@5', ':6.2f')
-    progress = ProgressMeter(
-        len(train_loader),
-        [batch_time, data_time, losses, top1, top5],
-        prefix="Epoch: [{}]".format(epoch))
-
-    # switch to train mode
-    model.train()
-
-    end = time.time()
-    for i, (images, target) in enumerate(train_loader):
-        # measure data loading time
-        data_time.update(time.time() - end)
-
-        # move data to the same device as model
-        images = images.to(device, non_blocking=True)
-        target = target.to(device, non_blocking=True)
-
-        # compute output
-        output = model(images)
-        loss = criterion(output, target)
-
-        # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
-
-        # compute gradient and do SGD step
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        # measure elapsed time
-        batch_time.update(time.time() - end)
-        end = time.time()
-
-        if i % args.print_freq == 0:
-            progress.display(i + 1)
-
 
 def validate(val_loader, model, criterion, args):
 
From 96c946fd79f3a9102c47d81d58d8ea5c87d08612 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 14 Jun 2024 11:05:49 +0800
Subject: [PATCH 07/15] update llm example

Signed-off-by: Kaihui-intel
---
 .../quantization/static_quant/pt2e/run_clm_no_trainer.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 2a3ce8ebbcf..406cd11d7fe 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -65,19 +65,16 @@ def get_user_model(): ) from neural_compressor.torch.export import export from torch.export import Dim - # set TOKENIZERS_PARALLELISM to false def get_example_inputs(tokenizer): text = "Hello, welcome to LLM world." encoded_input = tokenizer(text, return_tensors="pt") example_inputs = encoded_input - # print(f"example_inputs: {example_inputs}") input_ids = example_inputs["input_ids"] input_ids_batch = torch.cat((input_ids, input_ids), dim=0) print(f"input_ids_batch shape: {input_ids_batch.shape}") tuple_inputs = (input_ids_batch,) return tuple_inputs - # os.environ["TOKENIZERS_PARALLELISM"] = "false" # torch._dynamo.config.cache_size_limit = 4 # set limitation if out of memory batch = Dim(name="batch_size") seq_len = Dim(name="seq_len") @@ -105,7 +102,6 @@ def get_example_inputs(tokenizer): if args.accuracy: - # user_model.eval() from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", From 856d03d0bfb0ccd0026289d8d08116f70959d277 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 13:43:08 +0800 Subject: [PATCH 08/15] add run_quant.sh&model config Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 18 ++++++++ .../pytorch/cv/static_quant/run_quant.sh | 45 ++++++++++++++++++ .../static_quant/pt2e/run_quant.sh | 46 +++++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 examples/.config/model_params_pytorch_3x.json create mode 100644 examples/3.x_api/pytorch/cv/static_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json new file mode 100644 index 00000000000..e8298def78e --- /dev/null +++ b/examples/.config/model_params_pytorch_3x.json @@ -0,0 +1,18 @@ +{ + "pytorch": { + "resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + } + } +} \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh new file mode 100644 index 00000000000..ac4a5a2b668 --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac 
+    done
+
+}
+
+# run_tuning
+function run_tuning {
+    if [ "${topology}" = "resnet18_pt2e_static" ]; then
+        model_name_or_path="resnet18"
+    fi
+    python main.py -a ${model_name_or_path} ${dataset_location} -q -e
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
new file mode 100644
index 00000000000..bc6c15d25db
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+
+    if [ "${topology}" = "opt_125m_pt2e_static" ]; then
+        model_name_or_path="facebook/opt-125m"
+    fi
+    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy
+}
+
+main "$@"

From c1b2fc6df05f270593074bae1d7af8abe0337898 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 14 Jun 2024 13:59:19 +0800
Subject: [PATCH 09/15] update doc

Signed-off-by: Kaihui-intel
---
 docs/3x/PT_StaticQuant.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md
index ec967a780d4..eb5bf55d2cc 100644
--- a/docs/3x/PT_StaticQuant.md
+++ b/docs/3x/PT_StaticQuant.md
@@ -1,6 +1,5 @@
 PyTorch Static Quantization
 ========================================
-
 1. [Introduction](#introduction)
 2. [Get Started](#get-started) \
    2.1 [Static Quantization with IPEX Backend](#static-quantization-with-ipex-backend) \
@@ -9,6 +8,7 @@ PyTorch Static Quantization
    2.1.3 [Model Examples](#model-examples) \
    2.2 [Static Quantization with PT2E Backend](#static-quantization-with-pt2e-backend) \
    2.2.1 [Usage Sample with PT2E](#usage-sample-with-pt2e)
+   2.2.2 [Model Examples with PT2E](#model-examples-with-pt2e)
 
 ## Introduction
 
@@ -102,3 +102,7 @@ opt_model = torch.compile(q_model)
 ```
 
 > Note: The `set_local` of `StaticQuantConfig` will be supported after the torch 2.4 release.
+
+#### Model Examples with PT2E
+
+Users can refer to the [CV examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/cv/static_quant) and [LLM examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e) for details on how to quantize a new model.
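The LLM example referenced here follows the same prepare/calibrate/convert steps, but first exports the model with symbolic batch and sequence dimensions, as the `Dim(name=...)` lines in the earlier `run_clm_no_trainer.py` diff show. The sketch below is a hedged reconstruction of that step: the `dynamic_shapes` layout is an assumption inferred from `torch.export` conventions, not a signature confirmed by these patches.

```python
# Sketch of exporting an OPT model with dynamic shapes before PT2E static
# quantization. The Dim names match the diff; passing them through
# `dynamic_shapes` is an assumption following torch.export conventions.
import torch
from torch.export import Dim
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import (
    convert,
    get_default_static_config,
    prepare,
)

model_name = "facebook/opt-125m"
user_model = AutoModelForCausalLM.from_pretrained(model_name)
user_model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a batch-2 example input, mirroring get_example_inputs in the script.
encoded = tokenizer("Hello, welcome to LLM world.", return_tensors="pt")
input_ids_batch = torch.cat((encoded["input_ids"], encoded["input_ids"]), dim=0)
example_inputs = (input_ids_batch,)

# Mark batch and sequence length as symbolic so one graph serves any prompt.
batch = Dim(name="batch_size")
seq_len = Dim(name="seq_len")
dynamic_shapes = {"input_ids": (batch, seq_len)}  # assumed argument layout

exported_model = export(
    model=user_model,
    example_inputs=example_inputs,
    dynamic_shapes=dynamic_shapes,
)

quant_config = get_default_static_config()
prepared_model = prepare(exported_model, quant_config)

# Calibrate, then convert; --calib_iters later parameterizes this loop.
for _ in range(2):
    prepared_model(*example_inputs)
converted_model = convert(prepared_model)
```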
From 3d29c9565304f97758bec04eb0bea255562a9685 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 14:02:23 +0800 Subject: [PATCH 10/15] add imagenet location Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index e8298def78e..1652fd3f3b3 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -9,7 +9,7 @@ }, "opt_125m_pt2e_static":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", - "dataset_location": "", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", "input_model": "", "main_script": "run_clm_no_trainer.py", "batch_size": 1 From bf109e1687100e2a253254f8d85bf774f428e9c4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 14:59:25 +0800 Subject: [PATCH 11/15] add calib_iters argument Signed-off-by: Kaihui-intel --- examples/3.x_api/pytorch/cv/static_quant/main.py | 5 ++++- .../quantization/static_quant/pt2e/run_clm_no_trainer.py | 7 +++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py index a9174e72837..3ab2d6bd6ad 100644 --- a/examples/3.x_api/pytorch/cv/static_quant/main.py +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -79,6 +79,8 @@ parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', help='quantize model') +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") best_acc1 = 0 @@ -286,7 +288,8 @@ def main_worker(gpu, ngpus_per_node, args): prepared_model = prepare(exported_model, quant_config=quant_config) # Calibrate - prepared_model(*example_inputs) + for i in range(args.calib_iters): + prepared_model(*example_inputs) q_model = convert(prepared_model) # Compile the quantized model and replace the Q/DQ pattern with Q-operator from torch._inductor import config diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 406cd11d7fe..98d3f11a1dd 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -1,7 +1,4 @@ import argparse -import sys - -sys.path.append('./') import time import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -24,6 +21,8 @@ parser.add_argument("--int8", action="store_true") parser.add_argument("--accuracy", action="store_true") parser.add_argument("--performance", action="store_true") +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") parser.add_argument("--iters", default=100, type=int, help="For accuracy measurement only.") parser.add_argument("--batch_size", default=1, type=int, @@ -87,7 +86,7 @@ def get_example_inputs(tokenizer): prepare_model = prepare(exported_model, quant_config) # calibrate - for i in range(2): + for i in range(args.calib_iters): prepare_model(*example_inputs) # convert converted_model = 
convert(prepare_model) From 72e5b058d814ae94cfef17ed69c70dc041c1681a Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 15:06:24 +0800 Subject: [PATCH 12/15] fix model config Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 1652fd3f3b3..0fa38480788 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -2,14 +2,14 @@ "pytorch": { "resnet18_pt2e_static":{ "model_src_dir": "cv/static_quant", - "dataset_location": "", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", "input_model": "", "main_script": "main.py", "batch_size": 1 }, "opt_125m_pt2e_static":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", "batch_size": 1 From 21de6ff4907e34a54b6706f5a781f86a36a101c9 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 16:12:11 +0800 Subject: [PATCH 13/15] fix model config Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 90479ffcb3e..029e6841040 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -50,7 +50,7 @@ "batch_size": 1 }, "opt_125m_pt2e_static":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", From cf44439480662d7e5707f1e47971911f9095bd6f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 16:36:24 +0800 Subject: [PATCH 14/15] update tasks Signed-off-by: Kaihui-intel --- .../quantization/static_quant/pt2e/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh index bc6c15d25db..6bd599483ff 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh @@ -40,7 +40,7 @@ function run_tuning { if [ "${topology}" = "opt_125m_pt2e_static" ]; then model_name_or_path="facebook/opt-125m" fi - python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy + python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai" } main "$@" From 8dbd980768418411773fa8f6b62150a64cdc024e Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 17:35:16 +0800 Subject: [PATCH 15/15] Update examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md Co-authored-by: Yi Liu <106061964+yiliu30@users.noreply.github.com> --- .../language-modeling/quantization/static_quant/pt2e/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md index bc8cb057ee5..7ad8b76bd1e 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md @@ -2,7 +2,7 @@ Step-by-Step ============ This document describes the step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch 2 Export Quantization. -The script `run_clm_no_trainer.py` supports `OPT` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. +Currently, users can use `run_clm_no_trainer.py` to quantize the `OPT` series models and validate the last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git). We will add more models in the near future. # Prerequisite ## 1. Create Environment
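Closing the series, the accuracy path this README describes hands the converted model to lm_eval through intel-extension-for-transformers. The sketch below continues from the export sketch earlier (it reuses `converted_model` and `tokenizer`); only the import path and `model="hf"` appear in these diffs, so every other `LMEvalParser` field is an assumption mirroring the script's command-line options.

```python
# Hedged sketch of the lm_eval accuracy check after conversion. Only the
# import and model="hf" are confirmed by the patches; the remaining fields
# are assumptions based on run_clm_no_trainer.py's argparse options.
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import (
    LMEvalParser,
    evaluate,
)

eval_args = LMEvalParser(
    model="hf",
    user_model=converted_model,  # assumed field: the model from convert()
    tokenizer=tokenizer,         # assumed field
    tasks="lambada_openai",      # the smoke-test task pinned in run_quant.sh
    batch_size=1,                # assumed; mirrors the --batch_size default
)
results = evaluate(eval_args)
```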