From a2247765aea5c52bb5c64eeea36724d4830f6f09 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 6 Jun 2024 14:13:16 +0800
Subject: [PATCH 01/15] add pt2e llm example

Signed-off-by: Kaihui-intel
---
 .../quantization/static_quant/pt2e/README.md  |  27 ++++
 .../static_quant/pt2e/requirements.txt        |   7 +
 .../static_quant/pt2e/run_clm_no_trainer.py   | 153 ++++++++++++++++++
 3 files changed, 187 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
 create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md
new file mode 100644
index 00000000000..bc8cb057ee5
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md
@@ -0,0 +1,27 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running large language models (LLMs) on the 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch 2 Export Quantization.
+
+The script `run_clm_no_trainer.py` currently supports `OPT` quantization and validates last-word-prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git); support for more models is being added.
+
+# Prerequisite
+## 1. Create Environment
+```
+# Installation
+pip install -r requirements.txt
+```
+
+# Run
+
+Here is how to run the script:
+
+**Causal Language Modeling (CLM)**
+
+`run_clm_no_trainer.py` quantizes a large language model using the [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) dataset and validates accuracy on `lambada_openai`, `piqa`, `winogrande`, `hellaswag`, and other tasks provided by lm_eval. The quantization flow is sketched below, followed by an example command.
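+
+Under the hood, the script follows the PyTorch 2 Export (PT2E) post-training static quantization flow: export the eager model to an FX graph, insert observers with `prepare`, run a few calibration batches, then `convert` and compile the result. The snippet below is a minimal sketch of that flow, assuming the same `neural_compressor.torch` APIs that `run_clm_no_trainer.py` imports; the toy linear model and random input are placeholders for a real model and representative calibration data.
+
+```python
+import torch
+from neural_compressor.torch.export import export
+from neural_compressor.torch.quantization import convert, get_default_static_config, prepare
+
+# toy stand-in; use your own model and representative calibration batches
+model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
+example_inputs = (torch.randn(2, 8),)
+
+exported_model = export(model, example_inputs=example_inputs)           # eager -> FX graph
+prepared_model = prepare(exported_model, get_default_static_config())   # insert observers
+prepared_model(*example_inputs)                                         # calibration pass
+quantized_model = convert(prepared_model)                               # quantize observed graph
+opt_model = torch.compile(quantized_model)                              # optional: fuse the Q/DQ pattern
+```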
+### OPT-125m
+
+#### Quantization
+
+```bash
+python run_clm_no_trainer.py --model facebook/opt-125m --quantize --accuracy
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
new file mode 100644
index 00000000000..b6d9b6c55de
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/requirements.txt
@@ -0,0 +1,7 @@
+transformers
+torch
+sentencepiece
+neural-compressor
+intel-extension-for-transformers >= 1.4.1
+lm-eval==0.4.2
+peft
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
new file mode 100644
index 00000000000..2a3ce8ebbcf
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
@@ -0,0 +1,153 @@
+import argparse
+import sys
+
+sys.path.append('./')
+import time
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--model", nargs="?", default="facebook/opt-125m"
+)
+parser.add_argument(
+    "--trust_remote_code", default=True,
+    help="Transformers parameter: use the external repo")
+parser.add_argument(
+    "--revision", default=None,
+    help="Transformers parameter: set the model hub commit number")
+parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k")
+parser.add_argument("--output_dir", nargs="?", default="./saved_results")
+parser.add_argument("--quantize", action="store_true")
+parser.add_argument("--approach", type=str, default='static',
+                    help="Select from ['dynamic', 'static', 'weight-only']")
+parser.add_argument("--int8", action="store_true")
+parser.add_argument("--accuracy", action="store_true")
+parser.add_argument("--performance", action="store_true")
+parser.add_argument("--iters", default=100, type=int,
+                    help="For performance measurement only.")
+parser.add_argument("--batch_size", default=1, type=int,
+                    help="For accuracy and performance measurement.")
+parser.add_argument("--tasks", default="lambada_openai,hellaswag,winogrande,piqa,wikitext",
+                    type=str, help="tasks for accuracy validation")
+parser.add_argument("--peft_model_id", type=str, default=None, help="model_name_or_path of peft model")
+# =======================================
+
+args = parser.parse_args()
+
+
+def get_user_model():
+    torchscript = False
+    user_model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torchscript=torchscript,  # torchscript will force `return_dict=False` to avoid jit errors
+        trust_remote_code=args.trust_remote_code,
+        revision=args.revision,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+
+    if args.peft_model_id is not None:
+        from peft import PeftModel
+        user_model = PeftModel.from_pretrained(user_model, args.peft_model_id)
+
+    # to channels last
+    user_model = user_model.to(memory_format=torch.channels_last)
+    user_model.eval()
+    return user_model, tokenizer
+
+user_model, tokenizer = get_user_model()
+if args.quantize:
+    from neural_compressor.torch.quantization import (
+        convert,
+        get_default_static_config,
+        prepare,
+    )
+    from neural_compressor.torch.export import export
+    from torch.export import Dim
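+
+    # PT2E static quantization flow used below: torch.export captures the eager
+    # model as an FX graph (with symbolic batch/sequence dims declared via Dim),
+    # prepare() inserts observers, a couple of forward passes collect calibration
+    # statistics, and convert() rewrites the observed graph into a quantized one
+    # that torch.compile can then fuse.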
+
+    def get_example_inputs(tokenizer):
+        text = "Hello, welcome to LLM world."
+        encoded_input = tokenizer(text, return_tensors="pt")
+
+        example_inputs = encoded_input
+        # print(f"example_inputs: {example_inputs}")
+        input_ids = example_inputs["input_ids"]
+        input_ids_batch = torch.cat((input_ids, input_ids), dim=0)
+        print(f"input_ids_batch shape: {input_ids_batch.shape}")
+        tuple_inputs = (input_ids_batch,)
+        return tuple_inputs
+
+    # os.environ["TOKENIZERS_PARALLELISM"] = "false"  # set TOKENIZERS_PARALLELISM to false if needed
+    # torch._dynamo.config.cache_size_limit = 4  # set a limit if out of memory
+    batch = Dim(name="batch_size")
+    seq_len = Dim(name="seq_len")
+    dynamic_shapes = {"input_ids": (batch, seq_len)}
+    example_inputs = get_example_inputs(tokenizer)
+    exported_model = export(user_model, example_inputs=example_inputs, dynamic_shapes=dynamic_shapes)
+
+    quant_config = get_default_static_config()
+    # prepare
+    prepare_model = prepare(exported_model, quant_config)
+
+    # calibrate
+    for i in range(2):
+        prepare_model(*example_inputs)
+    # convert
+    converted_model = convert(prepare_model)
+    # inference
+    from torch._inductor import config
+
+    config.freezing = True
+    opt_model = torch.compile(converted_model)
+
+    opt_model.config = user_model.config  # for lm eval
+    user_model = opt_model
+
+
+if args.accuracy:
+    # user_model.eval()
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+    eval_args = LMEvalParser(
+        model="hf",
+        user_model=user_model,
+        tokenizer=tokenizer,
+        batch_size=args.batch_size,
+        tasks=args.tasks,
+        device="cpu",
+    )
+    results = evaluate(eval_args)
+    for task_name in args.tasks.split(","):
+        if task_name == "wikitext":
+            acc = results["results"][task_name]["word_perplexity,none"]
+        else:
+            acc = results["results"][task_name]["acc,none"]
+        print("Accuracy: %.5f" % acc)
+    print('Batch size = %d' % args.batch_size)

+if args.performance:
+    # user_model.eval()
+    from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+
+    samples = args.iters * args.batch_size
+    eval_args = LMEvalParser(
+        model="hf",
+        user_model=user_model,
+        tokenizer=tokenizer,
+        batch_size=args.batch_size,
+        tasks=args.tasks,
+        limit=samples,
+        device="cpu",
+    )
+    start = time.time()
+    results = evaluate(eval_args)
+    end = time.time()
+    for task_name in args.tasks.split(","):
+        if task_name == "wikitext":
+            acc = results["results"][task_name]["word_perplexity,none"]
+        else:
+            acc = results["results"][task_name]["acc,none"]
+        print("Accuracy: %.5f" % acc)
+    print('Throughput: %.3f samples/sec' % (samples / (end - start)))
+    print('Latency: %.3f ms' % ((end - start) * 1000 / samples))
+    print('Batch size = %d' % args.batch_size)

From 545228906d3bcb40a9dcfa745121644e2f4d51da Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Thu, 6 Jun 2024 15:45:00 +0800
Subject: [PATCH 02/15] add cv example

Signed-off-by: Kaihui-intel
---
 .../3.x_api/pytorch/cv/static_quant/README.md |  27 +
 .../pytorch/cv/static_quant/extract_ILSVRC.sh |  80 +++
 .../3.x_api/pytorch/cv/static_quant/main.py   | 514 ++++++++++++++++++
 .../pytorch/cv/static_quant/requirements.txt  |   3 +
 4 files changed, 624 insertions(+)
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/README.md
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/main.py
 create mode 100644 examples/3.x_api/pytorch/cv/static_quant/requirements.txt

diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md
new file mode 100644
index 00000000000..2ba08952e1b
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/static_quant/README.md
@@ -0,0 +1,27 @@
+# ImageNet training in PyTorch
+
+This implements quantization of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.
+
+## Requirements
+
+- Install the requirements
+  - `pip install -r requirements.txt`
+- Download the ImageNet dataset from http://www.image-net.org/
+  - Then, move and extract the training and validation images into labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
+
+## Quantization
+
+To quantize a model and validate its accuracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset:
+
+```bash
+python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e
+```
+
+
+## Use Dummy Data
+
+The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". Note that the reported loss and accuracy are meaningless in this case.
+
+```bash
+python main.py -a resnet18 --dummy -q -e
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh
new file mode 100644
index 00000000000..3ec05e8f328
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/static_quant/extract_ILSVRC.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# script to extract the ImageNet dataset:
+# ILSVRC2012_img_train.tar (about 138 GB)
+# ILSVRC2012_img_val.tar (about 6.3 GB)
+# make sure ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are in your current directory
+#
+# Adapted from:
+# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
+# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+# imagenet/val/
+# ├── n01440764
+# │   ├── ILSVRC2012_val_00000293.JPEG
+# │   ├── ILSVRC2012_val_00002138.JPEG
+# │   ├── ......
+# ├── ......
+#
+#
+# Make imagenet directory
+#
+mkdir imagenet
+#
+# Extract the training data:
+#
+# Create train directory; move .tar file; change directory
+mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train
+# Extract training set; remove compressed file
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+#
+# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
+#
+# For each .tar file:
+#   1. create directory with same name as .tar file
+#   2. extract and copy contents of .tar file into directory
+#   3. remove .tar file
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+#
+# This results in a training directory like so:
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+#
+# Change back to original directory
+cd ../..
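+#
+# (Optional) sanity-check the training extraction at this point; the expected
+# count assumes the standard ILSVRC2012 release with 1000 classes:
+#
+# $ find imagenet/train -mindepth 1 -maxdepth 1 -type d | wc -l
+# 1000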
+# +# Extract the validation data and move images to subfolders: +# +# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file +mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar +# get script from soumith and run; this script creates all class directories and moves images into corresponding directories +wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash +# +# This results in a validation directory like so: +# +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Check total files after extract +# +# $ find train/ -name "*.JPEG" | wc -l +# 1281167 +# $ find val/ -name "*.JPEG" | wc -l +# 50000 +# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py new file mode 100644 index 00000000000..6617dadba5e --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -0,0 +1,514 @@ +import argparse +import os +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Subset + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', 
action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") +parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', + help='quantize model') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + cudnn.benchmark = False + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if torch.cuda.is_available(): + ngpus_per_node = torch.cuda.device_count() + if ngpus_per_node == 1 and args.dist_backend == "nccl": + warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") + else: + ngpus_per_node = 1 + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + model = model.to(device) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + if torch.cuda.is_available(): + if args.gpu: + device = torch.device('cuda:{}'.format(args.gpu)) + else: + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + if args.quantize: + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import prepare, convert, get_default_static_config + + # Prepare the float model and example inputs for export model + model = model + x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Export eager model into FX graph model + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = get_default_static_config() + + prepared_model = prepare(exported_model, quant_config=quant_config) + # Calibrate + prepared_model(*example_inputs) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with Q-operator + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) + model = opt_model + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + +def train(train_loader, model, criterion, optimizer, epoch, device, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in 
enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + + # move data to the same device as model + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + if args.gpu is not None and torch.cuda.is_available(): + images = images.cuda(args.gpu, non_blocking=True) + if torch.backends.mps.is_available(): + images = images.to('mps') + target = target.to('mps') + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode, pt2e no eval() or train() + # model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = 
torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/requirements.txt b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt new file mode 100644 index 00000000000..ebd3df6ae7a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +neural-compressor \ No newline at end of file From ded7257751dc8314f65f11ccb51ddc710b7797ef Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 6 Jun 2024 16:03:10 +0800 Subject: [PATCH 03/15] add dynamic example Signed-off-by: Kaihui-intel --- .../pytorch/cv/dynamic_quant/README.md | 27 + .../cv/dynamic_quant/extract_ILSVRC.sh | 80 +++ .../3.x_api/pytorch/cv/dynamic_quant/main.py | 512 ++++++++++++++++++ .../pytorch/cv/dynamic_quant/requirements.txt | 3 + 4 files changed, 622 insertions(+) create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/README.md create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/main.py create mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/README.md b/examples/3.x_api/pytorch/cv/dynamic_quant/README.md new file mode 100644 index 00000000000..2ba08952e1b --- /dev/null +++ b/examples/3.x_api/pytorch/cv/dynamic_quant/README.md @@ -0,0 +1,27 @@ +# ImageNet training in PyTorch + +This implements 
quantization of popular model architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.
+
+## Requirements
+
+- Install the requirements
+  - `pip install -r requirements.txt`
+- Download the ImageNet dataset from http://www.image-net.org/
+  - Then, move and extract the training and validation images into labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
+
+## Quantization
+
+To quantize a model and validate its accuracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset:
+
+```bash
+python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e
+```
+
+
+## Use Dummy Data
+
+The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". Note that the reported loss and accuracy are meaningless in this case.
+
+```bash
+python main.py -a resnet18 --dummy -q -e
+```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
new file mode 100644
index 00000000000..3ec05e8f328
--- /dev/null
+++ b/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# script to extract the ImageNet dataset:
+# ILSVRC2012_img_train.tar (about 138 GB)
+# ILSVRC2012_img_val.tar (about 6.3 GB)
+# make sure ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are in your current directory
+#
+# Adapted from:
+# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
+# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+# imagenet/val/
+# ├── n01440764
+# │   ├── ILSVRC2012_val_00000293.JPEG
+# │   ├── ILSVRC2012_val_00002138.JPEG
+# │   ├── ......
+# ├── ......
+#
+#
+# Make imagenet directory
+#
+mkdir imagenet
+#
+# Extract the training data:
+#
+# Create train directory; move .tar file; change directory
+mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train
+# Extract training set; remove compressed file
+tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
+#
+# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
+#
+# For each .tar file:
+#   1. create directory with same name as .tar file
+#   2. extract and copy contents of .tar file into directory
+#   3. remove .tar file
+find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
+#
+# This results in a training directory like so:
+#
+# imagenet/train/
+# ├── n01440764
+# │   ├── n01440764_10026.JPEG
+# │   ├── n01440764_10027.JPEG
+# │   ├── ......
+# ├── ......
+#
+# Change back to original directory
+cd ../..
+# +# Extract the validation data and move images to subfolders: +# +# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file +mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar +# get script from soumith and run; this script creates all class directories and moves images into corresponding directories +wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash +# +# This results in a validation directory like so: +# +# imagenet/val/ +# ├── n01440764 +# │ ├── ILSVRC2012_val_00000293.JPEG +# │ ├── ILSVRC2012_val_00002138.JPEG +# │ ├── ...... +# ├── ...... +# +# +# Check total files after extract +# +# $ find train/ -name "*.JPEG" | wc -l +# 1281167 +# $ find val/ -name "*.JPEG" | wc -l +# 50000 +# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/main.py b/examples/3.x_api/pytorch/cv/dynamic_quant/main.py new file mode 100644 index 00000000000..6b420ec342a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/dynamic_quant/main.py @@ -0,0 +1,512 @@ +import argparse +import os +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.datasets as datasets +import torchvision.models as models +import torchvision.transforms as transforms +from torch.optim.lr_scheduler import StepLR +from torch.utils.data import Subset + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=10, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', 
action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-backend', default='nccl', type=str, + help='distributed backend') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. ') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") +parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', + help='quantize model') + +best_acc1 = 0 + + +def main(): + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + cudnn.benchmark = False + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.dist_url == "env://" and args.world_size == -1: + args.world_size = int(os.environ["WORLD_SIZE"]) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + if torch.cuda.is_available(): + ngpus_per_node = torch.cuda.device_count() + if ngpus_per_node == 1 and args.dist_backend == "nccl": + warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") + else: + ngpus_per_node = 1 + + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(args.gpu, ngpus_per_node, args) + + +def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + args.gpu = gpu + + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if not torch.cuda.is_available() and not torch.backends.mps.is_available(): + print('using CPU, this will be slow') + elif args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if torch.cuda.is_available(): + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.cuda() + # DistributedDataParallel will divide and allocate batch_size to all + # available GPUs if device_ids are not set + model = torch.nn.parallel.DistributedDataParallel(model) + elif args.gpu is not None and torch.cuda.is_available(): + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + elif torch.backends.mps.is_available(): + device = torch.device("mps") + model = model.to(device) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() + + if torch.cuda.is_available(): + if args.gpu: + device = torch.device('cuda:{}'.format(args.gpu)) + else: + device = torch.device("cuda") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + # define loss function (criterion), optimizer, and learning rate scheduler + criterion = nn.CrossEntropyLoss().to(device) + + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None: + checkpoint = torch.load(args.resume) + elif torch.cuda.is_available(): + # Map model to be loaded to specified single gpu. 
+ loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + + # Data loading code + if args.dummy: + print("=> Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, sampler=val_sampler) + + if args.quantize: + from neural_compressor.torch.export import export + from neural_compressor.torch.quantization import prepare, convert, get_default_dynamic_config + + # Prepare the float model and example inputs for export model + model = model + x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) + example_inputs = (x,) + + # Export eager model into FX graph model + exported_model = export(model=model, example_inputs=example_inputs) + # Quantize the model + quant_config = get_default_dynamic_config() + + prepared_model = prepare(exported_model, quant_config=quant_config) + q_model = convert(prepared_model) + # Compile the quantized model and replace the Q/DQ pattern with Q-operator + from torch._inductor import config + + config.freezing = True + opt_model = torch.compile(q_model) + model = opt_model + + + if args.evaluate: + validate(val_loader, model, criterion, args) + return + +def train(train_loader, model, criterion, optimizer, epoch, device, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + end = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading 
time + data_time.update(time.time() - end) + + # move data to the same device as model + images = images.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + +def validate(val_loader, model, criterion, args): + + def run_validate(loader, base_progress=0): + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(loader): + i = base_progress + i + if args.gpu is not None and torch.cuda.is_available(): + images = images.cuda(args.gpu, non_blocking=True) + if torch.backends.mps.is_available(): + images = images.to('mps') + target = target.to('mps') + if torch.cuda.is_available(): + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + progress.display(i + 1) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode, pt2e no eval() or train() + # model.eval() + + run_validate(val_loader) + if args.distributed: + top1.all_reduce() + top5.all_reduce() + + if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): + aux_val_dataset = Subset(val_loader.dataset, + range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) + aux_val_loader = torch.utils.data.DataLoader( + aux_val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + run_validate(aux_val_loader, len(val_loader)) + + progress.display_summary() + + return top1.avg + + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + if torch.cuda.is_available(): + device = torch.device("cuda") + elif 
torch.backends.mps.is_available(): + device = torch.device("mps") + else: + device = torch.device("cpu") + total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt b/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt new file mode 100644 index 00000000000..ebd3df6ae7a --- /dev/null +++ b/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +neural-compressor \ No newline at end of file From 5ef555b37754030a1e5606b99969dd6fa19ea7e1 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 6 Jun 2024 16:15:48 +0800 Subject: [PATCH 04/15] remove dynamic Signed-off-by: Kaihui-intel --- .../pytorch/cv/dynamic_quant/README.md | 27 - .../cv/dynamic_quant/extract_ILSVRC.sh | 80 --- .../3.x_api/pytorch/cv/dynamic_quant/main.py | 512 ------------------ .../pytorch/cv/dynamic_quant/requirements.txt | 3 - 4 files changed, 622 deletions(-) delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/README.md delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/main.py delete mode 100644 examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/README.md b/examples/3.x_api/pytorch/cv/dynamic_quant/README.md deleted file mode 100644 index 2ba08952e1b..00000000000 --- a/examples/3.x_api/pytorch/cv/dynamic_quant/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# ImageNet training in PyTorch - -This implements quantization of popular model 
architectures, such as ResNet, AlexNet, and VGG, on the ImageNet dataset.
-
-## Requirements
-
-- Install the requirements
-  - `pip install -r requirements.txt`
-- Download the ImageNet dataset from http://www.image-net.org/
-  - Then, move and extract the training and validation images into labeled subfolders, using [the following shell script](extract_ILSVRC.sh)
-
-## Quantization
-
-To quantize a model and validate its accuracy, run `main.py` with the desired model architecture and the path to the ImageNet dataset:
-
-```bash
-python main.py -a resnet18 [imagenet-folder with train and val folders] -q -e
-```
-
-
-## Use Dummy Data
-
-The ImageNet dataset is large and time-consuming to download. To get started quickly, run `main.py` with dummy data by passing "--dummy". Note that the reported loss and accuracy are meaningless in this case.
-
-```bash
-python main.py -a resnet18 --dummy -q -e
-```
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh b/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
deleted file mode 100644
index 3ec05e8f328..00000000000
--- a/examples/3.x_api/pytorch/cv/dynamic_quant/extract_ILSVRC.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-#
-# script to extract the ImageNet dataset:
-# ILSVRC2012_img_train.tar (about 138 GB)
-# ILSVRC2012_img_val.tar (about 6.3 GB)
-# make sure ILSVRC2012_img_train.tar and ILSVRC2012_img_val.tar are in your current directory
-#
-# Adapted from:
-# https://github.com/facebook/fb.resnet.torch/blob/master/INSTALL.md
-# https://gist.github.com/BIGBALLON/8a71d225eff18d88e469e6ea9b39cef4
-#
-# imagenet/train/
-# ├── n01440764
-# │   ├── n01440764_10026.JPEG
-# │   ├── n01440764_10027.JPEG
-# │   ├── ......
-# ├── ......
-# imagenet/val/
-# ├── n01440764
-# │   ├── ILSVRC2012_val_00000293.JPEG
-# │   ├── ILSVRC2012_val_00002138.JPEG
-# │   ├── ......
-# ├── ......
-#
-#
-# Make imagenet directory
-#
-mkdir imagenet
-#
-# Extract the training data:
-#
-# Create train directory; move .tar file; change directory
-mkdir imagenet/train && mv ILSVRC2012_img_train.tar imagenet/train/ && cd imagenet/train
-# Extract training set; remove compressed file
-tar -xvf ILSVRC2012_img_train.tar && rm -f ILSVRC2012_img_train.tar
-#
-# At this stage imagenet/train will contain 1000 compressed .tar files, one for each category
-#
-# For each .tar file:
-#   1. create directory with same name as .tar file
-#   2. extract and copy contents of .tar file into directory
-#   3. remove .tar file
-find . -name "*.tar" | while read NAME ; do mkdir -p "${NAME%.tar}"; tar -xvf "${NAME}" -C "${NAME%.tar}"; rm -f "${NAME}"; done
-#
-# This results in a training directory like so:
-#
-# imagenet/train/
-# ├── n01440764
-# │   ├── n01440764_10026.JPEG
-# │   ├── n01440764_10027.JPEG
-# │   ├── ......
-# ├── ......
-#
-# Change back to original directory
-cd ../..
-# -# Extract the validation data and move images to subfolders: -# -# Create validation directory; move .tar file; change directory; extract validation .tar; remove compressed file -mkdir imagenet/val && mv ILSVRC2012_img_val.tar imagenet/val/ && cd imagenet/val && tar -xvf ILSVRC2012_img_val.tar && rm -f ILSVRC2012_img_val.tar -# get script from soumith and run; this script creates all class directories and moves images into corresponding directories -wget -qO- https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh | bash -# -# This results in a validation directory like so: -# -# imagenet/val/ -# ├── n01440764 -# │ ├── ILSVRC2012_val_00000293.JPEG -# │ ├── ILSVRC2012_val_00002138.JPEG -# │ ├── ...... -# ├── ...... -# -# -# Check total files after extract -# -# $ find train/ -name "*.JPEG" | wc -l -# 1281167 -# $ find val/ -name "*.JPEG" | wc -l -# 50000 -# \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/main.py b/examples/3.x_api/pytorch/cv/dynamic_quant/main.py deleted file mode 100644 index 6b420ec342a..00000000000 --- a/examples/3.x_api/pytorch/cv/dynamic_quant/main.py +++ /dev/null @@ -1,512 +0,0 @@ -import argparse -import os -import random -import shutil -import time -import warnings -from enum import Enum - -import torch -import torch.backends.cudnn as cudnn -import torch.distributed as dist -import torch.multiprocessing as mp -import torch.nn as nn -import torch.nn.parallel -import torch.optim -import torch.utils.data -import torch.utils.data.distributed -import torchvision.datasets as datasets -import torchvision.models as models -import torchvision.transforms as transforms -from torch.optim.lr_scheduler import StepLR -from torch.utils.data import Subset - -model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) - -parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') -parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', - help='path to dataset (default: imagenet)') -parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet18)') -parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', - help='number of data loading workers (default: 4)') -parser.add_argument('--epochs', default=90, type=int, metavar='N', - help='number of total epochs to run') -parser.add_argument('--start-epoch', default=0, type=int, metavar='N', - help='manual epoch number (useful on restarts)') -parser.add_argument('-b', '--batch-size', default=256, type=int, - metavar='N', - help='mini-batch size (default: 256), this is the total ' - 'batch size of all GPUs on the current node when ' - 'using Data Parallel or Distributed Data Parallel') -parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, - metavar='LR', help='initial learning rate', dest='lr') -parser.add_argument('--momentum', default=0.9, type=float, metavar='M', - help='momentum') -parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, - metavar='W', help='weight decay (default: 1e-4)', - dest='weight_decay') -parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') -parser.add_argument('--resume', default='', type=str, metavar='PATH', - help='path to latest checkpoint (default: none)') -parser.add_argument('-e', '--evaluate', 
dest='evaluate', action='store_true', - help='evaluate model on validation set') -parser.add_argument('--pretrained', dest='pretrained', action='store_true', - help='use pre-trained model') -parser.add_argument('--world-size', default=-1, type=int, - help='number of nodes for distributed training') -parser.add_argument('--rank', default=-1, type=int, - help='node rank for distributed training') -parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, - help='url used to set up distributed training') -parser.add_argument('--dist-backend', default='nccl', type=str, - help='distributed backend') -parser.add_argument('--seed', default=None, type=int, - help='seed for initializing training. ') -parser.add_argument('--gpu', default=None, type=int, - help='GPU id to use.') -parser.add_argument('--multiprocessing-distributed', action='store_true', - help='Use multi-processing distributed training to launch ' - 'N processes per node, which has N GPUs. This is the ' - 'fastest way to use PyTorch for either single node or ' - 'multi node data parallel training') -parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") -parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', - help='quantize model') - -best_acc1 = 0 - - -def main(): - args = parser.parse_args() - - if args.seed is not None: - random.seed(args.seed) - torch.manual_seed(args.seed) - cudnn.deterministic = True - cudnn.benchmark = False - warnings.warn('You have chosen to seed training. ' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') - - if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. 
This will completely ' - 'disable data parallelism.') - - if args.dist_url == "env://" and args.world_size == -1: - args.world_size = int(os.environ["WORLD_SIZE"]) - - args.distributed = args.world_size > 1 or args.multiprocessing_distributed - - if torch.cuda.is_available(): - ngpus_per_node = torch.cuda.device_count() - if ngpus_per_node == 1 and args.dist_backend == "nccl": - warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'") - else: - ngpus_per_node = 1 - - if args.multiprocessing_distributed: - # Since we have ngpus_per_node processes per node, the total world_size - # needs to be adjusted accordingly - args.world_size = ngpus_per_node * args.world_size - # Use torch.multiprocessing.spawn to launch distributed processes: the - # main_worker process function - mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) - else: - # Simply call main_worker function - main_worker(args.gpu, ngpus_per_node, args) - - -def main_worker(gpu, ngpus_per_node, args): - global best_acc1 - args.gpu = gpu - - if args.gpu is not None: - print("Use GPU: {} for training".format(args.gpu)) - - if args.distributed: - if args.dist_url == "env://" and args.rank == -1: - args.rank = int(os.environ["RANK"]) - if args.multiprocessing_distributed: - # For multiprocessing distributed training, rank needs to be the - # global rank among all the processes - args.rank = args.rank * ngpus_per_node + gpu - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size, rank=args.rank) - # create model - if args.pretrained: - print("=> using pre-trained model '{}'".format(args.arch)) - model = models.__dict__[args.arch](pretrained=True) - else: - print("=> creating model '{}'".format(args.arch)) - model = models.__dict__[args.arch]() - - if not torch.cuda.is_available() and not torch.backends.mps.is_available(): - print('using CPU, this will be slow') - elif args.distributed: - # For multiprocessing distributed, DistributedDataParallel constructor - # should always set the single device scope, otherwise, - # DistributedDataParallel will use all available devices. - if torch.cuda.is_available(): - if args.gpu is not None: - torch.cuda.set_device(args.gpu) - model.cuda(args.gpu) - # When using a single GPU per process and per - # DistributedDataParallel, we need to divide the batch size - # ourselves based on the total number of GPUs of the current node. 
- args.batch_size = int(args.batch_size / ngpus_per_node) - args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) - else: - model.cuda() - # DistributedDataParallel will divide and allocate batch_size to all - # available GPUs if device_ids are not set - model = torch.nn.parallel.DistributedDataParallel(model) - elif args.gpu is not None and torch.cuda.is_available(): - torch.cuda.set_device(args.gpu) - model = model.cuda(args.gpu) - elif torch.backends.mps.is_available(): - device = torch.device("mps") - model = model.to(device) - else: - # DataParallel will divide and allocate batch_size to all available GPUs - if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): - model.features = torch.nn.DataParallel(model.features) - model.cuda() - else: - model = torch.nn.DataParallel(model).cuda() - - if torch.cuda.is_available(): - if args.gpu: - device = torch.device('cuda:{}'.format(args.gpu)) - else: - device = torch.device("cuda") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - else: - device = torch.device("cpu") - # define loss function (criterion), optimizer, and learning rate scheduler - criterion = nn.CrossEntropyLoss().to(device) - - optimizer = torch.optim.SGD(model.parameters(), args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) - - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - scheduler = StepLR(optimizer, step_size=30, gamma=0.1) - - # optionally resume from a checkpoint - if args.resume: - if os.path.isfile(args.resume): - print("=> loading checkpoint '{}'".format(args.resume)) - if args.gpu is None: - checkpoint = torch.load(args.resume) - elif torch.cuda.is_available(): - # Map model to be loaded to specified single gpu. 
- loc = 'cuda:{}'.format(args.gpu) - checkpoint = torch.load(args.resume, map_location=loc) - args.start_epoch = checkpoint['epoch'] - best_acc1 = checkpoint['best_acc1'] - if args.gpu is not None: - # best_acc1 may be from a checkpoint from a different GPU - best_acc1 = best_acc1.to(args.gpu) - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - scheduler.load_state_dict(checkpoint['scheduler']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch'])) - else: - print("=> no checkpoint found at '{}'".format(args.resume)) - - - # Data loading code - if args.dummy: - print("=> Dummy data is used!") - train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) - val_dataset = datasets.FakeData(50000, (3, 224, 224), 1000, transforms.ToTensor()) - else: - traindir = os.path.join(args.data, 'train') - valdir = os.path.join(args.data, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]) - - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - val_dataset = datasets.ImageFolder( - valdir, - transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])) - - if args.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) - val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) - else: - train_sampler = None - val_sampler = None - - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) - - val_loader = torch.utils.data.DataLoader( - val_dataset, batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True, sampler=val_sampler) - - if args.quantize: - from neural_compressor.torch.export import export - from neural_compressor.torch.quantization import prepare, convert, get_default_dynamic_config - - # Prepare the float model and example inputs for export model - model = model - x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last) - example_inputs = (x,) - - # Export eager model into FX graph model - exported_model = export(model=model, example_inputs=example_inputs) - # Quantize the model - quant_config = get_default_dynamic_config() - - prepared_model = prepare(exported_model, quant_config=quant_config) - q_model = convert(prepared_model) - # Compile the quantized model and replace the Q/DQ pattern with Q-operator - from torch._inductor import config - - config.freezing = True - opt_model = torch.compile(q_model) - model = opt_model - - - if args.evaluate: - validate(val_loader, model, criterion, args) - return - -def train(train_loader, model, criterion, optimizer, epoch, device, args): - batch_time = AverageMeter('Time', ':6.3f') - data_time = AverageMeter('Data', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') - progress = ProgressMeter( - len(train_loader), - [batch_time, data_time, losses, top1, top5], - prefix="Epoch: [{}]".format(epoch)) - - # switch to train mode - model.train() - - end = time.time() - for i, (images, target) in enumerate(train_loader): - # measure data loading 
time - data_time.update(time.time() - end) - - # move data to the same device as model - images = images.to(device, non_blocking=True) - target = target.to(device, non_blocking=True) - - # compute output - output = model(images) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # compute gradient and do SGD step - optimizer.zero_grad() - loss.backward() - optimizer.step() - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i + 1) - - -def validate(val_loader, model, criterion, args): - - def run_validate(loader, base_progress=0): - with torch.no_grad(): - end = time.time() - for i, (images, target) in enumerate(loader): - i = base_progress + i - if args.gpu is not None and torch.cuda.is_available(): - images = images.cuda(args.gpu, non_blocking=True) - if torch.backends.mps.is_available(): - images = images.to('mps') - target = target.to('mps') - if torch.cuda.is_available(): - target = target.cuda(args.gpu, non_blocking=True) - - # compute output - output = model(images) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), images.size(0)) - top1.update(acc1[0], images.size(0)) - top5.update(acc5[0], images.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - progress.display(i + 1) - - batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) - losses = AverageMeter('Loss', ':.4e', Summary.NONE) - top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) - top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) - progress = ProgressMeter( - len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), - [batch_time, losses, top1, top5], - prefix='Test: ') - - # switch to evaluate mode, pt2e no eval() or train() - # model.eval() - - run_validate(val_loader) - if args.distributed: - top1.all_reduce() - top5.all_reduce() - - if args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset)): - aux_val_dataset = Subset(val_loader.dataset, - range(len(val_loader.sampler) * args.world_size, len(val_loader.dataset))) - aux_val_loader = torch.utils.data.DataLoader( - aux_val_dataset, batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True) - run_validate(aux_val_loader, len(val_loader)) - - progress.display_summary() - - return top1.avg - - -def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): - torch.save(state, filename) - if is_best: - shutil.copyfile(filename, 'model_best.pth.tar') - -class Summary(Enum): - NONE = 0 - AVERAGE = 1 - SUM = 2 - COUNT = 3 - -class AverageMeter(object): - """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): - self.name = name - self.fmt = fmt - self.summary_type = summary_type - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - def all_reduce(self): - if torch.cuda.is_available(): - device = torch.device("cuda") - elif 
torch.backends.mps.is_available(): - device = torch.device("mps") - else: - device = torch.device("cpu") - total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device) - dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) - self.sum, self.count = total.tolist() - self.avg = self.sum / self.count - - def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' - return fmtstr.format(**self.__dict__) - - def summary(self): - fmtstr = '' - if self.summary_type is Summary.NONE: - fmtstr = '' - elif self.summary_type is Summary.AVERAGE: - fmtstr = '{name} {avg:.3f}' - elif self.summary_type is Summary.SUM: - fmtstr = '{name} {sum:.3f}' - elif self.summary_type is Summary.COUNT: - fmtstr = '{name} {count:.3f}' - else: - raise ValueError('invalid summary type %r' % self.summary_type) - - return fmtstr.format(**self.__dict__) - - -class ProgressMeter(object): - def __init__(self, num_batches, meters, prefix=""): - self.batch_fmtstr = self._get_batch_fmtstr(num_batches) - self.meters = meters - self.prefix = prefix - - def display(self, batch): - entries = [self.prefix + self.batch_fmtstr.format(batch)] - entries += [str(meter) for meter in self.meters] - print('\t'.join(entries)) - - def display_summary(self): - entries = [" *"] - entries += [meter.summary() for meter in self.meters] - print(' '.join(entries)) - - def _get_batch_fmtstr(self, num_batches): - num_digits = len(str(num_batches // 1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt b/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt deleted file mode 100644 index ebd3df6ae7a..00000000000 --- a/examples/3.x_api/pytorch/cv/dynamic_quant/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -torch -torchvision -neural-compressor \ No newline at end of file From d131d74b91fe98f7deef2e63f1caafc960dce598 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Thu, 6 Jun 2024 16:21:52 +0800 Subject: [PATCH 05/15] fix title Signed-off-by: Kaihui-intel --- examples/3.x_api/pytorch/cv/static_quant/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md index 2ba08952e1b..922fd12ae8d 100644 --- a/examples/3.x_api/pytorch/cv/static_quant/README.md +++ b/examples/3.x_api/pytorch/cv/static_quant/README.md @@ -1,4 +1,4 @@ -# ImageNet training in PyTorch +# ImageNet Quantization This implements quantization of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset. 
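Before the next patch trims it further, it is worth seeing the whole PT2E static flow that these CV patches leave inside `main_worker`. Pulled out of the training script, it reduces to the sketch below, which reuses only the calls visible in the diffs (`export`, `prepare`, `convert`, and inductor freezing); the torchvision ResNet-18 and the random calibration tensor are stand-in assumptions for the real ImageNet pipeline.

```python
# Condensed sketch of the PT2E static quantization flow from main.py.
# Assumptions: torchvision is installed, and random tensors stand in for
# real ImageNet calibration batches.
import torch
import torchvision.models as models

from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import (
    convert,
    get_default_static_config,
    prepare,
)

# Mirrors the script's --pretrained path (downloads weights on first use).
model = models.resnet18(pretrained=True).eval()
x = torch.randn(1, 3, 224, 224).contiguous(memory_format=torch.channels_last)
example_inputs = (x,)

# Export the eager model into an FX graph, then insert observers.
exported_model = export(model=model, example_inputs=example_inputs)
prepared_model = prepare(exported_model, quant_config=get_default_static_config())

# Calibrate with a few forward passes (later exposed as --calib_iters).
for _ in range(2):
    prepared_model(*example_inputs)

q_model = convert(prepared_model)

# Compile the quantized model; freezing lets the inductor backend replace
# the Q/DQ pattern with quantized operators, per the comment in the diffs.
from torch._inductor import config
config.freezing = True
opt_model = torch.compile(q_model)
opt_model(*example_inputs)
```

Per the comment carried through the diffs, setting `config.freezing = True` before `torch.compile` is what allows the Q/DQ pairs to be folded into fused quantized kernels.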
From e8d6888dbd82709248ab0c759a2c49109b968898 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 14 Jun 2024 11:03:32 +0800
Subject: [PATCH 06/15] update cv example

Signed-off-by: Kaihui-intel
---
 .../3.x_api/pytorch/cv/static_quant/README.md |  2 +-
 .../3.x_api/pytorch/cv/static_quant/main.py   | 48 +------------------
 2 files changed, 2 insertions(+), 48 deletions(-)

diff --git a/examples/3.x_api/pytorch/cv/static_quant/README.md b/examples/3.x_api/pytorch/cv/static_quant/README.md
index 922fd12ae8d..172f8b0e12f 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/README.md
+++ b/examples/3.x_api/pytorch/cv/static_quant/README.md
@@ -1,6 +1,6 @@
 # ImageNet Quantization
 
-This implements quantization of popular model architectures, such as ResNet, AlexNet, and VGG on the ImageNet dataset.
+This implements quantization of popular model architectures, such as ResNet, on the ImageNet dataset.
 
 ## Requirements
 
diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py
index 6617dadba5e..a9174e72837 100644
--- a/examples/3.x_api/pytorch/cv/static_quant/main.py
+++ b/examples/3.x_api/pytorch/cv/static_quant/main.py
@@ -275,8 +275,7 @@ def main_worker(gpu, ngpus_per_node, args):
         from neural_compressor.torch.export import export
         from neural_compressor.torch.quantization import prepare, convert, get_default_static_config
 
-        # Prepare the float model and example inputs for export model
-        model = model
+        # Prepare the float model and example inputs for exporting the model
         x = torch.randn(args.batch_size, 3, 224, 224).contiguous(memory_format=torch.channels_last)
         example_inputs = (x,)
 
@@ -301,51 +300,6 @@ def main_worker(gpu, ngpus_per_node, args):
         validate(val_loader, model, criterion, args)
         return
 
-def train(train_loader, model, criterion, optimizer, epoch, device, args):
-    batch_time = AverageMeter('Time', ':6.3f')
-    data_time = AverageMeter('Data', ':6.3f')
-    losses = AverageMeter('Loss', ':.4e')
-    top1 = AverageMeter('Acc@1', ':6.2f')
-    top5 = AverageMeter('Acc@5', ':6.2f')
-    progress = ProgressMeter(
-        len(train_loader),
-        [batch_time, data_time, losses, top1, top5],
-        prefix="Epoch: [{}]".format(epoch))
-
-    # switch to train mode
-    model.train()
-
-    end = time.time()
-    for i, (images, target) in enumerate(train_loader):
-        # measure data loading time
-        data_time.update(time.time() - end)
-
-        # move data to the same device as model
-        images = images.to(device, non_blocking=True)
-        target = target.to(device, non_blocking=True)
-
-        # compute output
-        output = model(images)
-        loss = criterion(output, target)
-
-        # measure accuracy and record loss
-        acc1, acc5 = accuracy(output, target, topk=(1, 5))
-        losses.update(loss.item(), images.size(0))
-        top1.update(acc1[0], images.size(0))
-        top5.update(acc5[0], images.size(0))
-
-        # compute gradient and do SGD step
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-
-        # measure elapsed time
-        batch_time.update(time.time() - end)
-        end = time.time()
-
-        if i % args.print_freq == 0:
-            progress.display(i + 1)
-
 
 def validate(val_loader, model, criterion, args):
 
From 96c946fd79f3a9102c47d81d58d8ea5c87d08612 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 14 Jun 2024 11:05:49 +0800
Subject: [PATCH 07/15] update llm example

Signed-off-by: Kaihui-intel
---
 .../quantization/static_quant/pt2e/run_clm_no_trainer.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py
b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 2a3ce8ebbcf..406cd11d7fe 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -65,19 +65,16 @@ def get_user_model(): ) from neural_compressor.torch.export import export from torch.export import Dim - # set TOKENIZERS_PARALLELISM to false def get_example_inputs(tokenizer): text = "Hello, welcome to LLM world." encoded_input = tokenizer(text, return_tensors="pt") example_inputs = encoded_input - # print(f"example_inputs: {example_inputs}") input_ids = example_inputs["input_ids"] input_ids_batch = torch.cat((input_ids, input_ids), dim=0) print(f"input_ids_batch shape: {input_ids_batch.shape}") tuple_inputs = (input_ids_batch,) return tuple_inputs - # os.environ["TOKENIZERS_PARALLELISM"] = "false" # torch._dynamo.config.cache_size_limit = 4 # set limitation if out of memory batch = Dim(name="batch_size") seq_len = Dim(name="seq_len") @@ -105,7 +102,6 @@ def get_example_inputs(tokenizer): if args.accuracy: - # user_model.eval() from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser eval_args = LMEvalParser( model="hf", From 856d03d0bfb0ccd0026289d8d08116f70959d277 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 13:43:08 +0800 Subject: [PATCH 08/15] add run_quant.sh&model config Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 18 ++++++++ .../pytorch/cv/static_quant/run_quant.sh | 45 ++++++++++++++++++ .../static_quant/pt2e/run_quant.sh | 46 +++++++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 examples/.config/model_params_pytorch_3x.json create mode 100644 examples/3.x_api/pytorch/cv/static_quant/run_quant.sh create mode 100644 examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json new file mode 100644 index 00000000000..e8298def78e --- /dev/null +++ b/examples/.config/model_params_pytorch_3x.json @@ -0,0 +1,18 @@ +{ + "pytorch": { + "resnet18_pt2e_static":{ + "model_src_dir": "cv/static_quant", + "dataset_location": "", + "input_model": "", + "main_script": "main.py", + "batch_size": 1 + }, + "opt_125m_pt2e_static":{ + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", + "dataset_location": "", + "input_model": "", + "main_script": "run_clm_no_trainer.py", + "batch_size": 1 + } + } +} \ No newline at end of file diff --git a/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh new file mode 100644 index 00000000000..ac4a5a2b668 --- /dev/null +++ b/examples/3.x_api/pytorch/cv/static_quant/run_quant.sh @@ -0,0 +1,45 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + for var in "$@" + do + case $var in + --topology=*) + topology=$(echo $var |cut -f2 -d=) + ;; + --dataset_location=*) + dataset_location=$(echo $var |cut -f2 -d=) + ;; + --input_model=*) + input_model=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac 
+    done
+
+}
+
+# run_tuning
+function run_tuning {
+    if [ "${topology}" = "resnet18_pt2e_static" ]; then
+        model_name_or_path="resnet18"
+    fi
+    python main.py -a ${model_name_or_path} ${dataset_location} -q -e
+}
+
+main "$@"
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
new file mode 100644
index 00000000000..bc6c15d25db
--- /dev/null
+++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -x
+
+function main {
+
+  init_params "$@"
+  run_tuning
+
+}
+
+# init params
+function init_params {
+  for var in "$@"
+  do
+    case $var in
+      --topology=*)
+          topology=$(echo $var |cut -f2 -d=)
+      ;;
+      --dataset_location=*)
+          dataset_location=$(echo $var |cut -f2 -d=)
+      ;;
+      --input_model=*)
+          input_model=$(echo $var |cut -f2 -d=)
+      ;;
+      --output_model=*)
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      *)
+          echo "Error: No such parameter: ${var}"
+          exit 1
+      ;;
+    esac
+  done
+
+}
+
+# run_tuning
+function run_tuning {
+
+    if [ "${topology}" = "opt_125m_pt2e_static" ]; then
+        model_name_or_path="facebook/opt-125m"
+    fi
+    python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy
+}
+
+main "$@"

From c1b2fc6df05f270593074bae1d7af8abe0337898 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Fri, 14 Jun 2024 13:59:19 +0800
Subject: [PATCH 09/15] update doc

Signed-off-by: Kaihui-intel
---
 docs/3x/PT_StaticQuant.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/docs/3x/PT_StaticQuant.md b/docs/3x/PT_StaticQuant.md
index ec967a780d4..eb5bf55d2cc 100644
--- a/docs/3x/PT_StaticQuant.md
+++ b/docs/3x/PT_StaticQuant.md
@@ -1,6 +1,5 @@
 PyTorch Static Quantization
 ========================================
-
 1. [Introduction](#introduction)
 2. [Get Started](#get-started) \
    2.1 [Static Quantization with IPEX Backend](#static-quantization-with-ipex-backend) \
@@ -9,6 +8,7 @@ PyTorch Static Quantization
    2.1.3 [Model Examples](#model-examples) \
    2.2 [Static Quantization with PT2E Backend](#static-quantization-with-pt2e-backend) \
    2.2.1 [Usage Sample with PT2E](#usage-sample-with-pt2e)
+   2.2.2 [Model Examples with PT2E](#model-examples-with-pt2e)
 
 ## Introduction
 
@@ -102,3 +102,7 @@ opt_model = torch.compile(q_model)
 ```
 
 > Note: The `set_local` of `StaticQuantConfig` will be supported after the torch 2.4 release.
+
+#### Model Examples with PT2E
+
+Users can refer to the [CV examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/cv/static_quant) and [LLM examples](https://github.com/intel/neural-compressor/blob/master/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e) for details on how to quantize a new model.
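The LLM example referenced here follows the same prepare/calibrate/convert steps, but first exports the model with symbolic batch and sequence dimensions, as the `Dim(name=...)` lines in the earlier `run_clm_no_trainer.py` diff show. The sketch below is a hedged reconstruction of that step: the `dynamic_shapes` layout is an assumption inferred from `torch.export` conventions, not a signature confirmed by these patches.

```python
# Sketch of exporting an OPT model with dynamic shapes before PT2E static
# quantization. The Dim names match the diff; passing them through
# `dynamic_shapes` is an assumption following torch.export conventions.
import torch
from torch.export import Dim
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.torch.export import export
from neural_compressor.torch.quantization import (
    convert,
    get_default_static_config,
    prepare,
)

model_name = "facebook/opt-125m"
user_model = AutoModelForCausalLM.from_pretrained(model_name)
user_model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build a batch-2 example input, mirroring get_example_inputs in the script.
encoded = tokenizer("Hello, welcome to LLM world.", return_tensors="pt")
input_ids_batch = torch.cat((encoded["input_ids"], encoded["input_ids"]), dim=0)
example_inputs = (input_ids_batch,)

# Mark batch and sequence length as symbolic so one graph serves any prompt.
batch = Dim(name="batch_size")
seq_len = Dim(name="seq_len")
dynamic_shapes = {"input_ids": (batch, seq_len)}  # assumed argument layout

exported_model = export(
    model=user_model,
    example_inputs=example_inputs,
    dynamic_shapes=dynamic_shapes,
)

quant_config = get_default_static_config()
prepared_model = prepare(exported_model, quant_config)

# Calibrate, then convert; --calib_iters later parameterizes this loop.
for _ in range(2):
    prepared_model(*example_inputs)
converted_model = convert(prepared_model)
```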
From 3d29c9565304f97758bec04eb0bea255562a9685 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 14:02:23 +0800 Subject: [PATCH 10/15] add imagenet location Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index e8298def78e..1652fd3f3b3 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -9,7 +9,7 @@ }, "opt_125m_pt2e_static":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", - "dataset_location": "", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", "input_model": "", "main_script": "run_clm_no_trainer.py", "batch_size": 1 From bf109e1687100e2a253254f8d85bf774f428e9c4 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 14:59:25 +0800 Subject: [PATCH 11/15] add calib_iters argument Signed-off-by: Kaihui-intel --- examples/3.x_api/pytorch/cv/static_quant/main.py | 5 ++++- .../quantization/static_quant/pt2e/run_clm_no_trainer.py | 7 +++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/3.x_api/pytorch/cv/static_quant/main.py b/examples/3.x_api/pytorch/cv/static_quant/main.py index a9174e72837..3ab2d6bd6ad 100644 --- a/examples/3.x_api/pytorch/cv/static_quant/main.py +++ b/examples/3.x_api/pytorch/cv/static_quant/main.py @@ -79,6 +79,8 @@ parser.add_argument('--dummy', action='store_true', help="use fake data to benchmark") parser.add_argument('-q', '--quantize', dest='quantize', action='store_true', help='quantize model') +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") best_acc1 = 0 @@ -286,7 +288,8 @@ def main_worker(gpu, ngpus_per_node, args): prepared_model = prepare(exported_model, quant_config=quant_config) # Calibrate - prepared_model(*example_inputs) + for i in range(args.calib_iters): + prepared_model(*example_inputs) q_model = convert(prepared_model) # Compile the quantized model and replace the Q/DQ pattern with Q-operator from torch._inductor import config diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py index 406cd11d7fe..98d3f11a1dd 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_clm_no_trainer.py @@ -1,7 +1,4 @@ import argparse -import sys - -sys.path.append('./') import time import torch from transformers import AutoModelForCausalLM, AutoTokenizer @@ -24,6 +21,8 @@ parser.add_argument("--int8", action="store_true") parser.add_argument("--accuracy", action="store_true") parser.add_argument("--performance", action="store_true") +parser.add_argument("--calib_iters", default=2, type=int, + help="For calibration only.") parser.add_argument("--iters", default=100, type=int, help="For accuracy measurement only.") parser.add_argument("--batch_size", default=1, type=int, @@ -87,7 +86,7 @@ def get_example_inputs(tokenizer): prepare_model = prepare(exported_model, quant_config) # calibrate - for i in range(2): + for i in range(args.calib_iters): prepare_model(*example_inputs) # convert converted_model = 
convert(prepare_model) From 72e5b058d814ae94cfef17ed69c70dc041c1681a Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 15:06:24 +0800 Subject: [PATCH 12/15] fix model config Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 1652fd3f3b3..0fa38480788 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -2,14 +2,14 @@ "pytorch": { "resnet18_pt2e_static":{ "model_src_dir": "cv/static_quant", - "dataset_location": "", + "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", "input_model": "", "main_script": "main.py", "batch_size": 1 }, "opt_125m_pt2e_static":{ "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", - "dataset_location": "/tf_dataset/pytorch/ImageNet/raw", + "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", "batch_size": 1 From 21de6ff4907e34a54b6706f5a781f86a36a101c9 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 16:12:11 +0800 Subject: [PATCH 13/15] fix model config Signed-off-by: Kaihui-intel --- examples/.config/model_params_pytorch_3x.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/.config/model_params_pytorch_3x.json b/examples/.config/model_params_pytorch_3x.json index 90479ffcb3e..029e6841040 100644 --- a/examples/.config/model_params_pytorch_3x.json +++ b/examples/.config/model_params_pytorch_3x.json @@ -50,7 +50,7 @@ "batch_size": 1 }, "opt_125m_pt2e_static":{ - "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_pt2e", + "model_src_dir": "nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e", "dataset_location": "", "input_model": "", "main_script": "run_clm_no_trainer.py", From cf44439480662d7e5707f1e47971911f9095bd6f Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 16:36:24 +0800 Subject: [PATCH 14/15] update tasks Signed-off-by: Kaihui-intel --- .../quantization/static_quant/pt2e/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh index bc6c15d25db..6bd599483ff 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/run_quant.sh @@ -40,7 +40,7 @@ function run_tuning { if [ "${topology}" = "opt_125m_pt2e_static" ]; then model_name_or_path="facebook/opt-125m" fi - python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy + python run_clm_no_trainer.py --model ${model_name_or_path} --quantize --accuracy --tasks "lambada_openai" } main "$@" From 8dbd980768418411773fa8f6b62150a64cdc024e Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Fri, 14 Jun 2024 17:35:16 +0800 Subject: [PATCH 15/15] Update examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md Co-authored-by: Yi Liu <106061964+yiliu30@users.noreply.github.com> --- .../language-modeling/quantization/static_quant/pt2e/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md index bc8cb057ee5..7ad8b76bd1e 100644 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md +++ b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/static_quant/pt2e/README.md @@ -2,7 +2,7 @@ Step-by-Step ============ This document describes the step-by-step instructions to run large language models (LLMs) on 4th Gen Intel® Xeon® Scalable Processor (codenamed Sapphire Rapids) with PyTorch 2 Export Quantization. -The script `run_clm_no_trainer.py` supports `OPT` quantization and validates last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git) now, and we are adding more models. +Currently, users can use `run_clm_no_trainer.py` to quantize the `OPT` series models and validate the last word prediction accuracy with [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness.git). We will add more models in the near future. # Prerequisite ## 1. Create Environment
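Closing the series, the accuracy path this README describes hands the converted model to lm_eval through intel-extension-for-transformers. The sketch below continues from the export sketch earlier (it reuses `converted_model` and `tokenizer`); only the import path and `model="hf"` appear in these diffs, so every other `LMEvalParser` field is an assumption mirroring the script's command-line options.

```python
# Hedged sketch of the lm_eval accuracy check after conversion. Only the
# import and model="hf" are confirmed by the patches; the remaining fields
# are assumptions based on run_clm_no_trainer.py's argparse options.
from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import (
    LMEvalParser,
    evaluate,
)

eval_args = LMEvalParser(
    model="hf",
    user_model=converted_model,  # assumed field: the model from convert()
    tokenizer=tokenizer,         # assumed field
    tasks="lambada_openai",      # the smoke-test task pinned in run_quant.sh
    batch_size=1,                # assumed; mirrors the --batch_size default
)
results = evaluate(eval_args)
```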