Smooth Quant for Tensorflow backend (#830)
Signed-off-by: Lv, Liang1 <[email protected]>
Spycsh authored May 31, 2023
1 parent d47ea8e commit 1f4127f
Showing 24 changed files with 1,547 additions and 17 deletions.
21 changes: 21 additions & 0 deletions examples/.config/model_params_tensorflow.json
@@ -1766,6 +1766,27 @@
"main_script": "run_inference.py",
"batch_size": 128
},
"distilbert_base_sq": {
"model_src_dir": "nlp/distilbert_base/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/sst2_validation_dataset",
"input_model": "/tf_dataset2/models/tensorflow/distilbert_base/fp32/distilbert_base_fp32.pb",
"main_script": "run_inference.py",
"batch_size": 128
},
"gpt2_medium_sq": {
"model_src_dir": "nlp/large_language_models/quantization/ptq/smoothquant",
"dataset_location": "",
"input_model": "/tf_dataset2/models/tensorflow/gpt2-medium",
"main_script": "main.py",
"batch_size": 16
},
"opt_125m_sq": {
"model_src_dir": "nlp/large_language_models/quantization/ptq/smoothquant",
"dataset_location": "",
"input_model": "/tf_dataset2/models/tensorflow/facebook-opt-125m",
"main_script": "main.py",
"batch_size": 16
}
}
}

17 changes: 17 additions & 0 deletions examples/tensorflow/nlp/distilbert_base/quantization/ptq/README.md
@@ -111,6 +111,23 @@ Where (default values are shown in square brackets):
* $INTRA_THREADS [28]-- The number of intra-op parallelism threads to use, which can be set to the number of physical cores per socket


### Run Smooth Quant to improve int8 accuracy

#### Tuning
```shell
bash run_tuning.sh \
--input_model=$INPUT_MODEL \
--dataset_location=$DATASET_DIR \
--output_model=$OUTPUT_MODEL \
--batch_size=$BATCH_SIZE \
--max_seq_length=$MAX_SEQ \
--warmup_steps=$WARMUPS \
--num_inter=$INTER_THREADS \
--num_intra=$INTRA_THREADS \
--sq=True
```
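
Passing `--sq=True` routes the example through the smooth quant recipe instead of the default accuracy-driven tuning. A minimal sketch of the config it builds, mirroring the Python change in this commit (`graph`, `dataloader`, and `eval_func` stand in for the example's own objects):

```python
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

# Smooth quant recipe: alpha controls how much quantization difficulty
# is migrated from activations to weights (0.6 in this example).
config = PostTrainingQuantConfig(
    calibration_sampling_size=[500],
    quant_level=1,
    recipes={"smooth_quant": True, "smooth_quant_args": {"alpha": 0.6}})
q_model = quantization.fit(model=graph, conf=config,
                           calib_dataloader=dataloader, eval_func=eval_func)
```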


Details of enabling Intel® Neural Compressor on DistilBERT base for TensorFlow
=========================

@@ -72,6 +72,7 @@ def boolean_string(s):
dest="tune",
default=False
)
arg_parser.add_argument('--sq', type=boolean_string, dest='sq', help='smooth quantization', default=False)
arg_parser.add_argument("--benchmark", type=boolean_string,
help="whether to do benchmark",
dest="benchmark",
@@ -201,7 +202,7 @@ def validate_args(self):
logger.warning("Warmup steps greater than max possible value of 22." + \
" Setting to max value of ", MAX_WARMUP_STEPS)
ARGS.warmup_steps = MAX_WARMUP_STEPS
-if ARGS.tune or (ARGS.benchmark and ARGS.mode == "accuracy"):
+if ARGS.tune or ARGS.sq or (ARGS.benchmark and ARGS.mode == "accuracy"):
ARGS.steps = MAX_STEPS
elif ARGS.benchmark:
if ARGS.steps > (MAX_STEPS - MAX_WARMUP_STEPS):
@@ -271,7 +272,7 @@ def eval_func(self, graph):
else:
pred = sess.run(output, feed_dict=feed_dict)
run_time = time.time() - start_time
-if ARGS.tune or (ARGS.benchmark and ARGS.mode == "accuracy"):
+if ARGS.tune or ARGS.sq or (ARGS.benchmark and ARGS.mode == "accuracy"):
total_correct_predictions += self.get_correct_predictions(pred, labels)
total_time += run_time
# save profiling file
@@ -287,7 +288,7 @@ def eval_func(self, graph):
with open(profiling_file, 'w') as trace_file:
trace_file.write(trace.generate_chrome_trace_format(show_memory=False))
time_per_batch = total_time / float(ARGS.steps / ARGS.batch_size)
-if ARGS.tune or (ARGS.benchmark and ARGS.mode == "accuracy"):
+if ARGS.tune or ARGS.sq or (ARGS.benchmark and ARGS.mode == "accuracy"):
accuracy = total_correct_predictions / ARGS.steps
logger.info("Accuracy: {:.4f}".format(accuracy))
if self.dataloader.batch_size == 1:
@@ -297,12 +298,17 @@

def run(self):
graph = self.load_graph()
-if ARGS.tune:
+if ARGS.tune or ARGS.sq:
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion
-accuracy_criterion = AccuracyCriterion(tolerable_loss=0.02)
-config = PostTrainingQuantConfig(calibration_sampling_size=[500],
-accuracy_criterion=accuracy_criterion)
+if ARGS.sq:
+config = PostTrainingQuantConfig(calibration_sampling_size=[500],
+quant_level=1,
+recipes={"smooth_quant": True, "smooth_quant_args": {'alpha': 0.6}})
+else:
+accuracy_criterion = AccuracyCriterion(tolerable_loss=0.02)
+config = PostTrainingQuantConfig(calibration_sampling_size=[500],
+accuracy_criterion=accuracy_criterion)
q_model = quantization.fit(model=graph, conf=config, calib_dataloader=self.dataloader,
eval_func=self.eval_func)
try:
@@ -21,6 +21,7 @@ function init_params {
num_inter=2
num_intra=28
tune=True
sq=False

for var in "$@"
do
@@ -52,6 +53,9 @@ function init_params {
--tune=*)
tune=$(echo ${var} |cut -f2 -d=)
;;
--sq=*)
sq=$(echo ${var} |cut -f2 -d=)
;;
esac
done

@@ -65,6 +69,7 @@ function run_tuning {
--data-location=${dataset_location} \
--output-graph=${output_model} \
--tune=${tune} \
--sq=${sq} \
--warmup-steps=${warmup_steps} \
--batch-size=${batch_size} \
--max-seq-length=${max_seq_length} \
@@ -0,0 +1,52 @@
Step-by-Step
============

This document lists the steps to reproduce Intel® Neural Compressor quantization and smooth quantization of TensorFlow language models such as OPT and GPT2.

## Prerequisite

```shell
# Install Intel® Neural Compressor
pip install neural-compressor
pip install -r requirements.txt
```
## Run


### Basic quantization

```shell
python main.py --model_name_or_path <MODEL_NAME>
```

`<MODEL_NAME>` can be one of the following:

- gpt2-medium
- facebook/opt-125m

### Smooth quant

```shell
bash run_tuning.sh --input_model=<MODEL_NAME>
```

Or you can use

```shell
python main.py --model_name_or_path <MODEL_NAME> --sq
```
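
The `--sq` flag enables the smooth quant recipe during post-training quantization. A sketch of the call it triggers, assuming the same recipe settings as the DistilBERT example in this commit (`model` and `calib_dataloader` are stand-ins for the script's own objects, and the save path is illustrative):

```python
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

config = PostTrainingQuantConfig(
    calibration_sampling_size=[500],
    recipes={"smooth_quant": True, "smooth_quant_args": {"alpha": 0.6}})
# calib_dataloader yields (inputs, label) pairs with batch_size=1
# (see INCDataloader in main.py below)
q_model = quantization.fit(model=model, conf=config,
                           calib_dataloader=calib_dataloader)
q_model.save("output_int8_model")  # illustrative output folder
```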

## Benchmark

### Get the FP32 performance

```shell
bash run_benchmark.sh --input_model=<MODEL_NAME>
```

### Get the INT8 performance

```shell
bash run_benchmark.sh --input_model=<MODEL_NAME> --int8=true
```
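
Under the hood, the INT8 path loads the quantized model as a TensorFlow SavedModel and calls its serving signature, as `main.py` below does. A minimal sketch with a dummy batch:

```python
import tensorflow as tf

# Folder name follows main.py's convention: <model name>_int8
model = tf.saved_model.load("opt-125m_int8")
infer = model.signatures["serving_default"]

# Dummy batch of 16 sequences padded to length 196 (the example's pad_len)
input_ids = tf.ones((16, 196), dtype=infer.inputs[0].dtype)
attention_mask = tf.ones((16, 196), dtype=infer.inputs[0].dtype)
outputs = infer(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs["Identity"].numpy()  # shape: [16, 196, vocab_size]
```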

@@ -0,0 +1,184 @@
import os.path
import transformers
import tensorflow as tf
from tqdm import tqdm
import sys
import argparse
from datasets import load_dataset
import numpy as np
import time

sys.path.insert(0, './')

parser = argparse.ArgumentParser()
parser.add_argument('--int8', action='store_true', help="eval fp32 model or int8 model")
parser.add_argument('--model_name_or_path', type=str, default='facebook/opt-125m')
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--warmup', type=int, default=10)
args = parser.parse_args()

class Evaluator:
def __init__(self, dataset, tokenizer, device, batch_size=args.batch_size):
self.dataset = dataset
self.tokenizer = tokenizer
self.device = device
self.dataloader = INCDataloader(dataset, tokenizer, batch_size, device)

def evaluate(self, model):
# The task is to predict the last word of the input.
total, hit = 0, 0
index = 1
for input_ids, label, label_indices in tqdm(self.dataloader):
# TFCausalLMOutputWithPast len: 2
# first element shape (16, 196, 50272)
# second element shape (16, 12, 196, 64)
outputs = model(input_ids)
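# gather each sequence's logits at the position that predicts its final real token
# (label_indices are negative offsets from the padded end, see INCDataloader.pad_input)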
last_token_logits = outputs[0].numpy()[np.arange(len(label_indices)), label_indices, :]
pred = last_token_logits.argmax(axis=-1)
total += label.shape[0]
hit += (pred == label.numpy()).sum().item()
index += 1
acc = hit / total
print(acc, flush=True)
return acc

def get_attention_mask(self, input_ids):
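# sequences are padded with token id 1 (see INCDataloader.pad_input), so padded positions get mask 0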
return tf.constant(1 - (input_ids==1).numpy().astype(int))

def evaluate_tf_v1(self, model):
total, hit = 0, 0
index = 1
infer = model.signatures["serving_default"]
overall_infer_duration = 0
for input_ids, label, label_indices in tqdm(self.dataloader):
attention_mask = self.get_attention_mask(input_ids)
input_ids = tf.constant(input_ids.numpy(), dtype=infer.inputs[0].dtype)
attention_mask = tf.constant(attention_mask.numpy(), dtype=infer.inputs[0].dtype)
start = time.time()
results = infer(input_ids=input_ids, attention_mask=attention_mask) # len: 25 Identity: [16, 196, 50272], Identity_1: [16, 12, 196, 64]
batch_infer_time = time.time() - start
if index > args.warmup:
overall_infer_duration += batch_infer_time
last_token_logits = results['Identity'].numpy()[np.arange(len(label_indices)), label_indices, :]
pred = last_token_logits.argmax(axis=-1)
total += label.shape[0]
hit += (pred == label.numpy()).sum().item()
index += 1
acc = hit / total
print("\nEvaluation result: ")
print(f"Batch size = {args.batch_size}")
print(f"Accuracy: {acc}")
print(
f"Throughput: {(len(self.dataloader) - args.warmup * args.batch_size) / overall_infer_duration} samples/sec"
)

class INCDataloader:
# for_calib=True during quantization (only input_ids are needed); for_calib=False during evaluation (labels are needed as well)
def __init__(self, dataset, tokenizer, batch_size=1, device='cpu', for_calib=False):
self.dataset = dataset
self.tokenizer = tokenizer
self.device = device
self.batch_size = batch_size
self.for_calib = for_calib
import math
self.length = math.ceil(len(dataset) / self.batch_size) # batch number
self.pad_len = 196

# tokenize the dataset
def tokenize_function(examples):
example = self.tokenizer(examples['text'])
return example

self.dataset = self.dataset.map(tokenize_function, batched=True)
self.dataset.set_format(type='tensorflow', columns=['input_ids'])
def get_attention_mask(self, input_ids):
return 1 - (input_ids==1).numpy().astype(int)
def pad_input(self, input): # input: a record
input_id = input['input_ids']
if input_id.numpy().shape[0] > self.pad_len: # truncate the sequence to pad_len if the sequence is longer than pad_len
input_id = input_id[:self.pad_len]
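# the label is the last real token; the model must predict it from the preceding tokens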
label = input_id[-1]
pad_len = self.pad_len - input_id.numpy().shape[0]
label_index = -2 - pad_len # last logit index
input_id = tf.pad(input_id, tf.constant([[0,pad_len]]), constant_values=1)
input_id = tf.expand_dims(input_id, axis=0)
label = tf.expand_dims(label, axis=0)
return (input_id, label, label_index)

def __iter__(self):
if self.for_calib:
labels = None
# label_indices = None
for idx, record in enumerate(self.dataset):
input_id, label, label_index = self.pad_input(record)
attention_mask = self.get_attention_mask(input_id)
# compose attention_mask and input_id together
# during the calibration, it requires to yield a <attention_mask, input_id>
# cur_input = tf.constant(np.append(attention_mask, input_id.numpy(), axis=0))
cur_input = {"input_ids": input_id.numpy(), "attention_mask": attention_mask}
assert self.batch_size == 1
yield (cur_input, label)
else:
input_ids = None
labels = None
label_indices = None
for idx, record in enumerate(self.dataset):
input_id, label, label_index = self.pad_input(record)
if input_ids is None:
input_ids = input_id
labels = label
label_indices = [label_index]
else:
input_ids = tf.concat([input_ids, input_id], 0)
labels = tf.concat([labels, label], 0)

label_indices.append(label_index)

if (idx + 1) % self.batch_size == 0:
yield (input_ids, labels, label_indices)
input_ids = None
labels = None
label_indices = None
if (idx + 1) % self.batch_size != 0:
yield (input_ids, labels, label_indices)

def __len__(self):
return self.length


model_name = args.model_name_or_path
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
)
eval_dataset = load_dataset('lambada', split='validation')

evaluator = Evaluator(eval_dataset, tokenizer, 'cpu')

if args.int8:
print("benchmarking int8 model")
int8_folder = model_name.split('/')[-1] + "_int8"
if not os.path.exists(int8_folder):
print(f"could not find int8 folder {int8_folder} ")
exit()
model = tf.saved_model.load(int8_folder) # tensorflow.python.trackable.autotrackable.AutoTrackable object
else:
print("benchmaking fp32 model")
model = transformers.TFAutoModelForCausalLM.from_pretrained(model_name)
# fp32_folder = model_name.split('/')[-1] + "_fp32"
# model.save(fp32_folder)
# model = tf.keras.models.load_model(fp32_folder)
from neural_compressor.experimental import common
def keras2SavedModel(model):
model = common.Model(model)
return model.model
model = keras2SavedModel(model) # tensorflow.python.trackable.autotrackable.AutoTrackable object

# TODO current neural_compressor.benchmark does not support AutoTrackable model, we will write our own
# from neural_compressor.benchmark import fit
# from neural_compressor.config import BenchmarkConfig
# conf = BenchmarkConfig(cores_per_instance=28, num_of_instance=1)
# fit(model, conf, b_func=evaluator.evaluate_tf_v1)
evaluator.evaluate_tf_v1(model)