Smooth Quant for Tensorflow backend (#830)
Signed-off-by: Lv, Liang1 <[email protected]>
Spycsh authored May 31, 2023
1 parent d47ea8e commit 1f4127f
Showing 24 changed files with 1,547 additions and 17 deletions.
21 changes: 21 additions & 0 deletions examples/.config/model_params_tensorflow.json
@@ -1766,6 +1766,27 @@
"main_script": "run_inference.py",
"batch_size": 128
},
"distilbert_base_sq": {
"model_src_dir": "nlp/distilbert_base/quantization/ptq",
"dataset_location": "/tf_dataset2/datasets/sst2_validation_dataset",
"input_model": "/tf_dataset2/models/tensorflow/distilbert_base/fp32/distilbert_base_fp32.pb",
"main_script": "run_inference.py",
"batch_size": 128
},
"gpt2_medium_sq": {
"model_src_dir": "nlp/large_language_models/quantization/ptq/smoothquant",
"dataset_location": "",
"input_model": "/tf_dataset2/models/tensorflow/gpt2-medium",
"main_script": "main.py",
"batch_size": 16
},
"opt_125m_sq": {
"model_src_dir": "nlp/large_language_models/quantization/ptq/smoothquant",
"dataset_location": "",
"input_model": "/tf_dataset2/models/tensorflow/facebook-opt-125m",
"main_script": "main.py",
"batch_size": 16
}
}
}

17 changes: 17 additions & 0 deletions examples/tensorflow/nlp/distilbert_base/quantization/ptq/README.md
@@ -111,6 +111,23 @@ Where (default values are shown in square brackets):
* $INTRA_THREADS [28]-- The number of intra-op parallelism threads to use, which can be set to the number of physical cores per socket


### Run Smooth Quant to improve int8 accuracy

#### Tuning
```shell
bash run_tuning.sh \
--input_model=$INPUT_MODEL \
--dataset_location=$DATASET_DIR \
--output_model=$OUTPUT_MODEL \
--batch_size=$BATCH_SIZE \
--max_seq_length=$MAX_SEQ \
--warmup_steps=$WARMUPS \
--num_inter=$INTER_THREADS \
--num_intra=$INTRA_THREADS \
--sq=True
```
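
Passing `--sq=True` routes the example through the smooth quant recipe instead of the default accuracy-driven tuning. A minimal sketch of the config it builds, mirroring the Python change in this commit (`graph`, `dataloader`, and `eval_func` stand in for the example's own objects):

```python
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

# Smooth quant recipe: alpha controls how much quantization difficulty
# is migrated from activations to weights (0.6 in this example).
config = PostTrainingQuantConfig(
    calibration_sampling_size=[500],
    quant_level=1,
    recipes={"smooth_quant": True, "smooth_quant_args": {"alpha": 0.6}})
q_model = quantization.fit(model=graph, conf=config,
                           calib_dataloader=dataloader, eval_func=eval_func)
```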


Details of enabling Intel® Neural Compressor on DistilBERT base for TensorFlow
=========================

@@ -72,6 +72,7 @@ def boolean_string(s):
dest="tune",
default=False
)
arg_parser.add_argument('--sq', type=boolean_string, dest='sq', help='smooth quantization', default=False)
arg_parser.add_argument("--benchmark", type=boolean_string,
help="whether to do benchmark",
dest="benchmark",
@@ -201,7 +202,7 @@ def validate_args(self):
logger.warning("Warmup steps greater than max possible value of 22." + \
" Setting to max value of ", MAX_WARMUP_STEPS)
ARGS.warmup_steps = MAX_WARMUP_STEPS
-if ARGS.tune or (ARGS.benchmark and ARGS.mode == "accuracy"):
+if ARGS.tune or ARGS.sq or (ARGS.benchmark and ARGS.mode == "accuracy"):
ARGS.steps = MAX_STEPS
elif ARGS.benchmark:
if ARGS.steps > (MAX_STEPS - MAX_WARMUP_STEPS):
@@ -271,7 +272,7 @@ def eval_func(self, graph):
else:
pred = sess.run(output, feed_dict=feed_dict)
run_time = time.time() - start_time
-if ARGS.tune or (ARGS.benchmark and ARGS.mode == "accuracy"):
+if ARGS.tune or ARGS.sq or (ARGS.benchmark and ARGS.mode == "accuracy"):
total_correct_predictions += self.get_correct_predictions(pred, labels)
total_time += run_time
# save profiling file
@@ -287,7 +288,7 @@ def eval_func(self, graph):
with open(profiling_file, 'w') as trace_file:
trace_file.write(trace.generate_chrome_trace_format(show_memory=False))
time_per_batch = total_time / float(ARGS.steps / ARGS.batch_size)
-if ARGS.tune or (ARGS.benchmark and ARGS.mode == "accuracy"):
+if ARGS.tune or ARGS.sq or (ARGS.benchmark and ARGS.mode == "accuracy"):
accuracy = total_correct_predictions / ARGS.steps
logger.info("Accuracy: {:.4f}".format(accuracy))
if self.dataloader.batch_size == 1:
@@ -297,12 +298,17 @@

def run(self):
graph = self.load_graph()
-if ARGS.tune:
+if ARGS.tune or ARGS.sq:
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig, AccuracyCriterion
-accuracy_criterion = AccuracyCriterion(tolerable_loss=0.02)
-config = PostTrainingQuantConfig(calibration_sampling_size=[500],
-accuracy_criterion=accuracy_criterion)
+if ARGS.sq:
+config = PostTrainingQuantConfig(calibration_sampling_size=[500],
+quant_level=1,
+recipes={"smooth_quant": True, "smooth_quant_args": {'alpha': 0.6}})
+else:
+accuracy_criterion = AccuracyCriterion(tolerable_loss=0.02)
+config = PostTrainingQuantConfig(calibration_sampling_size=[500],
+accuracy_criterion=accuracy_criterion)
q_model = quantization.fit(model=graph, conf=config, calib_dataloader=self.dataloader,
eval_func=self.eval_func)
try:
@@ -21,6 +21,7 @@ function init_params {
num_inter=2
num_intra=28
tune=True
sq=False

for var in "$@"
do
@@ -52,6 +53,9 @@ function init_params {
--tune=*)
tune=$(echo ${var} |cut -f2 -d=)
;;
--sq=*)
sq=$(echo ${var} |cut -f2 -d=)
;;
esac
done

@@ -65,6 +69,7 @@ function run_tuning {
--data-location=${dataset_location} \
--output-graph=${output_model} \
--tune=${tune} \
--sq=${sq} \
--warmup-steps=${warmup_steps} \
--batch-size=${batch_size} \
--max-seq-length=${max_seq_length} \
@@ -0,0 +1,52 @@
Step-by-Step
============

This document lists the steps to reproduce Intel® Neural Compressor quantization and smooth quantization of TensorFlow language models such as OPT and GPT2.

## Prerequisite

```shell
# Install Intel® Neural Compressor
pip install neural-compressor
pip install -r requirements.txt
```
## Run


### Basic quantization

```shell
python main.py --model_name_or_path <MODEL_NAME>
```

`<MODEL_NAME>` can be one of the following:

- gpt2-medium
- facebook/opt-125m

### Smooth quant

```shell
bash run_tuning.sh --input_model=<MODEL_NAME>
```

Or you can use

```shell
python main.py --model_name_or_path <MODEL_NAME> --sq
```
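
The `--sq` flag enables the smooth quant recipe during post-training quantization. A sketch of the call it triggers, assuming the same recipe settings as the DistilBERT example in this commit (`model` and `calib_dataloader` are stand-ins for the script's own objects, and the save path is illustrative):

```python
from neural_compressor import quantization
from neural_compressor.config import PostTrainingQuantConfig

config = PostTrainingQuantConfig(
    calibration_sampling_size=[500],
    recipes={"smooth_quant": True, "smooth_quant_args": {"alpha": 0.6}})
# calib_dataloader yields (inputs, label) pairs with batch_size=1
# (see INCDataloader in main.py below)
q_model = quantization.fit(model=model, conf=config,
                           calib_dataloader=calib_dataloader)
q_model.save("output_int8_model")  # illustrative output folder
```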

## Benchmark

### Get the FP32 performance

```shell
bash run_benchmark.sh --input_model=<MODEL_NAME>
```

### Get the INT8 performance

```shell
bash run_benchmark.sh --input_model=<MODEL_NAME> --int8=true
```
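
Under the hood, the INT8 path loads the quantized model as a TensorFlow SavedModel and calls its serving signature, as `main.py` below does. A minimal sketch with a dummy batch:

```python
import tensorflow as tf

# Folder name follows main.py's convention: <model name>_int8
model = tf.saved_model.load("opt-125m_int8")
infer = model.signatures["serving_default"]

# Dummy batch of 16 sequences padded to length 196 (the example's pad_len)
input_ids = tf.ones((16, 196), dtype=infer.inputs[0].dtype)
attention_mask = tf.ones((16, 196), dtype=infer.inputs[0].dtype)
outputs = infer(input_ids=input_ids, attention_mask=attention_mask)
logits = outputs["Identity"].numpy()  # shape: [16, 196, vocab_size]
```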

@@ -0,0 +1,184 @@
import os.path
import transformers
import tensorflow as tf
from tqdm import tqdm
import sys
import argparse
from datasets import load_dataset
import numpy as np
import time

sys.path.insert(0, './')

parser = argparse.ArgumentParser()
parser.add_argument('--int8', action='store_true', help="eval fp32 model or int8 model")
parser.add_argument('--model_name_or_path', type=str, default='facebook/opt-125m')
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--warmup', type=int, default=10)
args = parser.parse_args()

class Evaluator:
def __init__(self, dataset, tokenizer, device, batch_size=args.batch_size):
self.dataset = dataset
self.tokenizer = tokenizer
self.device = device
self.dataloader = INCDataloader(dataset, tokenizer, batch_size, device)

def evaluate(self, model):
# The task is to predict the last word of the input.
total, hit = 0, 0
index = 1
for input_ids, label, label_indices in tqdm(self.dataloader):
# TFCausalLMOutputWithPast len: 2
# first element shape (16, 196, 50272)
# second element shape (16, 12, 196, 64)
outputs = model(input_ids)
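# gather each sequence's logits at the position that predicts its final real token
# (label_indices are negative offsets from the padded end, see INCDataloader.pad_input)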
last_token_logits = outputs[0].numpy()[np.arange(len(label_indices)), label_indices, :]
pred = last_token_logits.argmax(axis=-1)
total += label.shape[0]
hit += (pred == label.numpy()).sum().item()
index += 1
acc = hit / total
print(acc, flush=True)
return acc

def get_attention_mask(self, input_ids):
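# sequences are padded with token id 1 (see INCDataloader.pad_input), so padded positions get mask 0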
return tf.constant(1 - (input_ids==1).numpy().astype(int))

def evaluate_tf_v1(self, model):
total, hit = 0, 0
index = 1
infer = model.signatures["serving_default"]
overall_infer_duration = 0
for input_ids, label, label_indices in tqdm(self.dataloader):
attention_mask = self.get_attention_mask(input_ids)
input_ids = tf.constant(input_ids.numpy(), dtype=infer.inputs[0].dtype)
attention_mask = tf.constant(attention_mask.numpy(), dtype=infer.inputs[0].dtype)
start = time.time()
results = infer(input_ids=input_ids, attention_mask=attention_mask) # len: 25 Identity: [16, 196, 50272], Identity_1: [16, 12, 196, 64]
batch_infer_time = time.time() - start
if index > args.warmup:
overall_infer_duration += batch_infer_time
last_token_logits = results['Identity'].numpy()[np.arange(len(label_indices)), label_indices, :]
pred = last_token_logits.argmax(axis=-1)
total += label.shape[0]
hit += (pred == label.numpy()).sum().item()
index += 1
acc = hit / total
print("\nEvaluation result: ")
print(f"Batch size = {args.batch_size}")
print(f"Accuracy: {acc}")
print(
f"Throughput: {(len(self.dataloader) - args.warmup * args.batch_size) / overall_infer_duration} samples/sec"
)

class INCDataloader:
# for_calib=True during quantization (only input_ids are needed); for_calib=False during evaluation (labels are needed as well)
def __init__(self, dataset, tokenizer, batch_size=1, device='cpu', for_calib=False):
self.dataset = dataset
self.tokenizer = tokenizer
self.device = device
self.batch_size = batch_size
self.for_calib = for_calib
import math
self.length = math.ceil(len(dataset) / self.batch_size) # batch number
self.pad_len = 196

# tokenize the dataset
def tokenize_function(examples):
example = self.tokenizer(examples['text'])
return example

self.dataset = self.dataset.map(tokenize_function, batched=True)
self.dataset.set_format(type='tensorflow', columns=['input_ids'])
def get_attention_mask(self, input_ids):
return 1 - (input_ids==1).numpy().astype(int)
def pad_input(self, input): # input: a record
input_id = input['input_ids']
if input_id.numpy().shape[0] > self.pad_len: # truncate the sequence to pad_len if the sequence is longer than pad_len
input_id = input_id[:self.pad_len]
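# the label is the last real token; the model must predict it from the preceding tokens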
label = input_id[-1]
pad_len = self.pad_len - input_id.numpy().shape[0]
label_index = -2 - pad_len # last logit index
input_id = tf.pad(input_id, tf.constant([[0,pad_len]]), constant_values=1)
input_id = tf.expand_dims(input_id, axis=0)
label = tf.expand_dims(label, axis=0)
return (input_id, label, label_index)

def __iter__(self):
if self.for_calib:
labels = None
# label_indices = None
for idx, record in enumerate(self.dataset):
input_id, label, label_index = self.pad_input(record)
attention_mask = self.get_attention_mask(input_id)
# compose attention_mask and input_id together
# during the calibration, it requires to yield a <attention_mask, input_id>
# cur_input = tf.constant(np.append(attention_mask, input_id.numpy(), axis=0))
cur_input = {"input_ids": input_id.numpy(), "attention_mask": attention_mask}
assert self.batch_size == 1
yield (cur_input, label)
else:
input_ids = None
labels = None
label_indices = None
for idx, record in enumerate(self.dataset):
input_id, label, label_index = self.pad_input(record)
if input_ids is None:
input_ids = input_id
labels = label
label_indices = [label_index]
else:
input_ids = tf.concat([input_ids, input_id], 0)
labels = tf.concat([labels, label], 0)

label_indices.append(label_index)

if (idx + 1) % self.batch_size == 0:
yield (input_ids, labels, label_indices)
input_ids = None
labels = None
label_indices = None
if (idx + 1) % self.batch_size != 0:
yield (input_ids, labels, label_indices)

def __len__(self):
return self.length


model_name = args.model_name_or_path
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_name,
)
eval_dataset = load_dataset('lambada', split='validation')

evaluator = Evaluator(eval_dataset, tokenizer, 'cpu')

if args.int8:
print("benchmarking int8 model")
int8_folder = model_name.split('/')[-1] + "_int8"
if not os.path.exists(int8_folder):
print(f"could not find int8 folder {int8_folder} ")
exit()
model = tf.saved_model.load(int8_folder) # tensorflow.python.trackable.autotrackable.AutoTrackable object
else:
print("benchmaking fp32 model")
model = transformers.TFAutoModelForCausalLM.from_pretrained(model_name)
# fp32_folder = model_name.split('/')[-1] + "_fp32"
# model.save(fp32_folder)
# model = tf.keras.models.load_model(fp32_folder)
from neural_compressor.experimental import common
def keras2SavedModel(model):
model = common.Model(model)
return model.model
model = keras2SavedModel(model) # tensorflow.python.trackable.autotrackable.AutoTrackable object

# TODO current neural_compressor.benchmark does not support AutoTrackable model, we will write our own
# from neural_compressor.benchmark import fit
# from neural_compressor.config import BenchmarkConfig
# conf = BenchmarkConfig(cores_per_instance=28, num_of_instance=1)
# fit(model, conf, b_func=evaluator.evaluate_tf_v1)
evaluator.evaluate_tf_v1(model)