Add GPTQ-GPTJ examples #1091

Merged (7 commits) on Aug 4, 2023
@@ -41,6 +41,16 @@ sh run_tuning.sh --topology=topology_name --input_model=model_name_or_path --wei
>
> `weight_only_bits`, `weight_only_group`, `weight_only_scheme`, and `weight_only_algorithm` can be modified by the user. For details, please refer to the [README](../../../../../../../docs/source/quantization_weight_only.md).

### Run MLPerf on GPT-J-6B
Use the following links to get the
[**CNN Daily Mail** dataset](https://github.com/intel-innersource/frameworks.ai.benchmarking.mlperf.submission.inference-submission-v3-1/tree/master/closed/Intel/code/gpt-j/pytorch-cpu#download-and-prepare-dataset)
and the [GPT-J-6B MLPerf model](https://github.com/mlcommons/inference/tree/master/language/gpt-j#download-gpt-j-model).

Then run the following command to perform quantization:
```shell
sh run_gptj_mlperf_int4.sh
```

## 2. Benchmark
```bash
# int8
@@ -102,4 +112,4 @@ from neural_compressor.utils.pytorch import load
quantized_model = load(tuned_checkpoint, model)
```
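
As a minimal sketch of that restore flow (the model name and checkpoint directory below are placeholders, not values taken from this example):

```python
from transformers import AutoModelForCausalLM
from neural_compressor.utils.pytorch import load

# Placeholders: substitute your FP32 base model and the directory written by tuning.
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
quantized_model = load("./saved_results", model)
quantized_model.eval()
```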
--------
For more details, please refer to the [sample code](./run_clm.py).
@@ -0,0 +1,172 @@
import sys
import argparse
import os
import time
import json
import fnmatch

import copy
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence


import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset, load_from_disk
from torch.nn.functional import pad
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

import random
random.seed(9973)

# Bucketize sequence lengths
MaxLens = range(0,64,1919)
Buckets = dict()
cutoff_step = 64
min_cutoff = 64
min_len = 1
for cutoff in range(min_cutoff, 1921, cutoff_step): # All input sequences
    Buckets[cutoff] = list(range(min_len, cutoff, 1))
    min_len = cutoff

#Buckets[1920] = list(range(min_len, 1921, 1))

input_buckets = dict()
for cutoff, seq_lens in Buckets.items():
    for seq_len in seq_lens:
        input_buckets[seq_len] = cutoff

#print("Buckets: {}".format(input_buckets))

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "</s>"
DEFAULT_UNK_TOKEN = "</s>"
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}


class CNNDAILYMAIL(object):
    def __init__(self, model_path, data_path, device="cpu", is_calib=False, num_samples=20, max_len=1920):
        self.model_path = model_path
        self.data_path = data_path
        self.device = device
        self.num_samples = num_samples
        self.is_calib = is_calib

        self.padding = "max_length" if self.is_calib else False
        self.max_len = 2048 if self.is_calib else max_len

        self.calib_collator = self.collate_batch
        self.pad_max = max_len
        self.load_tokenizer()
        self.load_dataset()

    def load_dataset(self):
        """Loads the dataset."""
        with open(self.data_path, "r") as fid:
            list_data_dict = json.load(fid)
        self.list_data_dict = copy.deepcopy(list_data_dict)

        if self.num_samples is not None:
            self.num_samples = min(self.num_samples, len(list_data_dict))

        if self.is_calib:
            list_data_dict = list_data_dict[:self.num_samples]
        else:
            list_data_dict = random.choices(list_data_dict, k=self.num_samples)

        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
        sources = [prompt_input.format_map(example) for example in list_data_dict]
        targets = [f"{example['output']}" for example in list_data_dict]

        self.input_ids = []
        self.input_lens = []
        for i in range(len(sources)):
            tok_input = self.tokenize_function(sources[i])
            self.input_ids.append(tok_input.input_ids)

        #if self.num_samples is not None:
        #    self.num_samples = min(self.num_samples, len(list_data_dict))
        #    self.input_ids = random.choices(self.input_ids, k=self.num_samples)
        #    print("Sources: {}".format(len(sources)))
        #    print("Targets: {}".format(len(targets)))
        #    sources = random.choices(sources, k=self.num_samples)
        #    targets = random.choices(targets, k=self.num_samples)

        self.sources = sources
        self.targets = targets

    def load_tokenizer(self):
        """Loads the tokenizer."""
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_path,
            model_max_length=2048,
            padding_side="right",
            use_fast=False,
        )
        self.tokenizer.pad_token = self.tokenizer.eos_token

    @torch.no_grad()
    def tokenize_function(self, text):
        example = self.tokenizer(text, truncation=True, max_length=self.max_len, return_tensors="pt", padding=self.padding)
        return example

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        input_ids = self.input_ids[i]
        input_len = input_ids.shape[-1]
        #pad_size = input_buckets[input_len] - input_len
        #input_ids = F.pad(input_ids, pad=(0, pad_size))
        return (input_ids, input_len)

    @torch.no_grad()
    def collate_batch(self, batch):
        input_ids_padded = []

        for input_ids, input_lens in batch:  # input_ids are returned by this dataset (see __getitem__)
            pad_len = self.pad_max - input_ids.shape[0]
            #input_ids = F.pad(input_ids, pad=(0, pad_size), value=self.tokenizer.pad_token_id)
            input_ids_padded.append(input_ids)

        input_ids_padded = torch.vstack(input_ids_padded)
        return (input_ids_padded, input_ids_padded)

    def get_warmup_samples(self):
        cutoff_set = set(range(128, 1920, 64))
        warmup_samples = []
        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
        sources = [prompt_input.format_map(example) for example in self.list_data_dict]
        for source in sources:  #self.input_ids:
            tok_input = self.tokenize_function(source)
            input_ids = tok_input.input_ids
            input_len = input_ids.shape[-1]
            bucket = input_buckets[input_len]
            if bucket in cutoff_set:
                #print("inputlen: {}; Bucket: {}".format(input_len, bucket))
                pad_size = bucket - input_len
                input_ids = F.pad(input_ids, pad=(0, pad_size), value=0)
                warmup_samples.append(input_ids)
                cutoff_set.remove(bucket)
            if len(cutoff_set) == 0:
                break

        return warmup_samples
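
For reference, a minimal usage sketch for this dataset class (the file paths, sample count, and batch size below are placeholders, not values from this PR):

```python
from torch.utils.data import DataLoader

# Hypothetical paths: a JSON file produced by the MLPerf dataset preparation step
# and a local GPT-J checkpoint directory.
calib_dataset = CNNDAILYMAIL(
    model_path="./gpt-j-checkpoint",
    data_path="./cnn_eval.json",
    is_calib=True,
    num_samples=20,
)

# The class exposes its own collator; with is_calib=True every sample is padded
# to max_length, so collate_batch can vstack them into a single tensor.
calib_dataloader = DataLoader(
    calib_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=calib_dataset.calib_collator,
)

for input_ids, _ in calib_dataloader:
    print(input_ids.shape)  # (batch, 2048) padded token ids for calibration
    break
```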