Refactor GPTQ to support running on Gaudi (#1700)
* gptq support for gaudi

Signed-off-by: n1ck-guo <[email protected]>
n1ck-guo authored Apr 2, 2024
1 parent 9d7a052 commit 14868c0
Showing 2 changed files with 33 additions and 4 deletions.
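A minimal sketch of how the accelerator abstraction used by this commit is meant to be called from algorithm code. Only auto_detect_accelerator and mark_step come from the diff below; the calibration helper and its arguments are illustrative.

import torch
from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator

# Resolves to the highest-priority backend that is available
# (PRIORITY_HPU > PRIORITY_CUDA > PRIORITY_CPU in the second file below).
accelerator = auto_detect_accelerator()

def calibrate(model, batches):
    # Illustrative calibration loop: on Gaudi, mark_step() flushes the
    # lazily accumulated graph after each forward pass; on CPU/CUDA the
    # new mark_step() is a no-op, so the same code runs everywhere.
    with torch.no_grad():
        for batch in batches:
            model(batch)
            accelerator.mark_step()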
21 changes: 17 additions & 4 deletions neural_compressor/torch/algorithms/weight_only/gptq.py
@@ -34,6 +34,7 @@
 from .modules import WeightOnlyLinear

 DEBUG = False
+accelerator = auto_detect_accelerator()


 # ================ device related ===================
@@ -542,8 +543,10 @@ def forward(layer, *args, **kwargs):
         if self.run_fn:
             if self.run_args:
                 self.run_fn(self.model, *self.run_args)
+                accelerator.mark_step()
             else:
                 self.run_fn(self.model)
+                accelerator.mark_step()
         else:
             for batch in tqdm(self.dataloader):
                 if not self.use_layer_wise:
@@ -663,6 +666,7 @@ def tmp(_, inp, out):
         for j in range(batch_num):
             cache_keyword_batch = self.gather_single_batch_from_dict(self.cache_key_arguments, j)
             cache_positional_batch = self.gather_single_batch_from_list(self.cache_positional_arguments, j)
+            accelerator.mark_step()
             out = transformer_block(*cache_positional_batch, **cache_keyword_batch)
             out = self.track_hidden_states(out)
         self.cache_key_arguments["batch_num"] = batch_num
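For context on the mark_step() calls added in this and the previous hunk: in Habana's lazy execution mode, PyTorch ops are only recorded until a step is marked (or a value is read), at which point the accumulated graph is compiled and executed on the device. A standalone sketch of that behaviour; habana_frameworks is only importable on Gaudi machines, and the tensors here are made up.

import torch

try:
    import habana_frameworks.torch.core as htcore  # available only with the Habana stack
    device = "hpu"
except ImportError:
    htcore, device = None, "cpu"

x = torch.randn(8, 8, device=device)
y = (x @ x).relu()            # in lazy mode these ops are merely queued
if htcore is not None:
    htcore.mark_step()        # compile and run the queued graph now
print(y.sum().item())         # reading a value also forces execution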
@@ -682,6 +686,9 @@ def tmp(_, inp, out):
                 W = load_value(self.model, full_layer_name + ".weight", model_path)
             else:
                 W = sub_layers[layer_name].weight.data.clone()
+            accelerator.mark_step()
+            if "hpu" in self.device:
+                W = W.to("cpu")
             scale, zp, Q = gptq_for_this_block[layer_name].fasterquant(
                 W,
                 blocksize=weight_config_this_layer["block_size"],
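On Gaudi the hunk above copies the weight to the CPU before calling fasterquant, and the fasterquant hunks that follow do the same for the Hessian H and then move scale, zero, and Q back to self.device at the end. A minimal sketch of that round-trip, with a placeholder standing in for the actual GPTQ math; only the device handling mirrors the diff.

import torch

def quantize_on_cpu(weight: torch.Tensor, device: str):
    # Offload pattern from this commit: run the quantization math on host
    # memory when the model lives on HPU, then return results on the
    # original device.
    W = weight.to("cpu") if "hpu" in device else weight
    scale = W.abs().amax(dim=1, keepdim=True).clamp_min(1e-8) / 7.0  # stand-in for fasterquant
    Q = torch.clamp(torch.round(W / scale), -8, 7) * scale
    if "hpu" in device:
        scale, Q = scale.to(device), Q.to(device)
    return scale, Q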
@@ -854,6 +861,8 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F
             self.quantizer.find_params(W, weight=True)

         H = self.H
+        if "hpu" in self.device:
+            H = H.to("cpu")
         del self.H
         dead = torch.diag(H) == 0
         H[dead, dead] = 1
@@ -958,6 +967,10 @@ def fasterquant(self, W, blocksize=128, percdamp=0.01, groupsize=-1, act_order=F
             zero.append(self.quantizer.zero)
         scale = torch.cat(scale, dim=1)
         zero = torch.cat(zero, dim=1)
+        if "hpu" in self.device:
+            scale = scale.to(self.device)
+            zero = zero.to(self.device)
+            Q = Q.to(self.device)
         return scale, zero, Q

     def free(self):
@@ -973,25 +986,25 @@ def free(self):
 class Quantizer(nn.Module):
     def __init__(self, shape=1):
         super(Quantizer, self).__init__()
-        self.register_buffer("maxq", torch.tensor(0))
+        self.maxq = 0
         self.register_buffer("scale", torch.zeros(shape))
         self.register_buffer("zero", torch.zeros(shape))

     def configure(self, weight_config_this_layer, norm=2.4, grid=100, maxshrink=0.8, trits=False):
         for k, v in weight_config_this_layer.items():
             setattr(self, k, v)
-        self.maxq = torch.tensor(2**self.bits - 1)
+        # self.maxq = torch.tensor(2**self.bits - 1)
+        self.maxq = 2**self.bits - 1
         self.scheme = "sym" if self.sym else "asym"
         self.double_quant_scheme = "sym" if self.double_quant_sym else "asym"
         self.norm = norm
         self.grid = grid
         self.maxshrink = maxshrink
         if trits:
-            self.maxq = torch.tensor(-1)
+            self.maxq = -1

     def find_params(self, x, weight=False):
         dev = x.device
-        self.maxq = self.maxq.to(dev)
         # NF4 FP4
         if self.dtype != "int":
             from .utility import quant_tensor
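The Quantizer change above turns maxq from a registered tensor buffer into a plain Python integer, which is why the self.maxq.to(dev) line in find_params can be dropped: an int needs no device placement, so nothing has to follow the weights onto the HPU. A short sketch of why an int is enough, assuming the usual GPTQ-style affine rounding; the scale and zero values are made up.

import torch

bits = 4
maxq = 2**bits - 1                                  # plain int, device-agnostic
x = torch.randn(4, 8)                               # stand-in weight block
scale = x.abs().amax(dim=1, keepdim=True) / maxq
zero = torch.full_like(scale, (maxq + 1) / 2)

# torch.clamp broadcasts the Python ints, so no .to(device) is required
q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
x_hat = scale * (q - zero)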
16 changes: 16 additions & 0 deletions neural_compressor/torch/utils/auto_accelerator.py
@@ -29,8 +29,11 @@

 import torch

+from neural_compressor.common.utils import LazyImport
 from neural_compressor.torch.utils import logger

+htcore = LazyImport("habana_frameworks.torch.core")
+
 PRIORITY_HPU = 100
 PRIORITY_CUDA = 95
 PRIORITY_CPU = 90
@@ -133,6 +136,10 @@ def empty_cache(self):
     def synchronize(self):
         pass

+    @abstractmethod
+    def mark_step(self):
+        pass
+

 @register_accelerator(name="cpu", priority=PRIORITY_CPU)
 class CPU_Accelerator(Auto_Accelerator):
@@ -167,6 +174,9 @@ def empty_cache(self):
     def synchronize(self):
         pass

+    def mark_step(self):
+        pass
+

 @register_accelerator(name="cuda", priority=PRIORITY_CUDA)
 class CUDA_Accelerator(Auto_Accelerator):
@@ -203,6 +213,9 @@ def device(self, device_index=None):
     def empty_cache(self):
         return torch.cuda.empty_cache()

+    def mark_step(self):
+        pass
+

 @register_accelerator(name="hpu", priority=PRIORITY_HPU)
 class HPU_Accelerator(Auto_Accelerator):
@@ -244,6 +257,9 @@ def device(self, device_index=None):
     def empty_cache(self):
         return torch.hpu.empty_cache()

+    def mark_step(self):
+        return htcore.mark_step()
+

 def auto_detect_accelerator(device_name="auto") -> Auto_Accelerator:
     # Force use the cpu on node has both cpu and gpu: `FORCE_DEVICE=cpu` python main.py ...
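The module-level htcore = LazyImport("habana_frameworks.torch.core") lets auto_accelerator.py be imported on machines without the Habana stack: the real import is deferred until an attribute such as mark_step is first touched, which only happens inside HPU_Accelerator. A rough sketch of what such a lazy-import helper does; this is an assumption about LazyImport's behaviour, not its actual implementation.

import importlib

class LazyModule:
    # Assumed behaviour: remember the module path, import on first attribute access.
    def __init__(self, name):
        self._name, self._module = name, None

    def __getattr__(self, attr):
        if self._module is None:
            self._module = importlib.import_module(self._name)
        return getattr(self._module, attr)

htcore = LazyModule("habana_frameworks.torch.core")
# No ImportError yet; it would only surface on first use, e.g. htcore.mark_step()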
