diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md
deleted file mode 100644
index eb39321b173..00000000000
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# Run
-
-## Run FP32 model
-``` python
-python run_llm.py --model [model_name_or_path] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-
-## Run BF16/FP16 model
-``` python
-python run_llm.py --model [model_name_or_path] --approach cast --precision [bf16|fp16] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-
-## Run FP8 model
-``` python
-python run_llm.py --model [model_name_or_path] --approach [dynamic|static|cast] --precision [fp8_e4m3|fp8_e5m2] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-
-# Multi-card Inference
-With deepspeed we can leverage multi-cards inference with a prefix in command, below it's a demonstration of 4 card inference.
-
-```python
-deepspeed --num_gpus=4 run_llm.py --model [model_name_or_path] --approach [dynamic|static|cast] --precision [fp8_e4m3|fp8_e5m2] --to_graph [--performance]|[--accuracy --tasks lambada_openai --batch_size 8]|[--generate --max_new_tokens 10]
-```
-deepspeed --num_gpus=4 run_llm.py --model facebook/opt-125m --approach static --precision fp8_e4m3 --to_graph --accuracy --tasks lambada_openai --batch_size 8
\ No newline at end of file
diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py
deleted file mode 100644
index 35600185f5a..00000000000
--- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/configuration_chatglm.py
+++ /dev/null
@@ -1,61 +0,0 @@
-from transformers import PretrainedConfig
-
-
-class ChatGLMConfig(PretrainedConfig):
-    model_type = "chatglm"
-    def __init__(
-        self,
-        num_layers=28,
-        padded_vocab_size=65024,
-        hidden_size=4096,
-        ffn_hidden_size=13696,
-        kv_channels=128,
-        num_attention_heads=32,
-        seq_length=2048,
-        hidden_dropout=0.0,
-        classifier_dropout=None,
-        attention_dropout=0.0,
-        layernorm_epsilon=1e-5,
-        rmsnorm=True,
-        apply_residual_connection_post_layernorm=False,
-        post_layer_norm=True,
-        add_bias_linear=False,
-        add_qkv_bias=False,
-        bias_dropout_fusion=True,
-        multi_query_attention=False,
-        multi_query_group_num=1,
-        apply_query_key_layer_scaling=True,
-        attention_softmax_in_fp32=True,
-        fp32_residual_connection=False,
-        quantization_bit=0,
-        pre_seq_len=None,
-        prefix_projection=False,
-        **kwargs
-    ):
-        self.num_layers = num_layers
-        self.vocab_size = padded_vocab_size
-        self.padded_vocab_size = padded_vocab_size
-        self.hidden_size = hidden_size
-        self.ffn_hidden_size = ffn_hidden_size
-        self.kv_channels = kv_channels
-        self.num_attention_heads = num_attention_heads
-        self.seq_length = seq_length
-        self.hidden_dropout = hidden_dropout
-        self.classifier_dropout = classifier_dropout
-        self.attention_dropout = attention_dropout
-        self.layernorm_epsilon = layernorm_epsilon
-        self.rmsnorm = rmsnorm
-        self.apply_residual_connection_post_layernorm = 
apply_residual_connection_post_layernorm - self.post_layer_norm = post_layer_norm - self.add_bias_linear = add_bias_linear - self.add_qkv_bias = add_qkv_bias - self.bias_dropout_fusion = bias_dropout_fusion - self.multi_query_attention = multi_query_attention - self.multi_query_group_num = multi_query_group_num - self.apply_query_key_layer_scaling = apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = attention_softmax_in_fp32 - self.fp32_residual_connection = fp32_residual_connection - self.quantization_bit = quantization_bit - self.pre_seq_len = pre_seq_len - self.prefix_projection = prefix_projection - super().__init__(**kwargs) \ No newline at end of file diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py deleted file mode 100644 index be1cd520af5..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_chatglm.py +++ /dev/null @@ -1,1294 +0,0 @@ -""" PyTorch ChatGLM model. """ - -import math -import copy -import warnings -import re -import sys - -import torch -import torch.utils.checkpoint -import torch.nn.functional as F -from torch import nn -from torch.nn import CrossEntropyLoss, LayerNorm, MSELoss, BCEWithLogitsLoss -from torch.nn.utils import skip_init -from typing import Optional, Tuple, Union, List, Callable, Dict, Any -from copy import deepcopy - -from transformers.modeling_outputs import ( - BaseModelOutputWithPast, - CausalLMOutputWithPast, - SequenceClassifierOutputWithPast, -) -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import logging -from transformers.generation.logits_process import LogitsProcessor -from transformers.generation.utils import LogitsProcessorList, StoppingCriteriaList, GenerationConfig, ModelOutput - -from .configuration_chatglm import ChatGLMConfig - -# flags required to enable jit fusion kernels - -if sys.platform != 'darwin': - torch._C._jit_set_profiling_mode(False) - torch._C._jit_set_profiling_executor(False) - torch._C._jit_override_can_fuse_on_cpu(True) - torch._C._jit_override_can_fuse_on_gpu(True) - -logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "THUDM/ChatGLM" -_CONFIG_FOR_DOC = "ChatGLMConfig" - -CHATGLM_6B_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "THUDM/chatglm3-6b", - # See all ChatGLM models at https://huggingface.co/models?filter=chatglm -] - - -def default_init(cls, *args, **kwargs): - return cls(*args, **kwargs) - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -class PrefixEncoder(torch.nn.Module): - """ - The torch.nn model to encode the prefix - Input shape: (batch-size, prefix-length) - Output shape: (batch-size, prefix-length, 2*layers*hidden) - """ - - def __init__(self, config: ChatGLMConfig): - super().__init__() - self.prefix_projection = config.prefix_projection - if self.prefix_projection: - # Use a two-layer MLP to encode the prefix - kv_size = config.num_layers * config.kv_channels * config.multi_query_group_num * 2 - self.embedding = torch.nn.Embedding(config.pre_seq_len, kv_size) - self.trans = torch.nn.Sequential( - torch.nn.Linear(kv_size, config.hidden_size), - 
torch.nn.Tanh(), - torch.nn.Linear(config.hidden_size, kv_size) - ) - else: - self.embedding = torch.nn.Embedding(config.pre_seq_len, - config.num_layers * config.kv_channels * config.multi_query_group_num * 2) - - def forward(self, prefix: torch.Tensor): - if self.prefix_projection: - prefix_tokens = self.embedding(prefix) - past_key_values = self.trans(prefix_tokens) - else: - past_key_values = self.embedding(prefix) - return past_key_values - - -def split_tensor_along_last_dim( - tensor: torch.Tensor, - num_partitions: int, - contiguous_split_chunks: bool = False, -) -> List[torch.Tensor]: - """Split a tensor along its last dimension. - - Arguments: - tensor: input tensor. - num_partitions: number of partitions to split the tensor - contiguous_split_chunks: If True, make each chunk contiguous - in memory. - - Returns: - A list of Tensors - """ - # Get the size and dimension. - last_dim = tensor.dim() - 1 - last_dim_size = tensor.size()[last_dim] // num_partitions - # Split. - tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) - # Note: torch.split does not create contiguous tensors by default. - if contiguous_split_chunks: - return tuple(chunk.contiguous() for chunk in tensor_list) - - return tensor_list - - -class RotaryEmbedding(nn.Module): - def __init__(self, dim, original_impl=False, device=None, dtype=None): - super().__init__() - inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=device).to(dtype=dtype) / dim)) - self.register_buffer("inv_freq", inv_freq) - self.dim = dim - self.original_impl = original_impl - - def forward_impl( - self, seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000 - ): - """Enhanced Transformer with Rotary Position Embedding. - - Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ - transformers/rope/__init__.py. MIT License: - https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. 
- """ - # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ - theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, dtype=torch.float, device=device) / n_elem)) - - # Create position indexes `[0, 1, ..., seq_len - 1]` - seq_idx = torch.arange(seq_len, dtype=torch.float, device=device) - - # Calculate the product of position index and $\theta_i$ - idx_theta = torch.outer(seq_idx, theta).float() - - cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1) - - # this is to mimic the behaviour of complex32, else we will get different results - if dtype in (torch.float16, torch.bfloat16, torch.int8): - cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half() - return cache - - def forward(self, max_seq_len, offset=0): - return self.forward_impl( - max_seq_len, self.dim, dtype=self.inv_freq.dtype, device=self.inv_freq.device - ) - -### INC change ### -# @torch.jit.script - -def apply_rotary_pos_emb(x: torch.Tensor, rope_cache: torch.Tensor) -> torch.Tensor: - # x: [sq, b, np, hn] - sq, b, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) - rot_dim = rope_cache.shape[-2] * 2 - x, x_pass = x[..., :rot_dim], x[..., rot_dim:] - # truncate to support variable sizes - rope_cache = rope_cache[:sq] - xshaped = x.reshape(sq, -1, np, rot_dim // 2, 2) - rope_cache = rope_cache.view(sq, -1, 1, xshaped.size(3), 2) - x_out2 = torch.stack( - [ - xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], - xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], - ], - -1, - ) - x_out2 = x_out2.flatten(3) - return torch.cat((x_out2, x_pass), dim=-1) - - -class RMSNorm(torch.nn.Module): - def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None, **kwargs): - super().__init__() - self.weight = torch.nn.Parameter(torch.empty(normalized_shape, device=device, dtype=dtype)) - self.eps = eps - - def forward(self, hidden_states: torch.Tensor): - input_dtype = hidden_states.dtype - variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.eps) - - return (self.weight * hidden_states).to(input_dtype) - - -class CoreAttention(torch.nn.Module): - def __init__(self, config: ChatGLMConfig, layer_number): - super(CoreAttention, self).__init__() - - self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling - self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 - if self.apply_query_key_layer_scaling: - self.attention_softmax_in_fp32 = True - self.layer_number = max(1, layer_number) - - projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. 
- self.hidden_size_per_partition = projection_size - self.hidden_size_per_attention_head = projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff - self.coeff = coeff - - self.attention_dropout = torch.nn.Dropout(config.attention_dropout) - - def forward(self, query_layer, key_layer, value_layer, attention_mask): - pytorch_major_version = int(torch.__version__.split('.')[0]) - if pytorch_major_version >= 2: - query_layer, key_layer, value_layer = [k.permute(1, 2, 0, 3) for k in [query_layer, key_layer, value_layer]] - if attention_mask is None and query_layer.shape[2] == key_layer.shape[2]: - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - is_causal=True) - else: - if attention_mask is not None: - attention_mask = ~attention_mask - context_layer = torch.nn.functional.scaled_dot_product_attention(query_layer, key_layer, value_layer, - attention_mask) - context_layer = context_layer.permute(2, 0, 1, 3) - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.reshape(*new_context_layer_shape) - else: - # Raw attention scores - - # [b, np, sq, sk] - output_size = (query_layer.size(1), query_layer.size(2), query_layer.size(0), key_layer.size(0)) - - # [sq, b, np, hn] -> [sq, b * np, hn] - query_layer = query_layer.view(output_size[2], output_size[0] * output_size[1], -1) - # [sk, b, np, hn] -> [sk, b * np, hn] - key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - - # preallocting input tensor: [b * np, sq, sk] - matmul_input_buffer = torch.empty( - output_size[0] * output_size[1], output_size[2], output_size[3], dtype=query_layer.dtype, - device=query_layer.device - ) - - # Raw attention scores. [b * np, sq, sk] - matmul_result = torch.baddbmm( - matmul_input_buffer, - query_layer.transpose(0, 1), # [b * np, sq, hn] - key_layer.transpose(0, 1).transpose(1, 2), # [b * np, hn, sk] - beta=0.0, - alpha=(1.0 / self.norm_factor), - ) - - # change view to [b, np, sq, sk] - attention_scores = matmul_result.view(*output_size) - - # =========================== - # Attention probs and dropout - # =========================== - - # attention scores and attention mask [b, np, sq, sk] - if self.attention_softmax_in_fp32: - attention_scores = attention_scores.float() - if self.coeff is not None: - attention_scores = attention_scores * self.coeff - if attention_mask is None and attention_scores.shape[2] == attention_scores.shape[3]: - attention_mask = torch.ones(output_size[0], 1, output_size[2], output_size[3], - device=attention_scores.device, dtype=torch.bool) - attention_mask.tril_() - attention_mask = ~attention_mask - if attention_mask is not None: - attention_scores = attention_scores.masked_fill(attention_mask, float("-inf")) - attention_probs = F.softmax(attention_scores, dim=-1) - attention_probs = attention_probs.type_as(value_layer) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.attention_dropout(attention_probs) - # ========================= - # Context layer. [sq, b, hp] - # ========================= - - # value_layer -> context layer. 
- # [sk, b, np, hn] --> [b, np, sq, hn] - - # context layer shape: [b, np, sq, hn] - output_size = (value_layer.size(1), value_layer.size(2), query_layer.size(0), value_layer.size(3)) - # change view [sk, b * np, hn] - value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1) - # change view [b * np, sq, sk] - attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1) - # matmul: [b * np, sq, hn] - context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1)) - # change view [b, np, sq, hn] - context_layer = context_layer.view(*output_size) - # [b, np, sq, hn] --> [sq, b, np, hn] - context_layer = context_layer.permute(2, 0, 1, 3).contiguous() - # [sq, b, np, hn] --> [sq, b, hp] - new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,) - context_layer = context_layer.view(*new_context_layer_shape) - - return context_layer - - -class SelfAttention(torch.nn.Module): - """Parallel self-attention layer abstract class. - - Self-attention layer takes input with size [s, b, h] - and returns output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(SelfAttention, self).__init__() - self.layer_number = max(1, layer_number) - - self.projection_size = config.kv_channels * config.num_attention_heads - - # Per attention head and per partition values. - self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads - self.num_attention_heads_per_partition = config.num_attention_heads - - self.multi_query_attention = config.multi_query_attention - self.qkv_hidden_size = 3 * self.projection_size - if self.multi_query_attention: - self.num_multi_query_groups_per_partition = config.multi_query_group_num - self.qkv_hidden_size = ( - self.projection_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num - ) - self.query_key_value = nn.Linear(config.hidden_size, self.qkv_hidden_size, - bias=config.add_bias_linear or config.add_qkv_bias, - device=device, **_config_to_kwargs(config) - ) - - self.core_attention = CoreAttention(config, self.layer_number) - - # Output. - self.dense = nn.Linear(self.projection_size, config.hidden_size, bias=config.add_bias_linear, - device=device, **_config_to_kwargs(config) - ) - - def _allocate_memory(self, inference_max_sequence_len, batch_size, device=None, dtype=None): - if self.multi_query_attention: - num_attention_heads = self.num_multi_query_groups_per_partition - else: - num_attention_heads = self.num_attention_heads_per_partition - return torch.empty( - inference_max_sequence_len, - batch_size, - num_attention_heads, - self.hidden_size_per_attention_head, - dtype=dtype, - device=device, - ) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True - ): - # hidden_states: [sq, b, h] - - # ================================================= - # Pre-allocate memory for key-values for inference. 
- # ================================================= - # ===================== - # Query, Key, and Value - # ===================== - - # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] - mixed_x_layer = self.query_key_value(hidden_states) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - query_layer = query_layer.view( - query_layer.size()[:-1] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + (self.num_multi_query_groups_per_partition, self.hidden_size_per_attention_head) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + \ - (self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3) - - # apply relative positional encoding (rotary embedding) - if rotary_pos_emb is not None: - query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) - key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) - - # adjust key and value for inference - if kv_cache is not None: - cache_k, cache_v = kv_cache - key_layer = torch.cat((cache_k, key_layer), dim=0) - value_layer = torch.cat((cache_v, value_layer), dim=0) - if use_cache: - kv_cache = (key_layer, value_layer) - else: - kv_cache = None - - if self.multi_query_attention: - key_layer = key_layer.unsqueeze(-2) - key_layer = key_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - key_layer = key_layer.contiguous().view( - key_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - value_layer = value_layer.unsqueeze(-2) - value_layer = value_layer.expand( - -1, -1, -1, self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition, -1 - ) - value_layer = value_layer.contiguous().view( - value_layer.size()[:2] + (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) - ) - - # ================================== - # core attention computation - # ================================== - - context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) - - # ================= - # Output. [sq, b, h] - # ================= - - output = self.dense(context_layer) - - return output, kv_cache - - -def _config_to_kwargs(args): - common_kwargs = { - "dtype": args.torch_dtype, - } - return common_kwargs - - -class MLP(torch.nn.Module): - """MLP. - - MLP will take the input with h hidden state, project it to 4*h - hidden dimension, perform nonlinear transformation, and project the - state back into h hidden dimension. - """ - - def __init__(self, config: ChatGLMConfig, device=None): - super(MLP, self).__init__() - - self.add_bias = config.add_bias_linear - - # Project to 4h. 
If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf - self.dense_h_to_4h = nn.Linear( - config.hidden_size, - config.ffn_hidden_size * 2, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def swiglu(x): - x = torch.chunk(x, 2, dim=-1) - return F.silu(x[0]) * x[1] - - self.activation_func = swiglu - - # Project back to h. - self.dense_4h_to_h = nn.Linear( - config.ffn_hidden_size, - config.hidden_size, - bias=self.add_bias, - device=device, - **_config_to_kwargs(config) - ) - - def forward(self, hidden_states): - # [s, b, 4hp] - intermediate_parallel = self.dense_h_to_4h(hidden_states) - intermediate_parallel = self.activation_func(intermediate_parallel) - # [s, b, h] - output = self.dense_4h_to_h(intermediate_parallel) - return output - - -class GLMBlock(torch.nn.Module): - """A single transformer layer. - - Transformer layer takes input with size [s, b, h] and returns an - output of the same size. - """ - - def __init__(self, config: ChatGLMConfig, layer_number, device=None): - super(GLMBlock, self).__init__() - self.layer_number = layer_number - - self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm - - self.fp32_residual_connection = config.fp32_residual_connection - - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Layernorm on the input data. - self.input_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # Self attention. - self.self_attention = SelfAttention(config, layer_number, device=device) - self.hidden_dropout = config.hidden_dropout - - # Layernorm on the attention output - self.post_attention_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - # MLP - self.mlp = MLP(config, device=device) - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True, - ): - # hidden_states: [s, b, h] - - # Layer norm at the beginning of the transformer layer. - layernorm_output = self.input_layernorm(hidden_states) - # Self attention. - attention_output, kv_cache = self.self_attention( - layernorm_output, - attention_mask, - rotary_pos_emb, - kv_cache=kv_cache, - use_cache=use_cache - ) - - # Residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = hidden_states - - layernorm_input = torch.nn.functional.dropout(attention_output, p=self.hidden_dropout, training=self.training) - layernorm_input = residual + layernorm_input - - # Layer norm post the self attention. - layernorm_output = self.post_attention_layernorm(layernorm_input) - - # MLP. - mlp_output = self.mlp(layernorm_output) - - # Second residual connection. - if self.apply_residual_connection_post_layernorm: - residual = layernorm_output - else: - residual = layernorm_input - - output = torch.nn.functional.dropout(mlp_output, p=self.hidden_dropout, training=self.training) - output = residual + output - - return output, kv_cache - - -class GLMTransformer(torch.nn.Module): - """Transformer class.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(GLMTransformer, self).__init__() - - self.fp32_residual_connection = config.fp32_residual_connection - self.post_layer_norm = config.post_layer_norm - - # Number of layers. - self.num_layers = config.num_layers - - # Transformer layers. 
- def build_layer(layer_number): - return GLMBlock(config, layer_number, device=device) - - self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) - - if self.post_layer_norm: - LayerNormFunc = RMSNorm if config.rmsnorm else LayerNorm - # Final layer norm before output. - self.final_layernorm = LayerNormFunc(config.hidden_size, eps=config.layernorm_epsilon, device=device, - dtype=config.torch_dtype) - - self.gradient_checkpointing = False - - def _get_layer(self, layer_number): - return self.layers[layer_number] - - def forward( - self, hidden_states, attention_mask, rotary_pos_emb, kv_caches=None, - use_cache: Optional[bool] = True, - output_hidden_states: Optional[bool] = False, - ): - if not kv_caches: - kv_caches = [None for _ in range(self.num_layers)] - presents = () if use_cache else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - all_self_attentions = None - all_hidden_states = () if output_hidden_states else None - for index in range(self.num_layers): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer = self._get_layer(index) - if self.gradient_checkpointing and self.training: - layer_ret = torch.utils.checkpoint.checkpoint( - layer, - hidden_states, - attention_mask, - rotary_pos_emb, - kv_caches[index], - use_cache - ) - else: - layer_ret = layer( - hidden_states, - attention_mask, - rotary_pos_emb, - kv_cache=kv_caches[index], - use_cache=use_cache - ) - hidden_states, kv_cache = layer_ret - if use_cache: - presents = presents + (kv_cache,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - # Final layer norm. - if self.post_layer_norm: - hidden_states = self.final_layernorm(hidden_states) - - return hidden_states, presents, all_hidden_states, all_self_attentions - - -class ChatGLMPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and - a simple interface for downloading and loading pretrained models. 
- """ - - is_parallelizable = False - supports_gradient_checkpointing = True - config_class = ChatGLMConfig - base_model_prefix = "transformer" - _no_split_modules = ["GLMBlock"] - - def _init_weights(self, module: nn.Module): - """Initialize the weights.""" - return - - def get_masks(self, input_ids, past_key_values, padding_mask=None): - batch_size, seq_length = input_ids.shape - full_attention_mask = torch.ones(batch_size, seq_length, seq_length, device=input_ids.device) - full_attention_mask.tril_() - past_length = 0 - if past_key_values: - past_length = past_key_values[0][0].shape[0] - if past_length: - full_attention_mask = torch.cat((torch.ones(batch_size, seq_length, past_length, - device=input_ids.device), full_attention_mask), dim=-1) - if padding_mask is not None: - full_attention_mask = full_attention_mask * padding_mask.unsqueeze(1) - if not past_length and padding_mask is not None: - full_attention_mask -= padding_mask.unsqueeze(-1) - 1 - full_attention_mask = (full_attention_mask < 0.5).bool() - full_attention_mask.unsqueeze_(1) - return full_attention_mask - - def get_position_ids(self, input_ids, device): - batch_size, seq_length = input_ids.shape - position_ids = torch.arange(seq_length, dtype=torch.long, device=device).unsqueeze(0).repeat(batch_size, 1) - return position_ids - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, GLMTransformer): - module.gradient_checkpointing = value - - -class Embedding(torch.nn.Module): - """Language model embeddings.""" - - def __init__(self, config: ChatGLMConfig, device=None): - super(Embedding, self).__init__() - - self.hidden_size = config.hidden_size - # Word embeddings (parallel). - self.word_embeddings = nn.Embedding( - config.padded_vocab_size, - self.hidden_size, - dtype=config.torch_dtype, - device=device - ) - self.fp32_residual_connection = config.fp32_residual_connection - - def forward(self, input_ids): - # Embeddings. - words_embeddings = self.word_embeddings(input_ids) - embeddings = words_embeddings - # Data format change to avoid explicit transposes : [b s h] --> [s b h]. - embeddings = embeddings.transpose(0, 1).contiguous() - # If the input flag for fp32 residual connection is set, convert for float. 
- if self.fp32_residual_connection: - embeddings = embeddings.float() - return embeddings - - -class ChatGLMModel(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, device=None, empty_init=True): - super().__init__(config) - if empty_init: - init_method = skip_init - else: - init_method = default_init - init_kwargs = {} - if device is not None: - init_kwargs["device"] = device - self.embedding = init_method(Embedding, config, **init_kwargs) - self.num_layers = config.num_layers - self.multi_query_group_num = config.multi_query_group_num - self.kv_channels = config.kv_channels - - # Rotary positional embeddings - self.seq_length = config.seq_length - rotary_dim = ( - config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels - ) - - self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2, original_impl=config.original_rope, device=device, - dtype=config.torch_dtype) - self.encoder = init_method(GLMTransformer, config, **init_kwargs) - self.output_layer = init_method(nn.Linear, config.hidden_size, config.padded_vocab_size, bias=False, - dtype=config.torch_dtype, **init_kwargs) - self.pre_seq_len = config.pre_seq_len - self.prefix_projection = config.prefix_projection - if self.pre_seq_len is not None: - for param in self.parameters(): - param.requires_grad = False - self.prefix_tokens = torch.arange(self.pre_seq_len).long() - self.prefix_encoder = PrefixEncoder(config) - self.dropout = torch.nn.Dropout(0.1) - - def get_input_embeddings(self): - return self.embedding.word_embeddings - - def get_prompt(self, batch_size, device, dtype=torch.half): - prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(device) - past_key_values = self.prefix_encoder(prefix_tokens).type(dtype) - past_key_values = past_key_values.view( - batch_size, - self.pre_seq_len, - self.num_layers * 2, - self.multi_query_group_num, - self.kv_channels - ) - # seq_len, b, nh, hidden_size - past_key_values = self.dropout(past_key_values) - past_key_values = past_key_values.permute([2, 1, 0, 3, 4]).split(2) - return past_key_values - - def forward( - self, - input_ids, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.BoolTensor] = None, - full_attention_mask: Optional[torch.BoolTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - batch_size, seq_length = input_ids.shape - - if inputs_embeds is None: - inputs_embeds = self.embedding(input_ids) - - if self.pre_seq_len is not None: - if past_key_values is None: - past_key_values = self.get_prompt(batch_size=batch_size, device=input_ids.device, - dtype=inputs_embeds.dtype) - if attention_mask is not None: - attention_mask = torch.cat([attention_mask.new_ones((batch_size, self.pre_seq_len)), - attention_mask], dim=-1) - - if full_attention_mask is None: - if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1): - full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) - - # Rotary 
positional embeddings - rotary_pos_emb = self.rotary_pos_emb(self.seq_length) - if position_ids is not None: - rotary_pos_emb = rotary_pos_emb[position_ids] - else: - rotary_pos_emb = rotary_pos_emb[None, :seq_length] - rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous() - - # Run encoder. - hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( - inputs_embeds, full_attention_mask, rotary_pos_emb=rotary_pos_emb, - kv_caches=past_key_values, use_cache=use_cache, output_hidden_states=output_hidden_states - ) - - if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) - - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=presents, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) - - def quantize(self, weight_bit_width: int): - from .quantization import quantize - quantize(self.encoder, weight_bit_width) - return self - - -class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.max_sequence_length = config.max_length - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - self.config = config - self.quantized = False - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def _update_model_kwargs_for_generation( - self, - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, - ) -> Dict[str, Any]: - # update past_key_values - model_kwargs["past_key_values"] = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) - - # update attention mask - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 - ) - - # update position ids - if "position_ids" in model_kwargs: - position_ids = model_kwargs["position_ids"] - new_position_id = position_ids[..., -1:].clone() - new_position_id += 1 - model_kwargs["position_ids"] = torch.cat( - [position_ids, new_position_id], dim=-1 - ) - - model_kwargs["is_first_forward"] = False - return model_kwargs - - def prepare_inputs_for_generation( - self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, - is_first_forward: bool = True, - **kwargs - ) -> dict: - # only last token for input_ids if past is not None - if position_ids is None: - position_ids = self.get_position_ids(input_ids, device=input_ids.device) - if not is_first_forward: - if past_key_values is not None: - position_ids = position_ids[..., -1:] - input_ids = input_ids[:, -1:] - return { - "input_ids": input_ids, - "past_key_values": past_key_values, - "position_ids": position_ids, - "attention_mask": attention_mask, - "return_last_logit": True, - "use_cache": use_cache - } - - def forward( - self, - input_ids: Optional[torch.Tensor] = None, - position_ids: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - use_cache: 
Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - return_last_logit: Optional[bool] = False, - ): - use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - if return_last_logit: - hidden_states = hidden_states[-1:] - lm_logits = self.transformer.output_layer(hidden_states) - lm_logits = lm_logits.transpose(0, 1).contiguous() - - loss = None - if labels is not None: - lm_logits = lm_logits.to(torch.float32) - - # Shift so that tokens < n predict n - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - - lm_logits = lm_logits.to(hidden_states.dtype) - loss = loss.to(hidden_states.dtype) - - if not return_dict: - output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=lm_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) - - @staticmethod - def _reorder_cache( - past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: - """ - This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or - [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct - beam_idx at every generation step. - - Output shares the same memory storage as `past`. 
- """ - return tuple( - ( - layer_past[0].index_select(1, beam_idx.to(layer_past[0].device)), - layer_past[1].index_select(1, beam_idx.to(layer_past[1].device)), - ) - for layer_past in past - ) - - def process_response(self, output, history): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content, history - - @torch.inference_mode() - def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - max_length: int = 8192, num_beams=1, do_sample=True, top_p=0.8, temperature=0.8, logits_processor=None, - **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - gen_kwargs = {"max_length": max_length, "num_beams": num_beams, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - inputs = tokenizer.build_chat_input(query, history=history, role=role) - inputs = inputs.to(self.device) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - outputs = self.generate(**inputs, **gen_kwargs, eos_token_id=eos_token_id) - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - history.append({"role": role, "content": query}) - response, history = self.process_response(response, history) - return response, history - - @torch.inference_mode() - def stream_chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", - past_key_values=None,max_length: int = 8192, do_sample=True, top_p=0.8, temperature=0.8, - logits_processor=None, return_past_key_values=False, **kwargs): - if history is None: - history = [] - if logits_processor is None: - logits_processor = LogitsProcessorList() - logits_processor.append(InvalidScoreLogitsProcessor()) - eos_token_id = [tokenizer.eos_token_id, tokenizer.get_command("<|user|>"), - tokenizer.get_command("<|observation|>")] - gen_kwargs = {"max_length": max_length, "do_sample": do_sample, "top_p": top_p, - "temperature": temperature, "logits_processor": logits_processor, **kwargs} - if past_key_values is None: - inputs = tokenizer.build_chat_input(query, history=history, role=role) - else: - inputs = tokenizer.build_chat_input(query, role=role) - inputs = inputs.to(self.device) - if past_key_values is not None: - past_length = past_key_values[0][0].shape[0] - if self.transformer.pre_seq_len is not None: - past_length -= self.transformer.pre_seq_len - inputs.position_ids += past_length - attention_mask = inputs.attention_mask - attention_mask = torch.cat((attention_mask.new_ones(1, past_length), attention_mask), dim=1) - inputs['attention_mask'] = attention_mask - history.append({"role": role, "content": query}) - for outputs in 
self.stream_generate(**inputs, past_key_values=past_key_values, - eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, - **gen_kwargs): - if return_past_key_values: - outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] - response = tokenizer.decode(outputs) - if response and response[-1] != "�": - response, new_history = self.process_response(response, history) - if return_past_key_values: - yield response, new_history, past_key_values - else: - yield response, new_history - - @torch.inference_mode() - def stream_generate( - self, - input_ids, - generation_config: Optional[GenerationConfig] = None, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values=False, - **kwargs, - ): - batch_size, input_ids_seq_length = input_ids.shape[0], input_ids.shape[-1] - - if generation_config is None: - generation_config = self.generation_config - generation_config = copy.deepcopy(generation_config) - model_kwargs = generation_config.update(**kwargs) - model_kwargs["use_cache"] = generation_config.use_cache - bos_token_id, eos_token_id = generation_config.bos_token_id, generation_config.eos_token_id - - if isinstance(eos_token_id, int): - eos_token_id = [eos_token_id] - eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None - - has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if has_default_max_length and generation_config.max_new_tokens is None: - warnings.warn( - f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. " - "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we" - " recommend using `max_new_tokens` to control the maximum length of the generation.", - UserWarning, - ) - elif generation_config.max_new_tokens is not None: - generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length - if not has_default_max_length: - logger.warn( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)", - UserWarning, - ) - - if input_ids_seq_length >= generation_config.max_length: - input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) - - # 2. 
Set generation parameters if not already defined - logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() - stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - - logits_processor = self._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) - - stopping_criteria = self._get_stopping_criteria( - generation_config=generation_config, stopping_criteria=stopping_criteria - ) - logits_warper = self._get_logits_warper(generation_config) - - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - scores = None - while True: - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=False, - output_hidden_states=False, - ) - - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor(input_ids, next_token_logits) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - if generation_config.do_sample: - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - else: - next_tokens = torch.argmax(probs, dim=-1) - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - unfinished_sequences = unfinished_sequences.mul( - next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0) - ) - if return_past_key_values: - yield input_ids, outputs.past_key_values - else: - yield input_ids - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break - - def quantize(self, bits: int, empty_init=False, device=None, **kwargs): - if bits == 0: - return - - from .quantization import quantize - - if self.quantized: - logger.info("Already quantized.") - return self - - self.quantized = True - - self.config.quantization_bit = bits - - self.transformer.encoder = quantize(self.transformer.encoder, bits, empty_init=empty_init, device=device, - **kwargs) - return self - - -class ChatGLMForSequenceClassification(ChatGLMPreTrainedModel): - def __init__(self, config: ChatGLMConfig, empty_init=True, device=None): - super().__init__(config) - - self.num_labels = config.num_labels - self.transformer = ChatGLMModel(config, empty_init=empty_init, device=device) - - self.classifier_head = nn.Linear(config.hidden_size, config.num_labels, bias=True, dtype=torch.half) - if config.classifier_dropout is not None: - self.dropout = nn.Dropout(config.classifier_dropout) - else: - self.dropout = None - self.config = config - - if self.config.quantization_bit: - self.quantize(self.config.quantization_bit, empty_init=True) - - def forward( - self, - input_ids: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - full_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, - inputs_embeds: 
Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple[torch.Tensor, ...], SequenceClassifierOutputWithPast]: - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.transformer( - input_ids=input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - full_attention_mask=full_attention_mask, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = transformer_outputs[0] - pooled_hidden_states = hidden_states[-1] - if self.dropout is not None: - pooled_hidden_states = self.dropout(pooled_hidden_states) - logits = self.classifier_head(pooled_hidden_states) - - loss = None - if labels is not None: - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(logits.squeeze().float(), labels.squeeze()) - else: - loss = loss_fct(logits.float(), labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels).float(), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(logits.float(), labels.view(-1, self.num_labels)) - - if not return_dict: - output = (logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py deleted file mode 100644 index 4cd1b6e18e8..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/modeling_llama.py +++ /dev/null @@ -1,1263 +0,0 @@ -# coding=utf-8 -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" PyTorch LLaMA model.""" -import math -import warnings -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -import torch.utils.checkpoint -from torch import nn -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss - -from transformers.activations import ACT2FN -from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask -from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS -from transformers.utils import ( - add_start_docstrings, - add_start_docstrings_to_model_forward, - is_flash_attn_2_available, - logging, - replace_return_docstrings, -) -from transformers.utils.import_utils import is_torch_fx_available -from transformers.models.llama.configuration_llama import LlamaConfig - - -if is_flash_attn_2_available(): - from flash_attn import flash_attn_func, flash_attn_varlen_func - from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa -### INC code ### -from neural_compressor.torch.quantization.modules import Matmul, BatchMatmul, Autocast - -# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. -# It means that the function will not be traced through and simply appear as a node in the graph. -if is_torch_fx_available(): - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) - - -logger = logging.get_logger(__name__) - -_CONFIG_FOR_DOC = "LlamaConfig" - - -def _get_unpad_data(attention_mask): - seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) - indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() - max_seqlen_in_batch = seqlens_in_batch.max().item() - cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) - return ( - indices, - cu_seqlens, - max_seqlen_in_batch, - ) - - -def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): - warnings.warn( - "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask" - ) - return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) - - -def _make_causal_mask( - input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 -): - warnings.warn( - "Calling `transformers.models.llama.modeling_llama._make_causal_mask` is deprecated and will be removed in v4.37. 
Use `transformers.models.llama.modeling_llama.AttentionMaskConverter._make_causal_mask" - ) - return AttentionMaskConverter._make_causal_mask( - input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length - ) - - -class LlamaRMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - LlamaRMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - -ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) - - -class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) - - return ( - self.cos_cached[:seq_len].to(dtype=x.dtype), - self.sin_cached[:seq_len].to(dtype=x.dtype), - ) - - -class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): - """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): - self.scaling_factor = scaling_factor - super().__init__(dim, max_position_embeddings, base, device) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
- """ - cos = cos[position_ids].unsqueeze(unsqueeze_dim) - sin = sin[position_ids].unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -class LlamaMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.intermediate_size = config.intermediate_size - self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) - self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) - self.act_fn = ACT2FN[config.hidden_act] - - def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - - return down_proj - - -def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, - num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = hidden_states.shape - if n_rep == 1: - return hidden_states - hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) - return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -class LlamaAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config: LlamaConfig): - super().__init__() - self.config = config - self.attention_dropout = config.attention_dropout - self.hidden_size = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.hidden_size // self.num_heads - self.num_key_value_heads = config.num_key_value_heads - self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta - self.is_causal = True - - if (self.head_dim * self.num_heads) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads})." 
- ) - - self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) - self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) - self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) - self._init_rope() - ### INC code ### - self.matmul1 = Matmul() - self.matmul2 = Matmul() - self.cast1 = Autocast() - self.cast2 = Autocast() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") - - def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" - ) - - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - ### INC code ### - key_states = self.cast1(key_states) - value_states = self.cast2(value_states) - # import habana_frameworks.torch.core as htcore - # htcore.mark_step() - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - ### INC code ### - attn_weights = self.matmul1(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - - ### INC code ### - attn_output = self.matmul2(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - if self.config.pretraining_tp > 1: - attn_output = 
attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class LlamaFlashAttention2(LlamaAttention): - """ - Llama flash attention module. This module inherits from `LlamaAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, - **kwargs, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # LlamaFlashAttention2 attention does not support output_attentions - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - # overwrite attention_mask with padding_mask - attention_mask = kwargs.pop("padding_mask") - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = 0.0 if not self.training else self.attention_dropout - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. 
(LlamaRMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - def _flash_attention_forward( - self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None - ): - """ - Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token - first unpad the input, then computes the attention scores and pad the final attention scores. - - Args: - query_states (`torch.Tensor`): - Input query states to be passed to Flash Attention API - key_states (`torch.Tensor`): - Input key states to be passed to Flash Attention API - value_states (`torch.Tensor`): - Input value states to be passed to Flash Attention API - attention_mask (`torch.Tensor`): - The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the - position of padding tokens and 1 for the position of non-padding tokens. - dropout (`int`, *optional*): - Attention dropout - softmax_scale (`float`, *optional*): - The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) - """ - # Contains at least one padding token in the sequence - if attention_mask is not None: - batch_size = query_states.shape[0] - query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( - query_states, key_states, value_states, attention_mask, query_length - ) - - cu_seqlens_q, cu_seqlens_k = cu_seq_lens - max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens - - attn_output_unpad = flash_attn_varlen_func( - query_states, - key_states, - value_states, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_k=cu_seqlens_k, - max_seqlen_q=max_seqlen_in_batch_q, - max_seqlen_k=max_seqlen_in_batch_k, - dropout_p=dropout, - softmax_scale=softmax_scale, - causal=self.is_causal, - ) - - attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) - else: - attn_output = flash_attn_func( - query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal - ) - - return attn_output - - def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): - indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) - batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape - - key_layer = index_first_axis( - key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - value_layer = index_first_axis( - value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k - ) - if query_length == kv_seq_len: - query_layer = index_first_axis( - query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k - ) - cu_seqlens_q = cu_seqlens_k - max_seqlen_in_batch_q = max_seqlen_in_batch_k - indices_q = indices_k - elif query_length == 1: - max_seqlen_in_batch_q = 1 - cu_seqlens_q = torch.arange( - batch_size + 1, dtype=torch.int32, device=query_layer.device - ) # There is a memcpy here, that is very bad. - indices_q = cu_seqlens_q[:-1] - query_layer = query_layer.squeeze(1) - else: - # The -q_len: slice assumes left padding. 
- attention_mask = attention_mask[:, -query_length:] - query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) - - return ( - query_layer, - key_layer, - value_layer, - indices_q, - (cu_seqlens_q, cu_seqlens_k), - (max_seqlen_in_batch_q, max_seqlen_in_batch_k), - ) - - -class LlamaDecoderLayer(nn.Module): - def __init__(self, config: LlamaConfig): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = ( - LlamaAttention(config=config) - if not getattr(config, "_flash_attn_2_enabled", False) - else LlamaFlashAttention2(config=config) - ) - self.mlp = LlamaMLP(config) - self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, - use_cache: Optional[bool] = False, - **kwargs, - ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). - past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - """ - if "padding_mask" in kwargs: - warnings.warn( - "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" - ) - - residual = hidden_states - - hidden_states = self.input_layernorm(hidden_states) - - # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - **kwargs, - ) - hidden_states = residual + hidden_states - - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (self_attn_weights,) - - if use_cache: - outputs += (present_key_value,) - - return outputs - - -LLAMA_START_DOCSTRING = r""" - This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the - library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads - etc.) - - This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. - Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. 
- - Parameters: - config ([`LlamaConfig`]): - Model configuration class with all the parameters of the model. Initializing with a config file does not - load the weights associated with the model, only the configuration. Check out the - [`~PreTrainedModel.from_pretrained`] method to load the model weights. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaPreTrainedModel(PreTrainedModel): - config_class = LlamaConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["LlamaDecoderLayer"] - _skip_keys_device_placement = "past_key_values" - _supports_flash_attn_2 = True - - def _init_weights(self, module): - std = self.config.initializer_range - if isinstance(module, nn.Linear): - module.weight.data.normal_(mean=0.0, std=std) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, nn.Embedding): - module.weight.data.normal_(mean=0.0, std=std) - if module.padding_idx is not None: - module.weight.data[module.padding_idx].zero_() - - -LLAMA_INPUTS_DOCSTRING = r""" - Args: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide - it. - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - [What are input IDs?](../glossary#input-ids) - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - - Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and - [`PreTrainedTokenizer.__call__`] for details. - - If `past_key_values` is used, optionally only the last `input_ids` have to be input (see - `past_key_values`). - - If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] - and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more - information on the default strategy. - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, - config.n_positions - 1]`. - - [What are position IDs?](../glossary#position-ids) - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape - `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention - blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. 
- - If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't - have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` - of shape `(batch_size, sequence_length)`. - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This - is useful if you want more control over how to convert `input_ids` indices into associated vectors than the - model's internal embedding lookup matrix. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see - `past_key_values`). - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. -""" - - -@add_start_docstrings( - "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", - LLAMA_START_DOCSTRING, -) -class LlamaModel(LlamaPreTrainedModel): - """ - Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] - - Args: - config: LlamaConfig - """ - - def __init__(self, config: LlamaConfig): - super().__init__(config) - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) - self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - self.gradient_checkpointing = False - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPast]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # retrieve input_ids and inputs_embeds - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = 
inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - past_key_values_length = 0 - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - - if position_ids is None: - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device - ) - position_ids = position_ids.unsqueeze(0) - - if inputs_embeds is None: - inputs_embeds = self.embed_tokens(input_ids) - - if getattr(self.config, "_flash_attn_2_enabled", False): - # 2d mask is passed through the layers - attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None - else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length - ) - - # embed positions - hidden_states = inputs_embeds - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - # decoder layers - all_hidden_states = () if output_hidden_states else None - all_self_attns = () if output_attentions else None - next_decoder_cache = () if use_cache else None - - for idx, decoder_layer in enumerate(self.layers): - if output_hidden_states: - all_hidden_states += (hidden_states,) - - past_key_value = past_key_values[idx] if past_key_values is not None else None - - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - decoder_layer.__call__, - hidden_states, - attention_mask, - position_ids, - past_key_value, - output_attentions, - use_cache, - ) - else: - layer_outputs = decoder_layer( - hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - ) - - hidden_states = layer_outputs[0] - - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) - - if output_attentions: - all_self_attns += (layer_outputs[1],) - - hidden_states = self.norm(hidden_states) - - # add hidden states from the last decoder layer - if output_hidden_states: - all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None - if not return_dict: - return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) - return BaseModelOutputWithPast( - last_hidden_state=hidden_states, - past_key_values=next_cache, - hidden_states=all_hidden_states, - attentions=all_self_attns, - ) - - -class LlamaForCausalLM(LlamaPreTrainedModel): - _tied_weights_keys = ["lm_head.weight"] - - def __init__(self, config): - super().__init__(config) - self.model = LlamaModel(config) - self.vocab_size = config.vocab_size - self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, new_embeddings): - self.lm_head = new_embeddings - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - 
return self.model - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, CausalLMOutputWithPast]: - r""" - Args: - labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., - config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. - - Returns: - - Example: - - ```python - >>> from transformers import AutoTokenizer, LlamaForCausalLM - - >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) - >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) - - >>> prompt = "Hey, are you conscious? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") - - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." - ```""" - - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) - outputs = self.model( - input_ids=input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - logits = self.lm_head(hidden_states) - logits = logits.float() - - loss = None - if labels is not None: - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) - - if not return_dict: - output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output - - return CausalLMOutputWithPast( - loss=loss, - logits=logits, - 
past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -input_ids.shape[1] :] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def _reorder_cache(past_key_values, beam_idx): - reordered_past = () - for layer_past in past_key_values: - reordered_past += ( - tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), - ) - return reordered_past - - -@add_start_docstrings( - """ - The LLaMa Model transformer with a sequence classification head on top (linear layer). - - [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models - (e.g. GPT-2) do. - - Since it does classification on the last token, it requires to know the position of the last token. If a - `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If - no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the - padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in - each row of the batch). 
- """, - LLAMA_START_DOCSTRING, -) -class LlamaForSequenceClassification(LlamaPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.model = LlamaModel(config) - self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) - - # Initialize weights and apply final processing - self.post_init() - - def get_input_embeddings(self): - return self.model.embed_tokens - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - labels: Optional[torch.LongTensor] = None, - use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, SequenceClassifierOutputWithPast]: - r""" - labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): - Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If - `config.num_labels > 1` a classification loss is computed (Cross-Entropy). - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - transformer_outputs = self.model( - input_ids, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - inputs_embeds=inputs_embeds, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - hidden_states = transformer_outputs[0] - logits = self.score(hidden_states) - - if input_ids is not None: - batch_size = input_ids.shape[0] - else: - batch_size = inputs_embeds.shape[0] - - if self.config.pad_token_id is None and batch_size != 1: - raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") - if self.config.pad_token_id is None: - sequence_lengths = -1 - else: - if input_ids is not None: - sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).long().argmax(-1) - 1).to( - logits.device - ) - else: - sequence_lengths = -1 - - pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] - - loss = None - if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == "multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) - if not return_dict: - output = (pooled_logits,) + 
transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output - - return SequenceClassifierOutputWithPast( - loss=loss, - logits=pooled_logits, - past_key_values=transformer_outputs.past_key_values, - hidden_states=transformer_outputs.hidden_states, - attentions=transformer_outputs.attentions, - ) diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py deleted file mode 100644 index 5b7054d3227..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/models/tokenization_baichuan.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright 2023 Baichuan Inc. All Rights Reserved. - -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm - -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer -from transformers.utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {}, - "tokenizer_file": {}, -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} - - -class BaichuanTokenizer(PreTrainedTokenizer): - """ - Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. - - Args: - vocab_file (`str`): - Path to the vocabulary file. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - unk_token="", - bos_token="", - eos_token="", - pad_token=None, - sp_model_kwargs: Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - ### INC code ### - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - #self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - #self.sp_model.Load(vocab_file) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - @property - def vocab_size(self): - """Returns vocab size""" - return self.sp_model.get_piece_size() - - def get_vocab(self): - """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - """Returns a tokenized string.""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special and i != 0: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string - - def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. 
- - Returns: - `Tuple(str)`: Paths to the files saved. - """ - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = bos_token_id + token_ids_0 + eos_token_id - - if token_ids_1 is not None: - output = output + bos_token_id + token_ids_1 + eos_token_id - - return output - - def get_special_tokens_mask( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - bos_token_id = [1] if self.add_bos_token else [] - eos_token_id = [1] if self.add_eos_token else [] - - if token_ids_1 is None: - return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) - - def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT - sequence pair mask has the following format: - - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - - if token_ids_1 is None, only returns the first portion of the mask (0s). - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
- """ - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) - - if token_ids_1 is not None: - output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) - - return output diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt deleted file mode 100644 index d3655acd742..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/requirement.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -datasets -accelerate -SentencePiece -lm_eval==0.3.0 -openpyxl -einops diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py deleted file mode 100644 index 5cd0f046aba..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/run_llm.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "False" - -### USE_GAUDI2_SCALE requires PT_USE_FP8_AMAX for torch.mm/bmm, or got failure -# os.environ["USE_GAUDI2_SCALE"] = "True" -# os.environ["PT_USE_FP8_AMAX"] = "True" - -### graphs will dump to .graph_dumps folder -# os.environ["GRAPH_VISUALIZATION"] = "True" -# import shutil -# shutil.rmtree(".graph_dumps", ignore_errors=True) - -import argparse -import time -import json -import re -import torch -import habana_frameworks.torch.hpex -import torch.nn.functional as F -import deepspeed -import transformers -from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig -import habana_frameworks.torch.core as htcore - -from utils import show_msg, eval_func, init_empty_model, init_model, init_tokenizer - - -torch.set_grad_enabled(False) -htcore.hpu_set_env() -torch.device('hpu') - - -parser = argparse.ArgumentParser() -parser.add_argument( - "--model", nargs="?", default="facebook/opt-125m" -) -parser.add_argument( - "--trust_remote_code", default=True, - help="Transformers parameter: use the external repo") -parser.add_argument( - "--revision", default=None, - help="Transformers parameter: set the model hub commit number") -parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k", const="NeelNanda/pile-10k") -parser.add_argument("--output_dir", nargs="?", default="./saved_results") -parser.add_argument("--to_graph", action="store_true") -parser.add_argument("--approach", type=str, default=None, - help="Select from ['dynamic', 'static' 'cast']") -parser.add_argument("--precision", type=str, default='fp32', - help="Select from ['fp8_e4m3', 'fp8_e5m2', 'bf16', 'fp16', 'fp32'], \ - ['bf16', 'fp16'] only work with cast approach") -parser.add_argument("--autotune", action="store_true") -parser.add_argument("--accuracy", action="store_true") -parser.add_argument("--performance", action="store_true") -parser.add_argument("--generate", action="store_true") -parser.add_argument("--skip_fp8_mm", action="store_true") -parser.add_argument("--dump_to_excel", action="store_true") -parser.add_argument("--save", action="store_true") -parser.add_argument("--load", action="store_true") -parser.add_argument("--batch_size", default=1, type=int, - help="For accuracy measurement only.") 
-parser.add_argument("--pad_max_length", default=512, type=int, - help="Pad input ids to max length.") -parser.add_argument("--calib_iters", default=100, type=int, - help="calibration iters.") -parser.add_argument("--tasks", nargs='+', default=["lambada_openai"], \ - type=str, choices=["hellaswag", "lambada_openai", "piqa", "winogrande", "copa", - "rte", "openbookqa", "lambada_standard", "wikitext"], - help="tasks list for accuracy validation") -parser.add_argument("--limit", default=None, type=int, - help="the sample num of evaluation.") -parser.add_argument("--max_new_tokens", default=100, type=int, - help="calibration iters.") -parser.add_argument('--buckets', type=int, nargs='+', \ - help="Input length buckets to use with static_shapes", default=[256, 512]) -parser.add_argument("--local_rank", - type=int, - default=-1, - help="local_rank for distributed training on gpus") -parser.add_argument("--skip_lm_head", action="store_true") -args = parser.parse_args() - - -world_size = int(os.getenv('WORLD_SIZE', '1')) -local_rank = int(os.getenv('LOCAL_RANK', '-1')) - - -if args.load: - user_model = init_empty_model(args.model) -else: - user_model = init_model(args) -user_model.eval() - - -tokenizer = init_tokenizer(args) - - -### dynamic & static quantization ### -if args.approach in ["dynamic", "static"] and not args.load: - print("device:", next(user_model.parameters()).device) - from neural_compressor.torch.quantization import ( - quantize, autotune, FP8Config, get_default_fp8_config, TuningConfig, get_default_fp8_config_set - ) - dtype = args.precision - if args.approach == "dynamic": - from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic - user_model = quantize_dynamic(user_model, dtype, inplace=True) - elif args.approach == "static": - qconfig = FP8Config(w_dtype=dtype, act_dtype=dtype, approach="static") - if args.skip_lm_head: - fp32_config = FP8Config(w_dtype="fp32", act_dtype="fp32") - qconfig.set_local("lm_head", fp32_config) - # dataset - from datasets import load_dataset - calib_dataset = load_dataset(args.dataset, split="train").select(range(100)) - calib_dataset = calib_dataset.shuffle(seed=42) - calib_data = [] - for examples in calib_dataset: - calib_data.append( - tokenizer( - examples["text"], - return_tensors="pt", - max_length=64, - padding="max_length", - truncation=True - ) - ) - - def calib_func(model): - for i, calib_input in enumerate(calib_data): - if i >= args.calib_iters: - break - model( - input_ids=calib_input["input_ids"].to('hpu'), - attention_mask=calib_input["attention_mask"].to('hpu'), - ) - - user_model = quantize(user_model, qconfig, calib_func, inplace=True) - # saving - print(user_model) - if args.save and local_rank in [-1, 0]: - user_model.save("saved_results") - - -if args.load: - from neural_compressor.torch.quantization import load - user_model = load("saved_results", user_model) - - -if args.approach in ["dynamic", "static"] or args.load: - # It enables weights constant folding - from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - _mark_params_as_const(user_model) # can reduce memory allocated and speed up - _check_params_as_const(user_model) - - - -# If torch.matmul and torch.bmm are not replaced by INC module, -# Below codes can make torch.matmul and torch.bmm run on fp8 by injection. 
-if not args.skip_fp8_mm and args.precision in ['fp8_e4m3', 'fp8_e5m2']: - def replace_torch_mm_bmm(): - from neural_compressor.torch.amp.fp8.functions import fp8_matmul - torch.matmul = fp8_matmul - torch.bmm = fp8_matmul - - replace_torch_mm_bmm() - - -# inference optimization -if args.to_graph: - import habana_frameworks.torch.hpu.graphs as htgraphs - user_model = htgraphs.wrap_in_hpu_graph(user_model) - - -# dump message of HPU after quantization or reloading -show_msg() - - -### generation, performance and accuracy validation ### -if args.generate: - input_prompt = "Here is my prompt" - print("Prompt sentence:", input_prompt) - generation_config = { - "min_new_tokens": args.max_new_tokens, "max_new_tokens": args.max_new_tokens, - # "do_sample": False, "temperature": 0.9, "num_beams": 4, - } - input_tokens = tokenizer(input_prompt, return_tensors="pt").to('hpu') - eval_start = time.perf_counter() - if args.approach == "cast": - from neural_compressor.torch.amp import autocast - if args.precision == "fp8_e4m3": - dtype = torch.float8_e4m3fn - elif args.precision == "fp8_e5m2": - dtype = torch.float8_e5m2 - elif args.precision == "fp16": - dtype = torch.float16 - elif args.precision == "bf16": - dtype = torch.bfloat16 - with autocast('hpu', dtype=dtype): - outputs = user_model.generate(**input_tokens, **generation_config) - else: - outputs = user_model.generate(**input_tokens, **generation_config) - - output_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True) - eval_end = time.perf_counter() - print("Generated sentence:", output_sentence) - print("Duration:", eval_end - eval_start) - - -if args.performance: - eval_start = time.perf_counter() - input_prompt = "Intel is a company which" - input_tokens = torch.ones((1, 128), dtype=torch.long).to('hpu') - generation_config = {"min_new_tokens": 100, "max_new_tokens": 100} - outputs = user_model.generate(input_tokens, **generation_config) - print("Duration of generating 100 tokens :", time.perf_counter() - eval_start) - - -if args.accuracy: - eval_func(user_model, tokenizer=tokenizer, args=args) - -# dump final message of HPU -show_msg() diff --git a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py b/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py deleted file mode 100644 index 843287cddfa..00000000000 --- a/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/habana_fp8/utils.py +++ /dev/null @@ -1,255 +0,0 @@ -import os -import re -import torch -from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer - - -world_size = int(os.getenv('WORLD_SIZE', '1')) -local_rank = int(os.getenv('LOCAL_RANK', '-1')) - - -def init_model(args): - import deepspeed - model_dtype = torch.float32 - if re.search("llama", args.model.lower()) or re.search("bloom", args.model.lower()): - if world_size > 1: - config = AutoConfig.from_pretrained(args.model) - model_dtype = torch.bfloat16 # RuntimeErrorCastToFp8V2 input must be of float or bfloat16 dtype - deepspeed.init_distributed(dist_backend="hccl") - with deepspeed.OnDevice(dtype=model_dtype, device="meta"): - user_model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - import tempfile - checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") - from optimum.habana.checkpoint_utils import write_checkpoints_json # in optimum-habana - write_checkpoints_json( - args.model, - local_rank, - checkpoints_json, - token=None, - ) 
- else: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - device_map='hpu', - torch_dtype=model_dtype, - ) - elif re.search("chatglm", args.model.lower()): - from models.modeling_chatglm import ChatGLMForConditionalGeneration - user_model = ChatGLMForConditionalGeneration.from_pretrained( - args.model, - revision=args.revision, - device_map='hpu', - torch_dtype=model_dtype, - ) - # print(user_model.transformer.output_layer.weight.dtype) # always fp16 - user_model.float() # static fp8 need float32 for graph compiler - else: - user_model = AutoModelForCausalLM.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code, - revision=args.revision, - device_map='hpu', - torch_dtype=model_dtype, - ) - # load weight for multi-cards - if world_size > 1: - if re.search("llama", args.model.lower()) or re.search("bloom", args.model.lower()): - ds_inference_kwargs = {"dtype": model_dtype} - ds_inference_kwargs["tensor_parallel"] = {"tp_size": world_size} - ds_inference_kwargs["enable_cuda_graph"] = False - from transformers.models.llama.modeling_llama import LlamaDecoderLayer - ds_inference_kwargs["injection_policy"] = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} - ds_inference_kwargs["checkpoint"] = checkpoints_json.name - ds_model = deepspeed.init_inference(user_model, **ds_inference_kwargs) - else: - ds_model = deepspeed.init_inference(user_model, - mp_size=world_size, - replace_with_kernel_inject=False) - user_model = ds_model.module - return user_model - - -def init_empty_model(model_name): - from accelerate import init_empty_weights - model_dtype = torch.float32 - config = AutoConfig.from_pretrained(model_name) - with init_empty_weights(): - model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) - return model - - -def init_tokenizer(args): - # tokenizer - if re.search("baichuan", args.model.lower()): - from models.tokenization_baichuan import BaichuanTokenizer - tokenizer = BaichuanTokenizer.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - args.model, - trust_remote_code=args.trust_remote_code - ) - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def show_msg(): - import numpy as np - import glob - from habana_frameworks.torch.hpu import memory_stats - print("Number of HPU graphs:", len(glob.glob(".graph_dumps/*PreGraph*"))) - mem_stats = memory_stats() - mem_dict = { - "memory_allocated (GB)": np.round(mem_stats["InUse"] / 1024**3, 2), - "max_memory_allocated (GB)": np.round(mem_stats["MaxInUse"] / 1024**3, 2), - "total_memory_available (GB)": np.round(mem_stats["Limit"] / 1024**3, 2), - } - for k, v in mem_dict.items(): - print("{:35} = {} GB".format(k[:-5].replace("_", " ").capitalize(), v)) - - -def itrex_bootstrap_stderr(f, xs, iters): - from lm_eval.metrics import _bootstrap_internal, sample_stddev - res = [] - chunk_size = min(1000, iters) - it = _bootstrap_internal(f, chunk_size) - for i in range(iters // chunk_size): - bootstrap = it((i, xs)) - res.extend(bootstrap) - return sample_stddev(res) - - -def save_to_excel(dict): - import pandas as pd - df_new = pd.DataFrame(dict) - try: - df_existing = pd.read_excel('output.xlsx') - except FileNotFoundError: - df_existing = pd.DataFrame() - df_combined = pd.concat([df_existing, df_new], axis=0, ignore_index=True) - df_combined.to_excel('output.xlsx', index=False, engine='openpyxl', header=True) - - -def eval_func(user_model, tokenizer, args): - import os - import re - import 
time - import json - import torch - import habana_frameworks.torch.hpex - import torch.nn.functional as F - import lm_eval - import lm_eval.tasks - import lm_eval.evaluator - - # to avoid out-of-memory caused by Popen for large language models. - lm_eval.metrics.bootstrap_stderr = itrex_bootstrap_stderr - - class HabanaModelAdapter(lm_eval.base.BaseLM): - def __init__(self, tokenizer, model, args, options): - super().__init__() - self.tokenizer = tokenizer - self.model = model.eval() - self._batch_size = args.batch_size - self.buckets = list(sorted(args.buckets)) - self.options = options - self._device = "hpu" - torch.set_grad_enabled(False) - - @property - def eot_token_id(self): - return self.model.config.eos_token_id - - @property - def max_length(self): - return self.buckets[-1] - - @property - def max_gen_toks(self): - raise NotImplementedError() - - @property - def batch_size(self): - return self._batch_size - - @property - def device(self): - # We need to do padding ourselves, otherwise we'll end up with recompilations - # Returning 'cpu' to keep tensors on CPU in lm_eval code - return 'cpu' # 'hpu' - - def tok_encode(self, string): - if ( - re.search("chatglm3", args.model.lower()) or - re.search("llama", args.model.lower()) or - re.search("mistral", args.model.lower()) - ): - string = string.lstrip() - return self.tokenizer.encode(string, add_special_tokens=False) - - def tok_decode(self, tokens): - return self.tokenizer.decode(tokens, skip_special_tokens=True) - - def _model_generate(self, context, max_length, eos_token_id): - raise NotImplementedError() - - def find_bucket(self, length): - return [b for b in self.buckets if b >= length][0] - - def _model_call(self, inputs): - seq_length = inputs.shape[-1] - padding_length = 0 - bucket_length = self.find_bucket(seq_length) - padding_length = bucket_length - seq_length - inputs = F.pad(inputs, (0, padding_length), value=self.model.config.pad_token_id) - logits = self.model(inputs.to(self._device))["logits"].cpu() - - if padding_length > 0: - logits = logits[:, :-padding_length, :] - logits = logits.to(torch.float32) - return logits - - lm_tasks = lm_eval.tasks.get_task_dict(args.tasks) - options = None - lm = HabanaModelAdapter(tokenizer, user_model, args, options) - - eval_start = time.perf_counter() - if args.approach == "cast": - from neural_compressor.torch.amp import autocast - if args.precision == "fp8_e4m3": - dtype = torch.float8_e4m3fn - elif args.precision == "fp8_e5m2": - dtype = torch.float8_e5m2 - elif args.precision == "fp16": - dtype = torch.float16 - elif args.precision == "bf16": - dtype = torch.bfloat16 - with autocast('hpu', dtype=dtype): - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit) - else: - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit) - print(lm_eval.evaluator.make_table(results)) - eval_end = time.perf_counter() - print("Duration:", eval_end - eval_start) - results['args'] = vars(args) - results['duration'] = eval_end - eval_start - - # make sure that result is dumped only once during multi-cards evaluation - local_rank = int(os.getenv('LOCAL_RANK', '-1')) - if local_rank in [-1, 0]: - dumped = json.dumps(results, indent=2) - accu_dict = {} - case_name = str(args.approach) + "-" + args.precision - for task_name in args.tasks: - if task_name == "wikitext": - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["word_perplexity"]), flush=True) - accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["word_perplexity"]] 
- else: - print("Accuracy for %s is: %s" % (task_name, results["results"][task_name]["acc"]), flush=True) - accu_dict[task_name] = [args.model, case_name, results["results"][task_name]["acc"]] - accu_dict["duration"] = [args.model, case_name, results["duration"]] - if args.dump_to_excel: - save_to_excel(accu_dict) - return results["results"][task_name]["acc"] diff --git a/examples/fp8_sample/README.md b/examples/fp8_sample/README.md new file mode 100644 index 00000000000..b758768ef0f --- /dev/null +++ b/examples/fp8_sample/README.md @@ -0,0 +1,96 @@ +### Usage demo: + +#### two steps to get quantized model + +```diff +import torch ++ from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration +import habana_frameworks.torch.core as htcore + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + +model = M().eval() + ++ config = FP8Config.from_json_file(args.quant_config) # args.quant_config is the path of json file + ++ if config.measure: ++ model = prepare(model, config) + ++ if config.quantize: ++ htcore.hpu_initialize() ++ model = convert(model, config) + +# user code run +with torch.no_grad(): + model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + print(output) + ++ if config.measure: ++ finalize_calibration(model) +``` + + +Whole script and config refer to [sample_two_steps.py](./sample_two_steps.py), [maxabs_measure.json](./maxabs_measure.json) and [maxabs_quant.json](./maxabs_quant.json). + +First, measure the tensor quantization statistic: +```shell +python sample_two_steps.py --quant_config=maxabs_measure.json +``` + +Then quantize the model based on previous measurements: +```shell +python sample_two_steps.py --quant_config=maxabs_quant.json +``` + +#### one step to get quantized model + +```diff +import torch ++ from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration +import habana_frameworks.torch.core as htcore + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + +model = M().to("hpu") + ++ config = FP8Config.from_json_file(args.quant_config) # args.quant_config is the path of json file ++ model = prepare(model, config) + +# user code run to do calibration +with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) + ++ finalize_calibration(model) ++ model = convert(model) + +# user code to run benchmark for quantized model +with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) +``` + +Whole script and config refer to [sample_one_step.py](./sample_one_step.py). 
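
The toy module above keeps the sample minimal. As a purely illustrative extension (not part of the shipped sample), the same prepare → calibrate → finalize → convert sequence can be applied to a Hugging Face causal LM; the model name, calibration prompts, and config path below are placeholder assumptions taken from elsewhere in this PR:

```python
# Illustrative sketch only: one-step FP8 flow on a Hugging Face causal LM.
import torch
import habana_frameworks.torch.core as htcore
htcore.hpu_set_env()

from transformers import AutoModelForCausalLM, AutoTokenizer
from neural_compressor.torch.quantization import FP8Config, convert, prepare, finalize_calibration

model_name = "facebook/opt-125m"  # assumption: any HPU-supported causal LM
model = AutoModelForCausalLM.from_pretrained(model_name).eval().to("hpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)
htcore.hpu_initialize()

config = FP8Config.from_json_file("quant_config.json")  # AUTO-mode config from this folder
model = prepare(model, config)

# calibration: a few short prompts stand in for a real calibration set
calib_prompts = ["Intel is a company which", "Here is my prompt"]
with torch.no_grad():
    for prompt in calib_prompts:
        inputs = tokenizer(prompt, return_tensors="pt").to("hpu")
        model(**inputs)

finalize_calibration(model)
model = convert(model)

# quick sanity check of the quantized model
with torch.no_grad():
    inputs = tokenizer("Intel is a company which", return_tensors="pt").to("hpu")
    out = model.generate(**inputs, max_new_tokens=10)
    print(tokenizer.batch_decode(out, skip_special_tokens=True))
```

To run the shipped sample itself: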
+ +```shell +python sample_one_step.py --quant_config=quant_config.json +``` diff --git a/examples/fp8_sample/maxabs_measure.json b/examples/fp8_sample/maxabs_measure.json new file mode 100644 index 00000000000..8d55f33e57a --- /dev/null +++ b/examples/fp8_sample/maxabs_measure.json @@ -0,0 +1,7 @@ +{ + "mode": "MEASURE", + "observer": "maxabs", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/maxabs_quant.json b/examples/fp8_sample/maxabs_quant.json new file mode 100644 index 00000000000..d1f76f8f630 --- /dev/null +++ b/examples/fp8_sample/maxabs_quant.json @@ -0,0 +1,8 @@ +{ + "mode": "QUANTIZE", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/quant_config.json b/examples/fp8_sample/quant_config.json new file mode 100644 index 00000000000..c139d13bbea --- /dev/null +++ b/examples/fp8_sample/quant_config.json @@ -0,0 +1,8 @@ +{ + "mode": "AUTO", + "observer": "maxabs", + "scale_method": "maxabs_hw", + "allowlist": {"types": [], "names": []}, + "blocklist": {"types": [], "names": []}, + "dump_stats_path": "./hqt_output/measure" +} diff --git a/examples/fp8_sample/sample_one_step.py b/examples/fp8_sample/sample_one_step.py new file mode 100644 index 00000000000..18eb7bfba4c --- /dev/null +++ b/examples/fp8_sample/sample_one_step.py @@ -0,0 +1,57 @@ +import argparse +import torch +import habana_frameworks.torch.core as htcore +htcore.hpu_set_env() + +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + +torch.manual_seed(1) + + +# 1. python sample_one_step.py --quant_config=quant_config.json + + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + + +def eval_func(model): + # user's eval func + input = torch.randn(1, 10) + model(input.to("hpu")) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Habana FP8 sample code.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--quant_config", type=str, help="json file of quantization config") + args = parser.parse_args() + + model = M().eval().to("hpu") + htcore.hpu_initialize() + + config = FP8Config.from_json_file(args.quant_config) + model = prepare(model, config) + + # for calibration + with torch.no_grad(): + # model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + + finalize_calibration(model) + model = convert(model) + print(model) + + # for benchmark + with torch.no_grad(): + output = model(torch.randn(1, 10).to("hpu")) + print(output) diff --git a/examples/fp8_sample/sample_two_steps.py b/examples/fp8_sample/sample_two_steps.py new file mode 100644 index 00000000000..9e17748b9b0 --- /dev/null +++ b/examples/fp8_sample/sample_two_steps.py @@ -0,0 +1,50 @@ +import argparse +import torch +import habana_frameworks.torch.core as htcore +htcore.hpu_set_env() + +from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare + +torch.manual_seed(1) + +# 1. python sample_two_steps.py --quant_config=maxabs_measure.json +# 2. 
python sample_two_steps.py --quant_config=maxabs_quant.json + + +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5) + self.fc2 = torch.nn.Linear(5, 10) + + def forward(self, inp): + x1 = self.fc1(inp) + x2 = self.fc2(x1) + return x2 + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Habana FP8 sample code.", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--quant_config", type=str, help="json file of quantization config") + args = parser.parse_args() + + model = M().eval() + config = FP8Config.from_json_file(args.quant_config) + + if config.measure: + model = prepare(model, config) + + if config.quantize: + htcore.hpu_initialize() + model = convert(model, config) + print(model) + + with torch.no_grad(): + model.to("hpu") + output = model(torch.randn(1, 10).to("hpu")) + print(output) + + if config.measure: + finalize_calibration(model) diff --git a/neural_compressor/torch/algorithms/habana_fp8/__init__.py b/neural_compressor/torch/algorithms/fp8_quant/__init__.py similarity index 70% rename from neural_compressor/torch/algorithms/habana_fp8/__init__.py rename to neural_compressor/torch/algorithms/fp8_quant/__init__.py index fe3a05d7d0b..d16760b5e81 100644 --- a/neural_compressor/torch/algorithms/habana_fp8/__init__.py +++ b/neural_compressor/torch/algorithms/fp8_quant/__init__.py @@ -12,5 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .fp8_quant import quantize_dynamic, quantize, white_list -from .save_load import save, load +from neural_compressor.torch.algorithms.fp8_quant.common import ( + update_mode, + save_calib_result, + restore_patched_module, + with_patched_module, +) +from neural_compressor.torch.algorithms.fp8_quant.fp8_quant import FP8Quantizer diff --git a/neural_compressor/torch/algorithms/fp8_quant/common.py b/neural_compressor/torch/algorithms/fp8_quant/common.py new file mode 100644 index 00000000000..b038a367a78 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/common.py @@ -0,0 +1,98 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
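
Note that the rename from `habana_fp8` to `fp8_quant` also changes the public import surface. Based on the `__init__.py` above, downstream code now pulls the helpers and the quantizer from the new package path (the old `quantize_dynamic`/`quantize`/`save`/`load` entry points are removed):

```python
# Import surface of the renamed package, per the __init__.py shown above.
from neural_compressor.torch.algorithms.fp8_quant import (
    FP8Quantizer,
    restore_patched_module,
    save_calib_result,
    update_mode,
    with_patched_module,
)
```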
+ +import json +import os +import tempfile +from collections import namedtuple +from pathlib import Path +from typing import Union + +import torch + + +def save_calib_result(model): + import habana_quantization_toolkit as hqt + hqt.finish_measurements(model) + + +def update_mode(config_path, measure_step=False, quant_step=False): + with open(config_path, 'r') as file: + config = json.load(file) + + if (measure_step and config.get("mode") == "MEASURE") or (quant_step and config.get("mode") == "QUANTIZE"): + return config_path + else: + if measure_step: + config["mode"] = "MEASURE" + if quant_step: + config["mode"] = "QUANTIZE" + + temp_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False) + temp_file_path = temp_file.name + + with open(temp_file_path, 'w') as temp_file: + json.dump(config, temp_file) + + return temp_file_path + + +def generate_model_info(model): + mod_inst_info = namedtuple("ModInstInfo", ["name", "parent"]) + parent_child_mod_dict = {} + + def create_mod_info_recursion(parent): + for name, mod in parent.named_children(): + parent_child_mod_dict[mod] = mod_inst_info(name=name, parent=parent) + create_mod_info_recursion(mod) + + create_mod_info_recursion(model) + return parent_child_mod_dict + + +def get_patched_mod_list(): + from habana_quantization_toolkit._core.common import mod_default_dict + + patched_mod_list = [] + for patched_mod in mod_default_dict.values(): + patched_mod_list.append(patched_mod.patched_module.__name__) + return patched_mod_list + + +def restore_patched_module(patched_model): + from neural_compressor.torch.algorithms.fp8_quant.helper_modules import helper_mods + patched_mod_list = get_patched_mod_list() + + parent_child_mod_dict = generate_model_info(patched_model) + with torch.no_grad(): + for name, patched_mod in patched_model.named_modules(): + patched_mod_type_str = patched_mod.__class__.__name__ + if patched_mod_type_str in patched_mod_list: + parent = parent_child_mod_dict[patched_mod].parent + name = parent_child_mod_dict[patched_mod].name + class_name_org = getattr(patched_mod, "class_name_org", None) or \ + patched_mod.__class__.__name__.split("Patched")[-1] + origin_mod = helper_mods[class_name_org](patched_mod) + origin_mod.forward = patched_mod.forward_orig + setattr(parent, name, origin_mod) + + +def with_patched_module(model): + patched_mod_list = get_patched_mod_list() + + for name, mod in model.named_modules(): + mod_type = mod.__class__.__name__ + if mod_type in patched_mod_list: + return True + return False diff --git a/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py new file mode 100644 index 00000000000..f9ce9145569 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/fp8_quant.py @@ -0,0 +1,61 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
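
The `FP8Quantizer` defined next is driven by these helpers. As a quick, hedged illustration of `update_mode`: given one of the JSON configs from the example folder above (here `maxabs_quant.json`, which stores `"mode": "QUANTIZE"`), requesting the measurement step yields a temporary copy with the mode rewritten, while a matching request returns the original path unchanged:

```python
# Sketch of update_mode behavior; assumes maxabs_quant.json from the example folder is on disk.
import json
from neural_compressor.torch.algorithms.fp8_quant import update_mode

# Stored mode is QUANTIZE, so asking for the measurement step rewrites it into a temp file.
measure_path = update_mode("maxabs_quant.json", measure_step=True)
with open(measure_path) as f:
    print(json.load(f)["mode"])  # -> "MEASURE"

# Stored mode already matches the quantize step, so the original path comes back untouched.
quant_path = update_mode("maxabs_quant.json", quant_step=True)
print(quant_path)  # -> "maxabs_quant.json"
```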
+ +import os + +from neural_compressor.common.utils import FP8_QUANT +from neural_compressor.torch.algorithms import Quantizer +from neural_compressor.torch.algorithms.fp8_quant import ( + restore_patched_module, + update_mode, + with_patched_module, +) + + +class FP8Quantizer(Quantizer): + def __init__(self, quant_config): + super().__init__(quant_config) + if isinstance(quant_config, dict): + json_file = [cfg.json_file for cfg in quant_config.values()] + assert len(json_file) > 0, "Cannot get json file from config." + self.quant_config = json_file[0] + + def prepare(self, model): + _prepare(model, self.quant_config) + return model + + def convert(self, model): + if with_patched_module(model): + # for INC flow, it calls `prepare` and then `convert` user-facing API in one run + restore_patched_module(model) + _convert(model, self.quant_config) + return model + + +def _convert(model, config_path): + import habana_quantization_toolkit as hqt + + # update mode to QUANTIZE + config_path = update_mode(config_path, quant_step=True) + + return hqt.prep_model(model, config_path) + + +def _prepare(model, config_path): + import habana_quantization_toolkit as hqt + + # update mode to MEASURE + config_path = update_mode(config_path, measure_step=True) + + return hqt.prep_model(model, config_path) diff --git a/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py b/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py new file mode 100644 index 00000000000..6c7154328d7 --- /dev/null +++ b/neural_compressor/torch/algorithms/fp8_quant/helper_modules.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
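
For completeness, a minimal sketch of exercising `FP8Quantizer` directly rather than through the `prepare`/`convert` entry points shown in the README. It assumes an HPU runtime, the `maxabs_measure.json` config from the example folder, and that the base `Quantizer` stores a non-dict config as-is; `save_calib_result` stands in here for the `finalize_calibration` call used in the README flow:

```python
import torch
from neural_compressor.torch.algorithms.fp8_quant import FP8Quantizer, save_calib_result

# toy model; any HPU-supported module works
model = torch.nn.Sequential(torch.nn.Linear(10, 5), torch.nn.Linear(5, 10)).eval().to("hpu")

quantizer = FP8Quantizer("maxabs_measure.json")  # plain path: the dict branch in __init__ is skipped
model = quantizer.prepare(model)                 # patches modules in MEASURE mode

with torch.no_grad():                            # calibration pass
    model(torch.randn(1, 10).to("hpu"))

save_calib_result(model)                         # dump measurements (the README flow does this via finalize_calibration)
model = quantizer.convert(model)                 # restores patched modules, then re-patches in QUANTIZE mode
```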
+ +import torch + +# For mapping revert patched module to origin module + +helper_mods = {} + +def helper_mod_register(name): + def decorator(mod): + helper_mods[name] = mod + return mod + return decorator + +@helper_mod_register(name="Matmul") +class Matmul(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="Linear") +class Linear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="FalconLinear") +class FalconLinear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="KVCache") +class KVCache(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.allocate = patched_mod.org_allocate + self.get_shape = patched_mod.get_shape + self.forward = patched_mod.forward + self.update = patched_mod.update + +@helper_mod_register(name="Conv2d") +class Conv2d(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LoRACompatibleLinear") +class LoRACompatibleLinear(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LoRACompatibleConv") +class LoRACompatibleConv(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="Softmax") +class Softmax(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LinearLayer") +class LinearLayer(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LinearAllreduce") +class LinearAllreduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="ScopedLinearAllReduce") +class ScopedLinearAllReduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="LmHeadLinearAllreduce") +class LmHeadLinearAllreduce(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org + +@helper_mod_register(name="ModuleFusedSDPA") +class ModuleFusedSDPA(torch.nn.Module): + def __init__(self, patched_mod, *args, **kwargs): + super().__init__() + self.__dict__.update(patched_mod.__dict__) + self.extra_repr = patched_mod.extra_repr_org diff --git 
a/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py b/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py deleted file mode 100644 index 0330bd475ad..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/fp8_quant.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import copy -import os - -import habana_frameworks.torch.core as htcore -import torch -from deepspeed.module_inject import LinearAllreduce, LinearLayer -from deepspeed.module_inject.layers import LmHeadLinearAllreduce -from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const - -from neural_compressor.torch.utils import fetch_module, logger, set_module - -from .modules import ( # fp32; dynamic modules; static modules; dtype amax - Autocast, - BatchMatmul, - FP8BatchMatmul, - FP8Cast, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - FP8Linear, - FP8LinearAllreduce, - FP8LinearLayer, - FP8LmHeadLinearAllreduce, - FP8Matmul, - Matmul, -) -from .observer import observer_mapping - -quantization_mapping = { - LinearAllreduce: FP8LinearAllreduce, - LinearLayer: FP8LinearLayer, - LmHeadLinearAllreduce: FP8LmHeadLinearAllreduce, - torch.nn.Linear: FP8Linear, - BatchMatmul: FP8BatchMatmul, - Matmul: FP8Matmul, - Autocast: FP8Cast, - # torch.matmul: fp8_matmul -} -white_list = tuple(quantization_mapping.keys()) - - -FP8_DTYPE = [torch.float8_e5m2, torch.float8_e4m3fn, "fp8_e5m2", "fp8_e4m3"] -dtype_mapping = {"fp8_e5m2": torch.float8_e5m2, "fp8_e4m3": torch.float8_e4m3fn} -# enable inference optimizations -htcore.hpu_initialize() - - -def _replace_module(module, qconfig): - assert qconfig.w_dtype == qconfig.act_dtype, "weight and activation should be the same dtype." 
- dtype = dtype_mapping[qconfig.w_dtype] - # only modules that have weight should use this observer - if hasattr(module, "weight"): - observer_cls = observer_mapping[qconfig.w_observer] - observer_obj = observer_cls(dtype=dtype) - if qconfig.approach == "static": - if isinstance(module, white_list): - QModule = quantization_mapping[type(module)] - qmodule = QModule(module, dtype) - elif qconfig.approach == "dynamic": - if isinstance(module, torch.nn.Linear): - # need module for initialization - qmodule = FP8DynamicLinear(module, dtype) - elif isinstance(module, Matmul): - qmodule = FP8DynamicMatmul(dtype) - elif isinstance(module, BatchMatmul): - qmodule = FP8DynamicBatchMatmul(dtype) - elif isinstance(module, Autocast): - qmodule = FP8Cast(dtype=dtype) - # only modules that have weight should use this API - if hasattr(qmodule, "from_float"): - qmodule.from_float(module, observer_obj) - return qmodule - - -def quantize_dynamic(model, dtype=torch.float8_e4m3fn, inplace=True): - torch.set_grad_enabled(False) - q_model = model if inplace else copy.deepcopy(model) - if isinstance(dtype, str): - dtype = dtype_mapping[dtype] - for n, m in q_model.named_modules(): - if isinstance(m, torch.nn.Linear): - observer_cls = observer_mapping["minmax_per_channel"] - observer_obj = observer_cls(dtype=dtype) - new_m = FP8DynamicLinear(m, dtype) # need m for init - new_m.from_float(m, observer_obj) - set_module(q_model, n, new_m) - elif isinstance(m, Matmul): - new_m = FP8DynamicMatmul(dtype) - set_module(q_model, n, new_m) - elif isinstance(m, BatchMatmul): - new_m = FP8DynamicBatchMatmul(dtype) - set_module(q_model, n, new_m) - elif isinstance(m, Autocast): - new_m = FP8Cast(dtype=dtype) - set_module(q_model, n, new_m) - htcore.mark_step() - _mark_params_as_const(q_model) - _check_params_as_const(q_model) - return q_model - - -def _add_observer(module, qconfig): - act_observer = qconfig.act_observer - - def input_observer_forward_pre_hook(self, input): - try: - if isinstance(input[0], torch.Tensor): - self.input_activation_post_process(input[0]) - if hasattr(self, "input_activation_post_process1") and isinstance(input[1], torch.Tensor): - self.input_activation_post_process1(input[1]) - return input - except Exception as e: - # The KL act_observer may encounter a overflow error on EltwiseAdd. 
- pass - - ### Insert input observer into model, only for fp8_e4m3 static quantization ### - observer_cls = observer_mapping[act_observer] - # import pdb;pdb.set_trace() - - if isinstance(module, white_list): - observer_obj = observer_cls(dtype=dtype_mapping[qconfig.act_dtype]) - module.add_module("input_activation_post_process", observer_obj) - if isinstance(module, (BatchMatmul, Matmul)): - observer_obj = observer_cls(dtype=dtype_mapping[qconfig.act_dtype]) - module.add_module("input_activation_post_process1", observer_obj) - module.register_forward_pre_hook(input_observer_forward_pre_hook) - - -def _remove_observer(module): - import deepspeed.comm as dist - from torch.distributed import ReduceOp - - if hasattr(module, "input_activation_post_process"): - scale = module.input_activation_post_process.calculate_qparams() - if dist.is_initialized(): - scale = scale.to("hpu") - dist.all_reduce(scale, op=ReduceOp.MAX) - if hasattr(module, "input_activation_post_process1"): - module.register_parameter("scale1", torch.nn.Parameter(scale)) - else: - module.register_parameter("scale", torch.nn.Parameter(scale)) - delattr(module, "input_activation_post_process") - if hasattr(module, "input_activation_post_process1"): - scale = module.input_activation_post_process1.calculate_qparams() - if dist.is_initialized(): - scale = scale.to("hpu") - dist.all_reduce(scale, op=ReduceOp.MAX) - module.register_parameter("scale2", torch.nn.Parameter(scale)) - delattr(module, "input_activation_post_process1") - - # remove observer hooks - hook_map = module._forward_pre_hooks - handle_ids_to_remove = set() - for handle_id, hook_fn in hook_map.items(): - if hasattr(hook_fn, "__name__") and hook_fn.__name__ == "input_observer_forward_pre_hook": - handle_ids_to_remove.add(handle_id) - for handle_id in handle_ids_to_remove: - hook_map.pop(handle_id) - - -def prepare(model, qconfig_mapping): - model.qconfig = qconfig_mapping - for (op_name, op_type), qconfig in qconfig_mapping.items(): - if qconfig.approach == "dynamic": - continue - if qconfig.w_dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - if module is None: - logger.info(f"{op_name} is not found in model.") - continue - _add_observer(module, qconfig) - set_module(model, op_name, module) - return model - - -def convert(model): - for (op_name, op_type), qconfig in model.qconfig.items(): - if qconfig.w_dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - if module is None: - logger.info(f"{op_name} is not found in model.") - continue - if qconfig.approach != "dynamic": - _remove_observer(module) - module = _replace_module(module, qconfig) - set_module(model, op_name, module) - htcore.mark_step() - return model - - -def quantize(model, qconfig_mapping, run_fn=None, run_args=None, inplace=True): - torch.set_grad_enabled(False) - q_model = model if inplace else copy.deepcopy(model) - q_model = prepare(q_model, qconfig_mapping) - if run_fn is not None: - if run_args is not None: - run_fn(q_model, *run_args) - else: - run_fn(q_model) - q_model = convert(q_model) - _mark_params_as_const(q_model) - _check_params_as_const(q_model) - return q_model diff --git a/neural_compressor/torch/algorithms/habana_fp8/modules.py b/neural_compressor/torch/algorithms/habana_fp8/modules.py deleted file mode 100644 index 99b9faf1f72..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/modules.py +++ /dev/null @@ -1,487 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import os - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpex -import torch -import torch.nn as nn -from torch.nn import functional as F - -from neural_compressor.common import logger - -from .observer import calculate_qparams - - -##################### FP32 modules ####################### -class Matmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class BatchMatmul(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x, y): - return torch.bmm(x, y) - - -class Autocast(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x): - return x - - -##################### FP8 modules ####################### -class FP8DynamicLinear(torch.nn.Module): - def __init__(self, org_module, dtype=torch.float8_e4m3fn) -> None: - super().__init__() - # attributes - self.use_amax = True - self.dtype = dtype - self.in_features = org_module.in_features - self.out_features = org_module.out_features - self.weight_dtype = self.dtype - self.out_dtype = org_module.weight.dtype - # register weight, bias - self.register_buffer( - "weight", - torch.empty( - self.in_features, - self.out_features, - device="hpu", - dtype=self.weight_dtype, - ), - ) - if org_module.bias is not None: - self.register_buffer( - "bias", - torch.empty( - self.out_features, - device="hpu", - dtype=self.out_dtype, - ), - ) - else: - self.bias = None - - def from_float(self, org_module, w_observer): - # register scale - if not org_module.weight.device.type == "meta": - w_observer(org_module.weight) - weight_scale = w_observer.calculate_qparams() - else: - weight_scale = torch.tensor([1.0]) - self.register_buffer( - "weight_scale", - torch.tensor( - weight_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "weight_scale_inv", - torch.tensor( - torch.reciprocal(weight_scale), - device="hpu", - dtype=torch.float32, - ), - ) - # copy weight and bias - if not org_module.weight.device.type == "meta": - org_module.to("hpu") - self.weight.data.copy_( - torch.ops.hpu.cast_to_fp8_v2(org_module.weight.T, self.weight_scale_inv, False, False, self.dtype)[0] - ) - if org_module.bias is not None: - self.bias.data.copy_(org_module.bias.data.type(self.out_dtype)) - - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - org_middle_shape = inp.shape[1:-1] - inp = inp.view(-1, self.in_features) - if inp.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - if self.use_amax: - input_scale = calculate_qparams(inp.min(), inp.max(), self.dtype) - input_scale_inv = torch.reciprocal(input_scale) - else: - input_scale, input_scale_inv = None, None - inp = torch.ops.hpu.cast_to_fp8_v2(inp, input_scale_inv, False, False, self.dtype)[0] - else: - input_scale, input_scale_inv = None, None - out = torch.ops.hpu.fp8_gemm_v2( - inp, - False, - self.weight, - False, - None, - self.out_dtype, - input_scale, # inv is used for recover scale - 
self.weight_scale, - self.bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - def extra_repr(self) -> str: - return "in_features={}, out_features={}, bias={}, format={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.dtype, - ) - - -class FP8DynamicMatmul(torch.nn.Module): - def __init__(self, dtype) -> None: - super().__init__() - self.dtype = dtype - self.use_amax = True - self.out_dtype = torch.float32 - - def forward(self, input1, input2): - dim1 = input1.shape[-1] - dim2 = input2.shape[-2] - assert dim1 == dim2, "GEMM not possible" - - # process input1 - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input1.dtype - if self.use_amax: - input1_scale = calculate_qparams(input1.min(), input1.max(), self.dtype) - input1_scale_inv = torch.reciprocal(input1_scale) - else: - input1_scale, input1_scale_inv = None, None - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, input1_scale_inv, False, False, self.dtype)[0] - else: - # skip cast for input1 - input1_scale, input1_scale_inv = None, None - # process input2 - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input2.dtype - if self.use_amax: - input2_scale = calculate_qparams(input2.min(), input2.max(), self.dtype) - input2_scale_inv = torch.reciprocal(input2_scale) - else: - input2_scale, input2_scale_inv = None, None - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, input2_scale_inv, False, False, self.dtype)[0] - else: - # skip cast for input2 - input2_scale, input2_scale_inv = None, None - # calculate - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - self.out_dtype, - input1_scale, # inv is used for recover scale - input2_scale, - None, - False, - ) - return out - - def extra_repr(self) -> str: - return "format={}".format(self.dtype) - - -class FP8DynamicBatchMatmul(FP8DynamicMatmul): - pass - - -class FP8Linear(torch.nn.Module): - def __init__(self, org_module, dtype) -> None: - super().__init__() - # attributes - self.in_features = org_module.in_features - self.out_features = org_module.out_features - self.dtype = dtype - self.weight_dtype = self.dtype - self.out_dtype = org_module.weight.dtype - self.register_buffer( - "weight", - torch.empty( - self.in_features, - self.out_features, - device="hpu", - dtype=self.weight_dtype, - ), - ) - if org_module.bias is not None: - self.register_buffer( - "bias", - torch.empty( - self.out_features, - device="hpu", - dtype=self.out_dtype, - ), - ) - else: - self.bias = None - - def from_float(self, org_module, w_observer): - # register scale - if not org_module.weight.device.type == "meta": - w_observer(org_module.weight) - weight_scale = w_observer.calculate_qparams() - else: - weight_scale = torch.tensor([1.0]) - self.register_buffer( - "weight_scale", - torch.tensor( - weight_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "weight_scale_inv", - torch.tensor( - torch.reciprocal(weight_scale), - device="hpu", - dtype=torch.float32, - ), - ) - # copy weight and bias - if not org_module.weight.device.type == "meta": - org_module.to("hpu") - self.weight.data.copy_( - torch.ops.hpu.cast_to_fp8_v2(org_module.weight.T, self.weight_scale_inv, False, False, self.dtype)[0] - ) - if org_module.bias is not None: - self.bias.data.copy_(org_module.bias.data.type(self.out_dtype)) - # register input scale - input_scale = org_module.scale if hasattr(org_module, "scale") else torch.tensor([1.0]) - 
self.register_buffer( - "input_scale", - torch.tensor( - input_scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.register_buffer( - "input_scale_inv", - torch.tensor( - torch.reciprocal(input_scale), - device="hpu", - dtype=torch.float32, - ), - ) - - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - org_middle_shape = inp.shape[1:-1] - inp = inp.view(-1, self.in_features) - inp = torch.ops.hpu.cast_to_fp8_v2(inp, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inp, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, # inv is used for recover scale - self.weight_scale, - self.bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - def extra_repr(self) -> str: - return "in_features={}, out_features={}, bias={}, scale={}, format={}".format( - self.in_features, - self.out_features, - self.bias is not None, - self.input_scale.tolist() if hasattr(self, "input_scale") else None, - self.dtype, - ) - - -class FP8Matmul(torch.nn.Module): - def __init__(self, org_module, dtype) -> None: - super().__init__() - org_module.to("hpu") - self.dtype = dtype - self.out_dtype = torch.float32 - scale1 = org_module.scale1 if hasattr(org_module, "scale1") else 1.0 - scale2 = org_module.scale2 if hasattr(org_module, "scale2") else 1.0 - self.register_buffer( - "scale1", - torch.tensor( - scale1, - device="hpu", - dtype=self.out_dtype, - ), - ) - self.register_buffer( - "scale2", - torch.tensor( - scale2, - device="hpu", - dtype=self.out_dtype, - ), - ) - - def forward(self, input1, input2): - dim1 = input1.shape[-1] - dim2 = input2.shape[-2] - assert dim1 == dim2, "GEMM not possible" - - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input1.dtype - self.scale1_inv = torch.reciprocal(self.scale1) - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, self.scale1_inv, False, False, self.dtype)[0] - else: - self.scale1_inv = None - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - self.out_dtype = input2.dtype - self.scale2_inv = torch.reciprocal(self.scale2) - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, self.scale2_inv, False, False, self.dtype)[0] - else: - self.scale2_inv = None - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - self.out_dtype, - self.scale1, # inv is used for recover scale - self.scale2, - None, - False, - ) - return out - - def extra_repr(self) -> str: - return "scales={}, format={}".format( - (self.scale1.tolist(), self.scale2.tolist()), - self.dtype, - ) - - -class FP8BatchMatmul(FP8Matmul): - pass - - -class FP8Cast(torch.nn.Module): - def __init__(self, org_module=None, dtype=torch.float8_e4m3fn) -> None: - super().__init__() - self.dtype = dtype - if org_module is not None: - org_module.to("hpu") - scale = org_module.scale if hasattr(org_module, "scale") else 1.0 - self.register_buffer( - "scale", - torch.tensor( - scale, - device="hpu", - dtype=torch.float32, - ), - ) - self.scale, self.scale_inv = None, None # due to next matmul doesn't know this scale - else: - self.scale, self.scale_inv = None, None - - def forward(self, input): - if input.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out = torch.ops.hpu.cast_to_fp8_v2(input, self.scale_inv, False, False, self.dtype)[0] - else: - out = input - return out - - def extra_repr(self) -> str: - return "scales={}, format={}".format( - self.scale, - self.dtype, - ) - - -FP8LinearLayer = FP8Linear - - 
-class FP8LinearAllreduce(FP8Linear): - def forward(self, inp): - assert inp.shape[-1] == self.in_features, "GEMM not possible" - inputmat = inp.view(-1, self.in_features) - inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inputmat, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, - self.weight_scale, - None, - False, - ) - from deepspeed import comm as dist - - if self.mp_group is not None: - dist.inference_all_reduce(out, group=self.mp_group) - if self.bias is not None: - out += self.bias - return out.view(-1, *inp.shape[1:-1], out.shape[-1]) - - -class FP8LmHeadLinearAllreduce(FP8Linear): - def forward(self, inp): - # from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list - # input_shard_size = get_shard_size(inp.shape[-1], self.world_size) - # input_shard_offset = sum(get_shard_size_list(inp.shape[-1], self.world_size)[0:self.rank]) - - # inputmat = inp[:, :, input_shard_offset:input_shard_offset + input_shard_size] - assert ( - inp.shape[-1] % self.world_size == 0 - ), "Please ensure that self.world_size is divisible by input.shape[-1]" - input_shard = inp.shape[-1] // self.world_size - inp_part = inp[:, :, self.rank * input_shard : (self.rank + 1) * input_shard] - inputmat = inp_part.view(-1, input_shard) # dim=2 will help kernel speed - inputmat = torch.ops.hpu.cast_to_fp8_v2(inputmat, self.input_scale_inv, False, False, self.dtype)[0] - out = torch.ops.hpu.fp8_gemm_v2( - inputmat, - False, - self.weight, - False, - None, - self.out_dtype, - self.input_scale, - self.weight_scale, - None, - False, - ) - from deepspeed import comm as dist - - if self.mp_group is not None: - dist.inference_all_reduce(out, group=self.mp_group) - if self.bias is not None: - out += self.bias - return out.view(-1, *inp.shape[1:-1], out.shape[-1]) diff --git a/neural_compressor/torch/algorithms/habana_fp8/observer.py b/neural_compressor/torch/algorithms/habana_fp8/observer.py deleted file mode 100644 index fd29892ddb7..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/observer.py +++ /dev/null @@ -1,440 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# pylint:disable=import-error - -import os -from typing import Tuple - -import habana_frameworks.torch.core as htcore -import torch -from torch.ao.quantization.observer import * - -E4M3_AMAX = torch.tensor(240, dtype=torch.float).to("cpu") -E5M2_AMAX = torch.tensor(57344, dtype=torch.float).to("cpu") -USE_HW_SCALE = bool(os.getenv("USE_HW_SCALE", False)) -USE_POW2_SCALE = bool(os.getenv("USE_POW2_SCALE", False)) -observer_mapping = {} - - -def observer_registry(name): - def new_observer(observer_cls): - global observer_mapping - observer_mapping[name] = observer_cls - return observer_cls - - return new_observer - - -def _map_gaudi_scale(scale): - if USE_HW_SCALE: - scale_list = torch.tensor([16, 1, 1 / 16, 1 / 256]) - return torch.clip( - 2 ** (torch.ceil(torch.log2(scale) / 4) * 4), - torch.tensor(scale_list[-1], dtype=scale.dtype, device=scale.device), - torch.tensor(scale_list[0], dtype=scale.dtype, device=scale.device), - ) - elif USE_POW2_SCALE: - return 2 ** torch.ceil(torch.log2(scale)) - else: - return scale - - -def calculate_qparams(min_val, max_val, dtype): - amax = torch.max(torch.abs(min_val), torch.abs(max_val)) - dtype_amax = E4M3_AMAX if dtype == torch.float8_e4m3fn else E5M2_AMAX - scale = amax / dtype_amax - scale = scale.reshape(-1) - return _map_gaudi_scale(scale) - - -@observer_registry(name="minmax") -class FP8MinMaxObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) - self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) - - def forward(self, x_orig): - r"""Records the running minimum and maximum of ``x``.""" - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - x = x.to(self.min_val.dtype) - min_val_cur, max_val_cur = torch.aminmax(x) - min_val = torch.min(min_val_cur, self.min_val) - max_val = torch.max(max_val_cur, self.max_val) - self.min_val.copy_(min_val) - self.max_val.copy_(max_val) - return x_orig - - def calculate_qparams(self): - r"""Calculates the quantization parameters.""" - scale = calculate_qparams(self.min_val, self.max_val, self.dtype) - return scale - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" - - def reset_min_max_vals(self): - """Resets the min/max values.""" - self.min_val.copy_(torch.tensor(float("inf"))) - self.max_val.copy_(torch.tensor(float("-inf"))) - - -@observer_registry(name="minmax_per_channel") -class FP8PerChannelMinMaxObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - ch_axis=0, # weight_shape = (out_features, in_features) - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." 
- self.ch_axis = ch_axis - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("min_val", torch.tensor([], **factory_kwargs)) - self.register_buffer("max_val", torch.tensor([], **factory_kwargs)) - - def forward(self, x_orig): - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - min_val = self.min_val - max_val = self.max_val - x_dim = x.size() - - new_axis_list = [i for i in range(len(x_dim))] - new_axis_list[self.ch_axis] = 0 - new_axis_list[0] = self.ch_axis - y = x.permute(new_axis_list) - # Need to match dtype of min/max because the updates to buffers - # are done in place and types need to match for comparisons - y = y.to(self.min_val.dtype) - y = torch.flatten(y, start_dim=1) - if min_val.numel() == 0 or max_val.numel() == 0: - min_val, max_val = torch.aminmax(y, dim=1) - else: - min_val_cur, max_val_cur = torch.aminmax(y, dim=1) - min_val = torch.min(min_val_cur, min_val) - max_val = torch.max(max_val_cur, max_val) - self.min_val.resize_(min_val.shape) - self.max_val.resize_(max_val.shape) - self.min_val.copy_(min_val) - self.max_val.copy_(max_val) - return x_orig - - def calculate_qparams(self): - r"""Calculates the quantization parameters.""" - scale = calculate_qparams(self.min_val, self.max_val, self.dtype) - return scale - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" - - def reset_min_max_vals(self): - """Resets the min/max values.""" - self.min_val.copy_(torch.tensor(float("inf"))) - self.max_val.copy_(torch.tensor(float("-inf"))) - - -@observer_registry(name="kl") -class FP8HistogramObserver(ObserverBase): - def __init__( - self, - dtype: torch.dtype = torch.float8_e4m3fn, - bins: int = 2048, - upsample_rate: int = 128, - qscheme=torch.per_tensor_affine, - eps=torch.finfo(torch.float32).eps, - ) -> None: - # bins: The number of bins used for histogram calculation. - super().__init__(dtype=dtype) - assert isinstance(dtype, torch.dtype), "Please make sure the dtype of observer is torch.dtype." - self.bins = bins - factory_kwargs = {"device": "cpu", "dtype": torch.float32} - self.register_buffer("histogram", torch.zeros(self.bins, **factory_kwargs)) - self.register_buffer("min_val", torch.tensor(float("inf"), **factory_kwargs)) - self.register_buffer("max_val", torch.tensor(float("-inf"), **factory_kwargs)) - self.dst_nbins = 2 ** torch.finfo(self.dtype).bits - self.upsample_rate = upsample_rate - - def calculate_qparams(self, **kwargs): - new_min, new_max = self._non_linear_param_search() - scale = calculate_qparams(new_min, new_max, self.dtype) - return scale - - def _get_norm(self, delta_begin: torch.Tensor, delta_end: torch.Tensor, density: torch.Tensor) -> torch.Tensor: - r"""Compute the norm of the values uniformaly distributed between - delta_begin and delta_end. - Currently only L2 norm is supported. 
- - norm = density * (integral_{begin, end} x^2) - = density * (end^3 - begin^3) / 3 - """ - norm = (delta_end * delta_end * delta_end - delta_begin * delta_begin * delta_begin) / 3 - return density * norm - - def _get_dst_bin(self, src_bin_begin, src_bin_end, dst_bin_max): - # get dst bin value - FP8_amax = E4M3_AMAX if self.dtype == torch.float8_e4m3fn else E5M2_AMAX - scale = FP8_amax / dst_bin_max - if torch.isinf(torch.tensor(scale)): - scale = torch.tensor(3.4e38) - tmp = torch.ops.hpu.cast_to_fp8_v2(src_bin_begin.to("hpu"), scale.to("hpu"), False, False, self.dtype)[0] - dst_bin_begin = torch.ops.hpu.cast_from_fp8(tmp, None, torch.float32).to("cpu") - tmp = torch.ops.hpu.cast_to_fp8_v2(src_bin_end.to("hpu"), scale.to("hpu"), False, False, self.dtype)[0] - dst_bin_end = torch.ops.hpu.cast_from_fp8(tmp, None, torch.float32).to("cpu") - # get bin width of dst bin value, dst_bin_begin must contain 0 and the max qvalue. - dst_bin = list(set(dst_bin_begin.detach().cpu().numpy())) - dst_bin.sort() - width_dict = {} - bin_of_dst_dict = {} - for i, bin in enumerate(dst_bin): - bin_of_dst_dict[bin] = i - if bin == 0: - width_dict[bin] = {"left": 0, "right": dst_bin[i + 1]} - elif i == len(dst_bin) - 1: - width_dict[bin] = {"left": dst_bin[i] - dst_bin[i - 1], "right": dst_bin[i] - dst_bin[i - 1]} - else: - width_dict[bin] = {"left": dst_bin[i] - dst_bin[i - 1], "right": dst_bin[i + 1] - dst_bin[i]} - dst_bin_of_begin = [bin_of_dst_dict[float(i)] for i in dst_bin_begin] - dst_bin_of_end = [bin_of_dst_dict[float(i)] for i in dst_bin_end] - left_dst_bin_end_width = [width_dict[float(i)]["left"] for i in dst_bin_end] - right_dst_bin_begin_width = [width_dict[float(i)]["right"] for i in dst_bin_begin] - return ( - dst_bin_begin, - dst_bin_end, - torch.tensor(dst_bin_of_begin), - torch.tensor(dst_bin_of_end), - torch.tensor(left_dst_bin_end_width), - torch.tensor(right_dst_bin_begin_width), - ) - - def _compute_quantization_error(self, next_start_bin: int, next_end_bin: int): - r"""Compute the quantization error if we use start_bin to end_bin as the - min and max to do the quantization.""" - bin_width = (self.max_val.item() - self.min_val.item()) / self.bins - dst_bin_max = bin_width * (next_end_bin - next_start_bin + 1) - - src_bin = torch.arange(self.bins, device=self.histogram.device) - src_bin_begin = src_bin * bin_width - src_bin_end = src_bin_begin + bin_width - ( - dst_bin_begin, - dst_bin_end, - dst_bin_of_begin, - dst_bin_of_end, - left_dst_bin_end_width, - right_dst_bin_begin_width, - ) = self._get_dst_bin(src_bin_begin, src_bin_end, dst_bin_max) - - dst_bin_of_begin_center = dst_bin_begin + right_dst_bin_begin_width - dst_bin_of_end_center = dst_bin_end + left_dst_bin_end_width - - density = self.histogram / bin_width - - norm = torch.zeros(self.bins, device=self.histogram.device) - - delta_begin = src_bin_begin - dst_bin_of_begin_center - delta_end = right_dst_bin_begin_width - - norm += self._get_norm(delta_begin, delta_end, density) - - norm += (dst_bin_of_end - dst_bin_of_begin - 1) * self._get_norm( - torch.tensor(-left_dst_bin_end_width), torch.tensor(right_dst_bin_begin_width), density - ) - - delta_begin = -left_dst_bin_end_width - delta_end = src_bin_end - dst_bin_of_end_center - norm += self._get_norm(delta_begin, delta_end, density) - - return norm.sum().item() - - def _non_linear_param_search(self) -> Tuple[torch.Tensor, torch.Tensor]: - r"""Non-linear parameter search. - - An approximation for L2 error minimization for selecting min/max. 
- By selecting new min/max, we filter out outliers in input distribution. - This follows the implementation of NormMinimization::NonlinearQuantizationParamsSearch in - caffe2/quantization/server/norm_minimization.cc - """ - assert self.histogram.size()[0] == self.bins, "bins mismatch" - bin_width = (self.max_val - self.min_val) / self.bins - - # cumulative sum - total = torch.sum(self.histogram).item() - cSum = torch.cumsum(self.histogram, dim=0) - - stepsize = 1e-5 # granularity - alpha = 0.0 # lower bound - beta = 1.0 # upper bound - start_bin = 0 - end_bin = self.bins - 1 - norm_min = float("inf") - - while alpha < beta: - # Find the next step - next_alpha = alpha - next_beta = beta - stepsize - - # find the right bins between the quantile bounds - # keep the left bins at zero due to fp8 symmetry - l = 0 - r = end_bin - while r > start_bin and cSum[r] > next_beta * total: - r = r - 1 - - # decide the next move - next_start_bin = start_bin - next_end_bin = end_bin - if (l - start_bin) <= (end_bin - r): - # move the end bin - next_end_bin = r - beta = next_beta - - if next_start_bin == start_bin and next_end_bin == end_bin: - continue - - # calculate the quantization error using next_start_bin and next_end_bin - norm = self._compute_quantization_error(next_start_bin, next_end_bin) - - if norm > norm_min: - break - norm_min = norm - start_bin = next_start_bin - end_bin = next_end_bin - - new_min = self.min_val + bin_width * start_bin - new_max = self.min_val + bin_width * (end_bin + 1) - return new_min, new_max - - def _adjust_min_max( - self, combined_min: torch.Tensor, combined_max: torch.Tensor, upsample_rate: int - ) -> Tuple[torch.Tensor, torch.Tensor, int, int]: - # We ensure that: - # (combined_max - combined_min)/(downsample_rate*Nbins) = (max - min)/(upsample_rate*Nbins) - # This allows us to have a common grid of resolution s, where we can align - # the input histogram - # start_idx maps min_val to the histogram bin index. - - # Compute the width of histogram bins is a straightforward solution, where - # hist_bin_width = (self.max_val - self.min_val) / (self.bins * upsample_rate) - # Underflow happens if the numerator is close to the smallest positive subnormal number of FP32 - # Therefore, we avoid such division operation. - downsample_rate = int( - torch.ceil((combined_max - combined_min) * upsample_rate / (self.max_val - self.min_val)).item() - ) - e = downsample_rate * (self.max_val - self.min_val) / upsample_rate - (combined_max - combined_min) - start_idx = int( - torch.round( - (self.min_val - combined_min) * self.bins * upsample_rate / (self.max_val - self.min_val) - ).item() - ) - combined_max = combined_max + e - combined_min = combined_min - return combined_min, combined_max, downsample_rate, start_idx - - def _combine_histograms( - self, - orig_hist: torch.Tensor, - new_hist: torch.Tensor, - upsample_rate: int, - downsample_rate: int, - start_idx: int, - Nbins: int, - ) -> torch.Tensor: - # First up-sample the histogram with new data by a factor of L - # This creates an approximate probability density that's piecewise constant - upsampled_histogram = new_hist.repeat_interleave(upsample_rate) - # Now insert the upsampled histogram into the output - # histogram, which is initialized with zeros. 
- # The offset at which the histogram is introduced is determined - # by the start index as the output histogram can cover a wider range - histogram_with_output_range = torch.zeros((Nbins * downsample_rate), device=orig_hist.device) - histogram_with_output_range[start_idx : Nbins * upsample_rate + start_idx] = upsampled_histogram - # Compute integral histogram, double precision is needed to ensure - # that there are no overflows - integral_histogram = torch.cumsum(histogram_with_output_range, 0, dtype=torch.double)[ - downsample_rate - 1 :: downsample_rate - ] - # Finally perform interpolation - shifted_integral_histogram = torch.zeros((Nbins), device=orig_hist.device) - shifted_integral_histogram[1:Nbins] = integral_histogram[0:-1] - interpolated_histogram = (integral_histogram - shifted_integral_histogram) / upsample_rate - orig_hist = orig_hist + interpolated_histogram.to(torch.float) - return orig_hist - - def forward(self, x_orig: torch.Tensor) -> torch.Tensor: - if x_orig.numel() == 0: - return x_orig - x = x_orig.detach() - # use abs due to fp8 symmetry - x = torch.abs(x) - min_val = self.min_val - max_val = self.max_val - same_values = min_val.item() == max_val.item() - is_uninitialized = min_val == float("inf") and max_val == float("-inf") - if is_uninitialized or same_values: - min_val, max_val = torch.aminmax(x) - self.min_val.resize_(min_val.shape) - self.min_val.copy_(min_val) - self.max_val.resize_(max_val.shape) - self.max_val.copy_(max_val) - assert min_val.numel() == 1 and max_val.numel() == 1, "histogram min/max values must be scalar." - torch.histc(x, self.bins, min=int(min_val), max=int(max_val), out=self.histogram) - else: - new_min, new_max = torch.aminmax(x) - combined_min = torch.min(new_min, min_val) - combined_max = torch.max(new_max, max_val) - # combine the existing histogram and new histogram into 1 histogram - # We do this by first upsampling the histogram to a dense grid - # and then downsampling the histogram efficiently - ( - combined_min, - combined_max, - downsample_rate, - start_idx, - ) = self._adjust_min_max(combined_min, combined_max, self.upsample_rate) - assert combined_min.numel() == 1 and combined_max.numel() == 1, "histogram min/max values must be scalar." - combined_histogram = torch.histc(x, self.bins, min=int(combined_min), max=int(combined_max)) - if combined_min == min_val and combined_max == max_val: - combined_histogram += self.histogram - else: - combined_histogram = self._combine_histograms( - combined_histogram, - self.histogram, - self.upsample_rate, - downsample_rate, - start_idx, - self.bins, - ) - - self.histogram.detach_().resize_(combined_histogram.shape) - self.histogram.copy_(combined_histogram) - self.min_val.detach_().resize_(combined_min.shape) - self.min_val.copy_(combined_min) - self.max_val.detach_().resize_(combined_max.shape) - self.max_val.copy_(combined_max) - return x_orig - - def extra_repr(self): - return f"min_val={self.min_val}, max_val={self.max_val}" diff --git a/neural_compressor/torch/algorithms/habana_fp8/save_load.py b/neural_compressor/torch/algorithms/habana_fp8/save_load.py deleted file mode 100644 index 8079a130625..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/save_load.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
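The deleted `FP8HistogramObserver` above picks its clipping range with a norm-minimization search over histogram bins, measuring the error of each candidate range with the HPU `cast_to_fp8_v2` kernels. Below is a minimal, device-free sketch of the same idea under a simplifying assumption: uniform-quantization noise (`step**2 / 12`) stands in for the real FP8 cast, and `n_levels` is only a uniform-grid proxy for the FP8 format, so treat the numbers as illustrative.

```python
import numpy as np

def search_clip_amax(hist, bin_edges, n_levels=256):
    """Simplified stand-in for the deleted _non_linear_param_search.

    Walks candidate clipping points over a histogram of |x| and keeps the one
    with the smallest L2 error proxy: uniform-quantization noise inside the
    clip range plus saturation error for everything beyond it. The real
    observer measures the error with torch.ops.hpu.cast_to_fp8_v2; n_levels
    here only mirrors its dst_nbins = 2**8 as a uniform-grid approximation.
    """
    centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
    best_err, best_amax = float("inf"), bin_edges[-1]
    for i in range(len(hist) // 2, len(hist)):
        clip = bin_edges[i + 1]
        step = clip / n_levels
        inside = hist[: i + 1].sum() * step ** 2 / 12.0      # rounding noise
        outside = float((hist[i + 1:] * (centers[i + 1:] - clip) ** 2).sum())  # saturation
        err = inside + outside
        if err < best_err:
            best_err, best_amax = err, clip
    return best_amax

x = np.random.randn(100_000) * 0.5
x[:10] *= 50                                    # inject a few large outliers
hist, edges = np.histogram(np.abs(x), bins=2048)
print("clipped amax:", search_clip_amax(hist, edges), "raw amax:", np.abs(x).max())
```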
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import json -import os - -import habana_frameworks.torch.core as htcore -import torch - -from neural_compressor.common.utils import load_config_mapping, save_config_mapping -from neural_compressor.torch.utils import QCONFIG_NAME, WEIGHT_NAME, logger - -from .fp8_quant import FP8_DTYPE, dtype_mapping -from .modules import ( # fp32; dynamic modules - Autocast, - BatchMatmul, - FP8Cast, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - Matmul, -) -from .observer import observer_mapping - - -def save(model, output_dir="./saved_results"): - if not os.path.exists(output_dir): - os.mkdir(output_dir) - qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) - qconfig_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), QCONFIG_NAME) - # saving process - save_config_mapping(model.qconfig, qconfig_file_path) - - import fp8_convert - - stat_dict = {} - for k, v in model.state_dict().items(): - if v.dtype in FP8_DTYPE: - v = fp8_convert.to_u8(v.to("cpu")) - stat_dict[k] = v.to("cpu") - torch.save(stat_dict, qmodel_file_path) - - logger.info("Save state_dict of quantized model to {}.".format(qmodel_file_path)) - logger.info("Save configuration of quantized model to {}.".format(qconfig_file_path)) - - -def load(model, output_dir="./saved_results"): - from neural_compressor.torch.utils import fetch_module, set_module - - from .fp8_quant import quantization_mapping, white_list - - qmodel_file_path = os.path.join(os.path.abspath(os.path.expanduser(output_dir)), WEIGHT_NAME) - stat_dict = torch.load(qmodel_file_path) - import fp8_convert - - for (op_name, op_type), op_qconfig in model.qconfig.items(): - dtype = dtype_mapping[op_qconfig.w_dtype] - # only modules that have weight should use this observer - observer_cls = observer_mapping[op_qconfig.w_observer] - observer_obj = observer_cls(dtype=dtype) - choice = 1 if dtype == torch.float8_e4m3fn else 0 - if op_name + ".weight" in stat_dict: - stat_dict[op_name + ".weight"] = fp8_convert.from_u8(stat_dict[op_name + ".weight"], choice) - if dtype not in FP8_DTYPE: - continue - module = fetch_module(model, op_name) - # replace module - if op_qconfig.approach == "static": - if isinstance(module, white_list): - QModule = quantization_mapping[type(module)] - qmodule = QModule(module, dtype) - else: - if isinstance(module, torch.nn.Linear): - # need module for initialization - qmodule = FP8DynamicLinear(module, dtype) - elif isinstance(module, Matmul): - qmodule = FP8DynamicMatmul(dtype) - elif isinstance(module, BatchMatmul): - qmodule = FP8DynamicBatchMatmul(dtype) - elif isinstance(module, Autocast): - qmodule = FP8Cast(dtype=dtype) - # only modules that have weight should use this API - if hasattr(qmodule, "from_float"): - qmodule.from_float(module, observer_obj) - # replace module with qmodule - set_module(model, op_name, qmodule) - htcore.mark_step() - model.load_state_dict(stat_dict, assign=True) - model.to("hpu") - htcore.mark_step() - logger.info("Quantized model loading successful.") - return model diff --git 
a/neural_compressor/torch/algorithms/habana_fp8/scale.py b/neural_compressor/torch/algorithms/habana_fp8/scale.py deleted file mode 100644 index 1dfaee24502..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/scale.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# pylint:disable=import-error - -import habana_frameworks.torch.core as htcore -import torch - -scale_method_mapping = {} - - -def scale_method_registry(name): - def new_scale_method(scale_method_cls): - global scale_method_mapping - scale_method_mapping[name] = scale_method_cls - return scale_method_cls - - return new_scale_method - - -@scale_method_registry("hw") -def hardware_scale_method(scale): - scale_list = torch.tensor([16, 1, 1 / 16, 1 / 256]) - return torch.clip( - 2 ** (torch.ceil(torch.log2(scale) / 4) * 4), - torch.tensor(scale_list[-1], dtype=scale.dtype, device=scale.device), - torch.tensor(scale_list[0], dtype=scale.dtype, device=scale.device), - ) - - -@scale_method_registry("pow2") -def pow2_scale_method(scale): - return 2 ** torch.ceil(torch.log2(scale)) - - -@scale_method_registry("unit") -def unit_scale_method(scale): - return torch.tensor(1.0) - - -@scale_method_registry("self") -def self_scale_method(scale): - return scale - - -def map_gaudi_scale(scale, method): - scale_method = scale_method_mapping[method] - return scale_method(scale) diff --git a/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py b/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/tensor/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp b/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp deleted file mode 100644 index f22c5c82c89..00000000000 --- a/neural_compressor/torch/algorithms/habana_fp8/tensor/convert.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
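The deleted `scale.py` registers four scale-mapping methods; the `hw` variant snaps the scale's exponent to a multiple of 4 and clips it to the hardware-supported range [1/256, 16], while `pow2` rounds the scale up to the next power of two. A small stand-alone sketch of those two mappings, using plain CPU tensors (no HPU required):

```python
import torch

def pow2_scale(scale: torch.Tensor) -> torch.Tensor:
    # round the scale up to the next power of two (the deleted "pow2" method)
    return 2 ** torch.ceil(torch.log2(scale))

def hw_scale(scale: torch.Tensor) -> torch.Tensor:
    # snap the exponent to a multiple of 4 and clip to [1/256, 16],
    # mirroring the scale list allowed by the deleted "hw" method
    return torch.clip(2 ** (torch.ceil(torch.log2(scale) / 4) * 4), 1 / 256, 16.0)

for raw in (0.003, 0.7, 5.0, 900.0):
    s = torch.tensor(raw)
    print(f"{raw}: pow2={pow2_scale(s).item()}  hw={hw_scale(s).item()}")
```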
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Temporary implementation of fp8 tensor saving and loading -// Will remove after Habana torch applies below patch: -// https://github.com/pytorch/pytorch/pull/114662 - - -#include - - -// function prototype declaration -torch::Tensor to_u8(torch::Tensor tensor); -torch::Tensor from_u8(torch::Tensor tensor, int choice=1); - - -torch::Tensor to_u8(torch::Tensor tensor) { - auto p = tensor.data_ptr(); - // RuntimeError: HPU device type not enabled. - auto options = torch::TensorOptions().device(torch::kCPU).dtype(torch::kUInt8); - auto tmp = torch::from_blob(p, tensor.sizes(), options); - // copy to avoid memory leak. - torch::Tensor tensor_uint8 = torch::empty_like(tensor, torch::kUInt8).copy_(tmp); - return tensor_uint8; -}; - - -/* -choice=1 means torch.float8_e4m3fn; -others means torch.float8_e5m2; -*/ -torch::Tensor from_u8(torch::Tensor tensor, int choice) { - auto p = tensor.data_ptr(); - torch::ScalarType dtype; - if (choice == 1) { - dtype = torch::kFloat8_e4m3fn; - } - else { - dtype = torch::kFloat8_e5m2; - } - auto options = torch::TensorOptions().device(torch::kCPU).dtype(dtype); - auto tmp = torch::from_blob(p, tensor.sizes(), options); - // copy to avoid memory leak. - torch::Tensor tensor_fp8 = torch::empty_like(tensor, dtype).copy_(tmp); - return tensor_fp8; -}; - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("to_u8", &to_u8, "Convert tensor to u8 for saving."); - m.def("from_u8", &from_u8, "Recover tensor from u8 for loading."); -}; diff --git a/neural_compressor/torch/amp/__init__.py b/neural_compressor/torch/amp/__init__.py deleted file mode 100644 index 87a0c8287d0..00000000000 --- a/neural_compressor/torch/amp/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .autocast import autocast diff --git a/neural_compressor/torch/amp/autocast.py b/neural_compressor/torch/amp/autocast.py deleted file mode 100644 index 7375b80c0f5..00000000000 --- a/neural_compressor/torch/amp/autocast.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
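The deleted `convert.cpp` extension works around missing FP8 serialization by bit-casting tensors to `uint8` before `torch.save` and back on load. On a PyTorch build that ships the float8 dtypes, the same round trip can be sketched in pure Python with `Tensor.view`; this is an illustration of the idea, not the extension's actual code path:

```python
import torch

# Bit-cast the FP8 payload to uint8 for saving, then bit-cast back on load.
# Assumes a PyTorch build with the float8 dtypes; no HPU is needed.
t = torch.randn(8).to(torch.float8_e4m3fn)

torch.save(t.view(torch.uint8), "fp8_as_u8.pt")           # store raw bytes
restored = torch.load("fp8_as_u8.pt").view(torch.float8_e4m3fn)

assert torch.equal(restored.view(torch.uint8), t.view(torch.uint8))
print(restored.to(torch.float32))
```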
-# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Any, Optional - -import torch -from torch.types import _dtype - - -class autocast: - r"""Instances of :class:`autocast` serve as context managers or decorators that - allow regions of your script to run in mixed precision. - - In these regions, ops run in an op-specific dtype chosen by autocast - to improve performance while maintaining accuracy. - - When entering an autocast-enabled region, Tensors may be any type. - You should not call ``half()`` or ``bfloat16()`` on your model(s) or inputs when using autocasting. - - :class:`autocast` should wrap only the forward pass(es) of your network, including the loss - computation(s). Backward passes under autocast are not recommended. - Backward ops run in the same type that autocast used for corresponding forward ops. - - # Enables autocasting for the inference pass - with torch.autocast(device_type="hpu", dtype=torch.float8_e4m3fn): - output = model(input) - - :class:`autocast` can also be used as a decorator, e.g., on the ``forward`` method of your model:: - - class AutocastModel(nn.Module): - ... - @torch.autocast(device_type="cuda") - def forward(self, input): - ... - - The autocast state is thread-local. If you want it enabled in a new thread, the context manager or decorator - must be invoked in that thread. This affects :class:`torch.nn.DataParallel` and - :class:`torch.nn.parallel.DistributedDataParallel` when used with more than one GPU per process - (see :ref:`Working with Multiple GPUs`). - - Args: - device_type(str, required): Device type to use. Possible values are: 'cuda', 'cpu', 'xpu' and 'hpu'. - The type is the same as the `type` attribute of a :class:`torch.device`. - Thus, you may obtain the device type of a tensor using `Tensor.device.type`. - enabled(bool, optional): Whether autocasting should be enabled in the region. - Default: ``True`` - dtype(torch_dtype, optional): Whether to use torch.float16 or torch.bfloat16. - cache_enabled(bool, optional): Whether the weight cache inside autocast should be enabled. 
- Default: ``True`` - """ - - def __init__( - self, - device_type: str, - dtype: Optional[_dtype] = None, - enabled: bool = True, - cache_enabled: Optional[bool] = None, - ): - self.device = device_type - if dtype is not None: - self.fast_dtype = dtype - if cache_enabled is not None: - self._cache_enabled = cache_enabled - if not (device_type == "hpu" and dtype in [torch.float8_e4m3fn, torch.float8_e5m2]): - self._autocast = torch.autocast(device_type, dtype, enabled, cache_enabled) - - def __enter__(self) -> None: - if self.device == "hpu" and self.fast_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - from neural_compressor.torch.amp.fp8.functions import replace_func - - # This function will replace F.linear and torch.matmul with the fp8 one - replace_func(self.fast_dtype) - else: - self._autocast.__enter__() - - def __exit__(self, exc_type, exc_value, traceback) -> None: - if self.device == "hpu" and self.fast_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]: - from neural_compressor.torch.amp.fp8.functions import recover_func - - # This function will recover F.linear and torch.matmul with the original one - recover_func() - else: - self._autocast.__exit__(exc_type, exc_value, traceback) diff --git a/neural_compressor/torch/amp/fp8/__init__.py b/neural_compressor/torch/amp/fp8/__init__.py deleted file mode 100644 index 28f108cb636..00000000000 --- a/neural_compressor/torch/amp/fp8/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/neural_compressor/torch/amp/fp8/functions.py b/neural_compressor/torch/amp/fp8/functions.py deleted file mode 100644 index f8f19a64b17..00000000000 --- a/neural_compressor/torch/amp/fp8/functions.py +++ /dev/null @@ -1,134 +0,0 @@ -# Copyright (c) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
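For reference, the wrapper removed above defers to `torch.autocast` in every case except `hpu` with an FP8 dtype, where it patches `F.linear`/`torch.matmul` instead. A usage sketch follows (it needs a Gaudi/HPU stack with hpex, so it is illustrative only). Note that the `with autocast(...) and torch.no_grad():` form used in the deleted tests further below only enters the right-hand context manager; the comma form enters both:

```python
import torch
# Usage sketch for the context manager removed in this diff (HPU required).
from neural_compressor.torch.amp import autocast

model = torch.nn.Linear(10, 5).to("hpu")
inp = torch.randn(1, 10).to("hpu")

# chain the context managers with a comma so both are actually entered
with autocast("hpu", dtype=torch.float8_e4m3fn), torch.no_grad():
    out = model(inp)   # F.linear / torch.matmul are routed to the FP8 kernels
```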
-# pylint:disable=import-error - -import os - -import habana_frameworks.torch.core as htcore -import habana_frameworks.torch.hpex -import torch -from torch.nn import functional as F - -from neural_compressor.torch.algorithms.habana_fp8.observer import calculate_qparams -from neural_compressor.torch.utils import logger - -_F_linear = F.linear -_torch_matmul = torch.matmul -_torch_bmm = torch.bmm - - -DATA_TYPE = torch.float8_e4m3fn -USE_AMAX = bool(os.getenv("PT_USE_FP8_AMAX", False)) - - -def fp8_linear_forward(input, weight, bias=None): - out_dtype = torch.float32 - org_middle_shape = input.shape[1:-1] - input = input.view((-1, weight.shape[-1])) - # process input - if input.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input.dtype - if USE_AMAX: - input_scale = calculate_qparams(input.min(), input.max(), DATA_TYPE) - input_scale_inv = torch.reciprocal(input_scale) - else: - input_scale, input_scale_inv = None, None - input = torch.ops.hpu.cast_to_fp8_v2(input, input_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input - input_scale, input_scale_inv = None, None - # process weight - if weight.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = weight.dtype - if USE_AMAX: - weight_scale = calculate_qparams(weight.min(), weight.max(), DATA_TYPE) - weight_scale_inv = torch.reciprocal(weight_scale) - else: - weight_scale, weight_scale_inv = None, None - weight = torch.ops.hpu.cast_to_fp8_v2(weight, weight_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for weight - weight_scale, weight_scale_inv = None, None - out = torch.ops.hpu.fp8_gemm_v2( - input, - False, - weight, - True, - None, - out_dtype, - input_scale, - weight_scale, - bias, - False, - ) - out = out.view(-1, *org_middle_shape, out.shape[-1]) - return out - - -def fp8_matmul(input1, input2): - out_dtype = torch.float32 - # process input1 - if input1.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input1.dtype - if USE_AMAX: - input1_scale = calculate_qparams(input1.min(), input1.max(), DATA_TYPE) - input1_scale_inv = torch.reciprocal(input1_scale) - else: - input1_scale, input1_scale_inv = None, None - input1 = torch.ops.hpu.cast_to_fp8_v2(input1, input1_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input1 - input1_scale, input1_scale_inv = None, None - # process input2 - if input2.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]: - out_dtype = input2.dtype - if USE_AMAX: - input2_scale = calculate_qparams(input2.min(), input2.max(), DATA_TYPE) - input2_scale_inv = torch.reciprocal(input2_scale) - else: - input2_scale, input2_scale_inv = None, None - input2 = torch.ops.hpu.cast_to_fp8_v2(input2, input2_scale_inv, False, False, DATA_TYPE)[0] - else: - # skip cast for input2 - input2_scale, input2_scale_inv = None, None - # calculate - out = torch.ops.hpu.fp8_gemm_v2( - input1, - False, - input2, - False, - None, - out_dtype, - input1_scale, - input2_scale, - None, - False, - ) - return out - - -def replace_func(dtype): - global DATA_TYPE - DATA_TYPE = dtype - F.linear = fp8_linear_forward - torch.matmul = fp8_matmul - torch.bmm = fp8_matmul - logger.debug("F.linear and torch.matmul are replaced with the fp8 one") - - -def recover_func(): - F.linear = _F_linear - torch.matmul = _torch_matmul - torch.bmm = _torch_bmm - logger.debug("F.linear and torch.matmul are recovered") diff --git a/neural_compressor/torch/quantization/__init__.py b/neural_compressor/torch/quantization/__init__.py index 3bc12580848..9f459bbd67f 
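The deleted `fp8/functions.py` relies on plain module-level monkey-patching: keep handles to the original ops, swap in FP8 replacements, and restore them on exit. A minimal CPU-only sketch of that pattern, with a hypothetical tracing wrapper standing in for the `torch.ops.hpu.fp8_gemm_v2`-based kernels:

```python
import torch
from torch.nn import functional as F

# Keep a handle to the original op so it can be restored, mirroring the
# replace_func / recover_func pair in the deleted module.
_orig_linear = F.linear

def traced_linear(input, weight, bias=None):
    print("F.linear:", tuple(input.shape), "x", tuple(weight.shape))
    return _orig_linear(input, weight, bias)

def replace_linear():
    F.linear = traced_linear

def recover_linear():
    F.linear = _orig_linear

replace_linear()
torch.nn.Linear(4, 3)(torch.randn(2, 4))   # routed through traced_linear
recover_linear()
```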
100644 --- a/neural_compressor/torch/quantization/__init__.py +++ b/neural_compressor/torch/quantization/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from neural_compressor.torch.quantization.quantize import quantize, prepare, convert +from neural_compressor.torch.quantization.quantize import quantize, prepare, convert, finalize_calibration from neural_compressor.torch.quantization.config import ( RTNConfig, get_default_rtn_config, diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index 3e107718e51..ba9f69001c4 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -519,20 +519,22 @@ def hqq_entry( ###################### Habana FP8 Algo Entry ################################## -from neural_compressor.torch.utils import is_hpex_available - -if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import quantize, save - - @register_algo(FP8_QUANT) - def fp8_quant_entry( - model: torch.nn.Module, configs_mapping: Dict[Tuple[str], FP8Config], *args, **kwargs - ) -> torch.nn.Module: - kwargs.pop("example_inputs") - model = quantize(model, configs_mapping, *args, **kwargs) - model.qconfig = configs_mapping - model.save = MethodType(save, model) - return model +@register_algo(FP8_QUANT) +@torch.no_grad() +def fp8_entry( + model: torch.nn.Module, + configs_mapping: Dict[Tuple[str], FP8Config], + mode: Mode = Mode.QUANTIZE, + *args, + **kwargs, +) -> torch.nn.Module: + """The main entry to apply fp8 quantization.""" + from neural_compressor.torch.algorithms.fp8_quant import FP8Quantizer + + quantizer = get_quantizer(model, quantizer_cls=FP8Quantizer, quant_config=configs_mapping) + model = quantizer.execute(model, mode=mode) + postprocess_model(model, mode, quantizer) + return model ###################### MX Quant Algo Entry ################################## diff --git a/neural_compressor/torch/quantization/config.py b/neural_compressor/torch/quantization/config.py index 05a2a956b56..6a20f2bba42 100644 --- a/neural_compressor/torch/quantization/config.py +++ b/neural_compressor/torch/quantization/config.py @@ -16,6 +16,8 @@ # limitations under the License. 
# pylint:disable=import-error +import json +import importlib from collections import OrderedDict from typing import Callable, Dict, List, NamedTuple, Optional from typing import OrderedDict as OrderedDictType @@ -1230,81 +1232,142 @@ def get_default_hqq_config() -> HQQConfig: return HQQConfig() -######################## FP8 Config ############################### +######################## FP8 Quant Config ############################### +# refer to habana_quantization_toolkit/_core/common.py +FP8_WHITE_LIST = ( + "Matmul", "Linear", "FalconLinear", "KVCache", "Conv2d", + "LoRACompatibleLinear", "LoRACompatibleConv", "Softmax", "ModuleFusedSDPA") +if importlib.util.find_spec("deepspeed"): + FP8_WHITE_LIST.append( + "LinearLayer", "LinearAllreduce","ScopedLinearAllReduce", "LmHeadLinearAllreduce") + @register_config(framework_name=FRAMEWORK_NAME, algo_name=FP8_QUANT) class FP8Config(BaseConfig): """Config class for FP8 quantization.""" name = FP8_QUANT - supported_configs: List[OperatorConfig] = [] + + # tunable params params_list = [ - "w_dtype", - "w_observer", - "act_dtype", - "act_observer", - "approach", - "device", + "fp8_config", + "scale_method", + "observer", + "measure_exclude", ] def __init__( self, - w_dtype: str = "fp8_e4m3", - w_observer: Union[str, List[str]] = "minmax_per_channel", - act_dtype: str = "fp8_e4m3", - act_observer: Union[str, List[str]] = "minmax", - approach: Union[str, List[str]] = "static", - device: Union[str, List[str]] = "hpu", - white_list: Optional[List[OP_NAME_OR_MODULE_TYPE]] = DEFAULT_WHITE_LIST, + dump_stats_path: str = "./hqt_output/measure", + fp8_config: str = "E4M3", + hp_dtype: torch.dtype = torch.bfloat16, + blocklist: dict = {'names': [], 'types': ()}, + allowlist: dict = {'names': [], 'types': FP8_WHITE_LIST}, + mode: str = "AUTO", + scale_method: str = "maxabs_hw", + scale_params: dict = {}, + observer: str = "maxabs", + mod_dict: dict = {}, + measure_exclude: str = "OUTPUT", + **kwargs, ): - """Init FP8 config. 
+ """Init FP8 config.""" + super().__init__() + self.dump_stats_path =dump_stats_path + self.fp8_config = fp8_config + self.hp_dtype = hp_dtype + self.blocklist = blocklist + self.allowlist = allowlist + self.mode = mode + self.scale_method = scale_method + self.scale_params = scale_params + self.observer = observer + self.mod_dict = mod_dict + self._json_file = None + + @property + def measure(self): + return self.mode == "MEASURE" + + @property + def quantize(self): + return self.mode == "QUANTIZE" + + @property + def json_file(self): + if self._json_file is None: + import tempfile + from pathlib import Path + + json_file_tmp = tempfile.NamedTemporaryFile(suffix=".json") + self.to_json_file(json_file_tmp.name) + self.json_file(json_file_tmp.name) + return self._json_file + + @json_file.setter + def json_file(self, json_file): + self._json_file = json_file - Args: - """ - super().__init__(white_list=white_list) - self.w_dtype = w_dtype - self.w_observer = w_observer - self.act_dtype = act_dtype - self.act_observer = act_observer - self.approach = approach - self.device = device - self._post_init() + @classmethod + def from_json_file(cls, filename): + with open(filename, "r", encoding="utf-8") as file: + config_dict = json.load(file) + config = cls.from_dict(config_dict) + config.json_file = filename + return config @classmethod - def register_supported_configs(cls) -> List[OperatorConfig]: + def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]: + # just a simple example here + # usually write parameter combinations that are more suitable to tune based on experience. + return FP8Config( + fp8_config=["E4M3", "E5M2"], + scale_method=["without_scale", "maxabs_hw"], + measure_exclude=["NONE", "OUTPUT"]) + + @classmethod + def register_supported_configs(cls): + """Add all supported configs.""" supported_configs = [] - fp8_config = FP8Config( - w_dtype=["fp8_e5m2", "fp8_e4m3"], - w_observer=["minmax", "minmax_per_channel"], - act_dtype=["fp8_e5m2", "fp8_e4m3"], - act_observer=["minmax", "kl"], - approach=["static", "dynamic"], - device=["hpu"], + linear_rtn_config = FP8Config( + mode=["AUTO", "MEASURE", "QUANTIZE"], + fp8_config=["E4M3", "E5M2"], + scale_method=["without_scale", "unit_scale", "max", "maxabs_hw", + "maxabs_pow2", "maxabs_hw_opt_weight", "maxabs_pow2_opt_weight", + "smoothquant_weights_output_channel_maxabs_pow2", + "weaksmoothquant_weights_output_channel_maxabs_pow2", + "act_maxabs_hw_weights_pcs_maxabs_pow2", + "act_maxabs_hw_weights_pcs_opt_pow2", + "act_maxabs_pow2_weights_pcs_maxabs_pow2", + "act_maxabs_pow2_weights_pcs_opt_pow2", + "smoothquant_opt"], + observer=["shape", "maxabs", "maxabs_per_channel", "save"], + measure_exclude=["NONE", "OUTPUT", "INPUT", "ALL"], ) - if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import white_list - - operators = white_list - else: - operators = () - supported_configs.append(OperatorConfig(config=fp8_config, operators=operators)) + operators = list(FP8_WHITE_LIST) + supported_configs.append(OperatorConfig(config=linear_rtn_config, operators=operators)) cls.supported_configs = supported_configs @staticmethod def get_model_info(model: torch.nn.Module) -> List[Tuple[str, Callable]]: - from neural_compressor.torch.algorithms.habana_fp8 import white_list - filter_result = [] for op_name, module in model.named_modules(): - if isinstance(module, white_list): - pair = (op_name, type(module).__name__) + if module.__class__.__name__ in FP8_WHITE_LIST or \ + 
module.__class__.__name__.split("Patched")[-1] in FP8_WHITE_LIST: + pair = (op_name, module.__class__.__name__) filter_result.append(pair) logger.debug(f"Get model info: {filter_result}") return filter_result - @classmethod - def get_config_set_for_tuning(cls) -> Union[None, "FP8Config", List["FP8Config"]]: - # TODO fwk owner needs to update it. - return FP8Config(act_observer=["minmax", "kl"]) + def to_config_mapping( + self, config_list: List[BaseConfig] = None, model_info: List[Tuple[str, str]] = None + ): + config_mapping = OrderedDict() + if config_list is None: + config_list = [self] + for config in config_list: + for op_name, op_type in model_info: + config_mapping[(op_name, op_type)] = self + return config_mapping def get_default_fp8_config() -> FP8Config: diff --git a/neural_compressor/torch/quantization/load_entry.py b/neural_compressor/torch/quantization/load_entry.py index fb870a92e77..cb2f8b4010f 100644 --- a/neural_compressor/torch/quantization/load_entry.py +++ b/neural_compressor/torch/quantization/load_entry.py @@ -61,7 +61,6 @@ def load(output_dir="./saved_results", model=None): return load(output_dir) model.qconfig = config_mapping - if isinstance(config_object, FP8Config): # FP8 - from neural_compressor.torch.algorithms.habana_fp8 import load - - return load(model, output_dir) # pylint: disable=E1121 + if isinstance(config_object, FP8Config): + # TODO: support loading FP8 model + raise NotImplementedError("`load` function for FP8 model is not supported yet.") diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 57197a91972..0c2fdcd94d8 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -19,7 +19,7 @@ from neural_compressor.common.base_config import BaseConfig, ComposableConfig, config_registry from neural_compressor.common.utils import Mode, log_process -from neural_compressor.torch.quantization.config import SmoothQuantConfig, StaticQuantConfig +from neural_compressor.torch.quantization.config import SmoothQuantConfig, StaticQuantConfig, FP8Config from neural_compressor.torch.utils import is_ipex_available, logger from neural_compressor.torch.utils.utility import WHITE_MODULE_LIST, algos_mapping, get_model_info @@ -60,8 +60,8 @@ def quantize( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Quantize model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Quantize model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -129,8 +129,8 @@ def prepare( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Prepare model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Prepare model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -176,8 +176,9 @@ def convert( """ q_model = model if inplace else copy.deepcopy(model) - # TODO: Optimize the check for prepared flag after adding HQT FP8 Quant - assert getattr(model, "prepared", False), "Please run prepare function before convert." + assert ( + getattr(model, "prepared", False) or quant_config is not None + ), "Please pass quant_config to convert function." 
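With this change, FP8 quantization flows through the common `prepare`/`convert` entry points, and the new `finalize_calibration` helper persists the measurement files. A hedged sketch of how the pieces appear intended to compose, assuming a Gaudi/HPU environment with the `fp8_quant` backend available (the switch between measurement and quantization modes is handled inside the backend):

```python
import torch
# Illustrative only: requires HPU and the fp8_quant backend; not runnable on CPU.
from neural_compressor.torch.quantization import FP8Config, prepare, convert, finalize_calibration

model = torch.nn.Linear(10, 5).to("hpu")
calib_data = [torch.randn(1, 10).to("hpu") for _ in range(4)]

config = FP8Config(fp8_config="E4M3", scale_method="maxabs_hw")
model = prepare(model, config)        # measurement pass
for sample in calib_data:
    model(sample)
finalize_calibration(model)           # writes the stats under dump_stats_path

model = convert(model)                # quantize using the collected measurements
```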
if getattr(model, "prepared", False): if quant_config is None: @@ -192,8 +193,8 @@ def convert( assert isinstance( quant_config, BaseConfig ), f"Please pass a dict or config instance as the quantization configuration, but got {type(quant_config)}." - logger.info("Convert model with config:") - logger.info(quant_config.to_dict()) + logger.debug("Convert model with config:") + logger.debug(quant_config.to_dict()) # select quantization algo according to config if is_ipex_available and ( @@ -216,3 +217,12 @@ def convert( mode=Mode.CONVERT, ) return q_model + + +def finalize_calibration(model): + if hasattr(model, "quant_config") and isinstance(model.quant_config, FP8Config): # FP8 + from neural_compressor.torch.algorithms.fp8_quant import save_calib_result + + save_calib_result(model) + else: + raise NotImplementedError("`finalize_calibration` only supports FP8 measurement now.") diff --git a/setup.py b/setup.py index cccb6c9feea..a1d4b8de02c 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,8 @@ def get_build_version(): return __version__ try: result = subprocess.run(["git", "describe", "--tags"], capture_output=True, text=True, check=True) - _, distance, commit = result.stdout.strip().split("-") + distance = result.stdout.strip().split("-")[-2] + commit = result.stdout.strip().split("-")[-1] return f"{__version__}.dev{distance}+{commit}" except subprocess.CalledProcessError: return __version__ diff --git a/test/3x/torch/amp/test_fp8_amp.py b/test/3x/torch/amp/test_fp8_amp.py deleted file mode 100644 index a5212467723..00000000000 --- a/test/3x/torch/amp/test_fp8_amp.py +++ /dev/null @@ -1,75 +0,0 @@ -import copy -import os -import shutil -import unittest - -import torch - -from neural_compressor.torch.amp import autocast -from neural_compressor.torch.utils import is_hpex_available - -# if not is_hpex_available(): -# exit() - - -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5) - self.fc2 = torch.nn.Linear(5, 10) - - def forward(self, inp): - x1 = self.fc1(inp) - x2 = self.fc2(x1) - x3 = torch.matmul(inp.T, x2) - x3 = x3.unsqueeze(0) - x3 = torch.bmm(x3, x3) - return x3 - - -@unittest.skipIf(not is_hpex_available(), "HPEX is required for HPU inference") -class TestPytorchFP8Adaptor(unittest.TestCase): - @classmethod - def setUpClass(self): - self.model = M().to("hpu") - self.inp = torch.randn(1, 10).to("hpu") - - @classmethod - def tearDownClass(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("./.graph_dumps", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_autocast(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - with autocast("hpu", dtype=torch.bfloat16) and torch.no_grad(): - bf16_out = m(inp) - print("BF16 MSE:", (bf16_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e5m2) and torch.no_grad(): - e5m2_out = m(inp) - print("FP8_E5M2 MSE:", (e5m2_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e4m3fn) and torch.no_grad(): - e4m3_out = m(inp) - print("FP8_E4M3 MSE:", (e4m3_out - fp32_out).pow(2).sum()) - - def test_autocast_use_amax(self): - os.environ["PT_USE_FP8_AMAX"] = str(1) - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - with autocast("hpu", dtype=torch.float8_e5m2) and torch.no_grad(): - e5m2_out = m(inp) - print("FP8_E5M2 using amax MSE:", (e5m2_out - fp32_out).pow(2).sum()) - - with autocast("hpu", dtype=torch.float8_e4m3fn) and torch.no_grad(): - e4m3_out = m(inp) - 
print("FP8_E4M3 using amax MSE:", (e4m3_out - fp32_out).pow(2).sum()) - os.environ.pop("PT_USE_FP8_AMAX", None) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/3x/torch/quantization/habana_fp8/test_fp8.py b/test/3x/torch/quantization/habana_fp8/test_fp8.py deleted file mode 100644 index 8fafc302f65..00000000000 --- a/test/3x/torch/quantization/habana_fp8/test_fp8.py +++ /dev/null @@ -1,189 +0,0 @@ -import copy -import shutil - -import pytest -import torch - -from neural_compressor.torch.utils import is_hpex_available - -if is_hpex_available(): - from neural_compressor.torch.algorithms.habana_fp8 import quantize_dynamic - from neural_compressor.torch.algorithms.habana_fp8.modules import ( - BatchMatmul, - FP8BatchMatmul, - FP8DynamicBatchMatmul, - FP8DynamicLinear, - FP8DynamicMatmul, - FP8Linear, - FP8Matmul, - Matmul, - ) - from neural_compressor.torch.quantization import ( - FP8Config, - TuningConfig, - autotune, - get_default_fp8_config, - get_default_fp8_config_set, - quantize, - ) - - torch.set_grad_enabled(False) - - -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5) - self.fc2 = torch.nn.Linear(5, 10) - self.mm = Matmul() - self.bmm = BatchMatmul() - - def forward(self, inp): - x1 = self.fc1(inp) - x2 = self.fc2(x1) - x3 = self.mm(inp.T, x2) - x3 = x3.unsqueeze(0) - x4 = self.mm(inp.T, x2) - x4 = x4.unsqueeze(0) + 1 ## SW-178838 - x5 = self.bmm(x3, x4) - x6 = self.bmm(x3, x4) - out = x5 + x6 - return out - - -@pytest.mark.skipif(not is_hpex_available(), reason="no hpex in environment here.") -class TestPytorchFP8Adaptor: - def setup_class(self): - self.model = M().to("hpu") - self.inp = torch.randn(1, 10).to("hpu") - self.fp32_out = self.model(self.inp) - - def teardown_class(self): - shutil.rmtree("./saved", ignore_errors=True) - shutil.rmtree("./.graph_dumps", ignore_errors=True) - shutil.rmtree("runs", ignore_errors=True) - - def test_dynamic_accu(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - m = quantize_dynamic(m, dtype="fp8_e5m2", inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." - print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E5M2 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - m = quantize_dynamic(m, dtype="fp8_e4m3", inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." - print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E4M3 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - qconfig = FP8Config(approach="dynamic") - m = quantize(m, qconfig, inplace=True) - assert isinstance(m.fc1, FP8DynamicLinear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8DynamicMatmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8DynamicBatchMatmul), "Unexpected result. Please double check." 
- print(m) - fp8_out = m(inp) - print("Dynamic quantization FP8_E4M3 MSE:", (fp32_out - fp8_out).pow(2).sum()) - - @pytest.mark.parametrize("dtype", ["fp8_e5m2", "fp8_e4m3"]) - @pytest.mark.parametrize("w_observer", ["minmax", "minmax_per_channel"]) - @pytest.mark.parametrize("act_observer", ["minmax", "kl"]) - def test_static_accu(self, dtype, w_observer, act_observer): - m = copy.deepcopy(self.model) - inp = self.inp - qconfig = FP8Config( - w_dtype=dtype, w_observer=w_observer, act_dtype=dtype, act_observer=act_observer, approach="static" - ) - - def calib_func(model): - model(inp) - - m = quantize(m, qconfig, run_fn=calib_func, inplace=True) - assert isinstance(m.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8BatchMatmul), "Unexpected result. Please double check." - fp8_out = m(inp) - print("Static quantization config:", dtype, w_observer, act_observer) - print("Static quantization MSE:", (self.fp32_out - fp8_out).pow(2).sum()) - - def test_convert(self): - # Temporary implementation of fp8 tensor saving and loading - # Will remove after Habana torch applies below patch: - # https://github.com/pytorch/pytorch/pull/114662 - # e4m3 - fp8_inp = torch.ops.hpu.cast_to_fp8_v2(self.inp, 500, dtype=torch.float8_e4m3fn)[0].to("cpu") - import fp8_convert - - int8_inp = fp8_convert.to_u8(fp8_inp) - torch.save(int8_inp, "tmp.pt") - saved_int8_inp = torch.load("tmp.pt") - recovered_inp = fp8_convert.from_u8(saved_int8_inp, 1) - assert (fp8_inp == recovered_inp).all(), "Unexpected result. Please double check." - # e5m2 - fp8_inp = torch.ops.hpu.cast_to_fp8_v2(self.inp, 500, dtype=torch.float8_e5m2)[0].to("cpu") - int8_inp = fp8_convert.to_u8(fp8_inp) - recovered_inp = fp8_convert.from_u8(int8_inp, 0) - assert (fp8_inp == recovered_inp).all(), "Unexpected result. Please double check." - - def test_save_load(self): - m = copy.deepcopy(self.model) - inp = self.inp - qconfig = get_default_fp8_config() - - def calib_func(model): - model(inp) - - m = quantize(m, qconfig, run_fn=calib_func, inplace=True) - fp8_out = m(inp) - m.save("saved_results") - - from neural_compressor.torch.quantization import load - - m = copy.deepcopy(self.model) - m = load("saved_results", m) - recovered_out = m(inp) - assert (recovered_out == fp8_out).all(), "Unexpected result. Please double check." - assert isinstance(m.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(m.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(m.bmm, FP8BatchMatmul), "Unexpected result. Please double check." - - def test_autotune(self): - m = copy.deepcopy(self.model) - inp = self.inp - fp32_out = m(inp) - - def calib_func(model): - model(inp) - - accu_list = [1.0, 0.9, 0.99] - - def eval_func(model): - nonlocal accu_list - return accu_list.pop() - - tune_config = TuningConfig( - config_set=get_default_fp8_config_set(), - tolerable_loss=0.01, - ) - best_model = autotune( - model=m, - tune_config=tune_config, - run_fn=calib_func, - eval_fns=eval_func, - ) - assert isinstance(best_model.fc1, FP8Linear), "Unexpected result. Please double check." - assert isinstance(best_model.mm, FP8Matmul), "Unexpected result. Please double check." - assert isinstance(best_model.bmm, FP8BatchMatmul), "Unexpected result. Please double check."
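Circling back to the `setup.py` hunk above: `git describe --tags` embeds the tag verbatim, and tags may themselves contain hyphens, so the old fixed three-way unpack could raise `ValueError`. Taking the last two `-`-separated fields is robust; the describe strings below are made up purely for illustration:

```python
# Why setup.py now takes the *last* two "-"-separated fields of `git describe --tags`.
for describe in ("v2.5-123-g1a2b3c4", "v2.5.0-rc1-123-g1a2b3c4"):
    parts = describe.split("-")
    distance, commit = parts[-2], parts[-1]
    print(describe, "->", f"dev{distance}+{commit}")
```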